shithub: openh264

ref: a913cc853e517c2a5a0f79cc72cd5df590d82317
parent: 90e0057ba6b46df54897bda88869665c7dd08fe1
parent: f9dea467123fbff2c74422a8634b20af4026de49
author: Ethan Hugg <ethanhugg@gmail.com>
date: Fri Dec 13 03:54:14 EST 2013

Merge pull request #32 from mstorsjo/cosmetics

Consistently use unix newlines, remove trailing whitespace
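
This merge is purely mechanical: every hunk below either converts DOS (CRLF) line endings to unix (LF) ones, which is why dct.asm and deblock.asm appear as full-file rewrites, or drops trailing spaces and tabs. No functional change is intended. A minimal sketch of an equivalent cleanup pass, assuming Python (the repo already uses it for build/mktargets.py); the script is illustrative and not part of the commit:

    #!/usr/bin/env python
    # Illustrative helper, not part of this commit: normalize line endings
    # to unix LF and strip trailing whitespace, which is the entire effect
    # of the patch below.
    import sys

    for path in sys.argv[1:]:
        with open(path, 'rb') as f:
            data = f.read()
        # CRLF (and any stray lone CR) -> LF
        data = data.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
        # Drop trailing spaces and tabs on every line.
        lines = [line.rstrip() for line in data.split(b'\n')]
        with open(path, 'wb') as f:
            f.write(b'\n'.join(lines))

Running something like "python cleanup_ws.py README.md build/mktargets.py ..." over the touched files would reproduce the whitespace side of this diff (the script name is made up here).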

--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 OpenH264
 =======
 OpenH264 is a codec library which supports H.264 encoding and decoding. It is suitable for use in real time applications such as WebRTC. See http://www.openh264.org/ for more details.
- 
+
 Encoder Features
 ------------------------
 - Constrained Baseline Profile up to Level 5.2 (4096x2304)
@@ -17,10 +17,10 @@
 - Single reference frame for inter prediction
 - Multiple reference frames when using LTR and/or 3-4 temporal layers
 - Periodic and on-demand Instantaneous Decoder Refresh (IDR) frame insertion
-- Dynamic changes to bit rate, frame rate, and resolution 
+- Dynamic changes to bit rate, frame rate, and resolution
 - Annex B byte stream output
 - YUV 4:2:0 planar input
- 
+
 Decoder Features
 ------------------------
 - Constrained Baseline Profile up to Level 5.2 (4096x2304)
@@ -32,7 +32,7 @@
 - Multiple reference frames when specified in Sequence Parameter Set (SPS)
 - Annex B byte stream input
 - YUV 4:2:0 planar output
- 
+
 OS Support
 ----------------
 - Windows 64-bit and 32-bit (initial release is only 32-bit, 64-bit will follow soon)
@@ -40,7 +40,7 @@
 - Linux 64-bit and 32-bit (initial release is only 32-bit, 64-bit will follow soon)
 - Android 32-bit (initial release does not include this target, will follow soon)
 - iOS 64-bit and 32-bit (not supported yet, may be added in the future)
- 
+
 Processor Support
 -------------------------
 - Intel x86 optionally with MMX/SSE (no AVX yet, help is welcome)
@@ -53,30 +53,30 @@
     : build the decoder library and executable via codec/build/linux/dec/makefile
     : build the encoder library and executable via codec/build/linux/enc/makefile
     : build the encoder shared library via processing/build/linux/makefile
- 
+
 Windows Visual Studio 2008/2010/2012 projects are available:
     : build the decoder via the Visual Studio projects in codec/build/win32/dec
     : build the encoder via the Visual Studio projects in codec/build/win32/enc
     : build the encoder shared library via the Visual Studio projects in processing/build/win32/
- 
+
 NASM needed to be installed for assembly code: workable version 2.07 or above, nasm can downloaded from http://www.nasm.us/
- 
+
 API details to be provided later.
- 
+
 Using the Test App
 -------------------------
 Linux shell scripts to build the test apps:
     : build via testbin/AutoBuild_Linux.sh
     : clean via testbin/AutoClean_Linux.sh
- 
+
 Windows batch files to build the test apps:
     : Visual Studio 2008 use testbin/AutoBuild_Windows_VS2008.bat
     : Visual Studio 2010 use testbin/AutoBuild_Windows_VS2010.bat
     : Visual Studio 2012 use testbin/AutoBuild_Windows_VS2012.bat
- 
+
 Usage information can be found in testbin/CmdLineReadMe
 Command line options and details to be provided later.
- 
+
 Using the Source
 -----------------------
 codec - encoder, decoder, console (test app), build (makefile, vcproj)
@@ -83,7 +83,7 @@
 processing - raw pixel processing (used by encoder)
 testbin - autobuild scripts, test app config files, yuv test files
 bin - binaries for library and test app
- 
+
 Known Issues
 -------------------
 See the issue tracker on https://github.com/cisco/openh264/issues
@@ -91,7 +91,7 @@
 - Encoder errors when compressed frame size exceeds half uncompressed size
 - Encoder console app only support multiple of 16 width/height for now
 - Decoder errors when compressed frame size exceeds 1MB
- 
+
 License
 ----------
 BSD, see LICENSE file for details.
--- a/build/mktargets.py
+++ b/build/mktargets.py
@@ -19,7 +19,7 @@
 def write_cpp_rule(f, x):
     src = "$(%s_SRCDIR)/%s"%(PREFIX, x)
     dst = "$(%s_SRCDIR)/%s"%(PREFIX, make_o(x))
-    
+
     f.write("%s: %s\n"%(dst, src))
     f.write('\t$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(' + PREFIX + '_CFLAGS) $(' + PREFIX + '_INCLUDES) -c -o ' + dst + ' ' + src + '\n');
     f.write("\n")
@@ -27,7 +27,7 @@
 def write_asm_rule(f, x):
     src = "$(%s_SRCDIR)/%s"%(PREFIX, x)
     dst = "$(%s_SRCDIR)/%s"%(PREFIX, make_o(x))
-    
+
     f.write("%s: %s\n"%(dst, src))
     f.write('\t$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(' + PREFIX + '_ASMFLAGS) $(' + PREFIX + '_ASM_INCLUDES) -o ' + dst + ' ' + src + '\n');
     f.write("\n")
@@ -70,7 +70,7 @@
 f.write("%s_CPP_SRCS=\\\n"%(PREFIX))
 for c in cpp:
     f.write("\t$(%s_SRCDIR)/%s\\\n"%(PREFIX, c))
-f.write("\n")    
+f.write("\n")
 f.write("%s_OBJS += $(%s_CPP_SRCS:.cpp=.o)\n"%(PREFIX, PREFIX))
 
 f.write("ifeq ($(USE_ASM), Yes)\n");
--- a/codec/build/linux/dec/makefile
+++ b/codec/build/linux/dec/makefile
@@ -25,7 +25,7 @@
 ASFLAGS= -f elf -DNOPREFIX -I ../../../decoder/core/asm/
 
 LIBS= -lstdc++ -ldl
-#-lm 
+#-lm
 CFLAGS=  $(INCLUDE) -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DHAVE_CACHE_LINE_ALIGN
 
 ifeq ($(DBG),1)
@@ -65,7 +65,7 @@
 $(CORESRCDIR)/utils.cpp \
 $(PLUSSRCDIR)/welsDecoderExt.cpp \
 $(PLUSSRCDIR)/welsCodecTrace.cpp \
-$(COMMONSRCDIR)/logging.cpp 
+$(COMMONSRCDIR)/logging.cpp
 
 ASMSRC= $(ASMSRCDIR)/block_add.asm \
 $(ASMSRCDIR)/cpuid.asm \
@@ -78,7 +78,7 @@
 $(ASMSRCDIR)/mc_luma.asm \
 $(ASMSRCDIR)/memzero.asm \
 $(ASMSRCDIR)/asm_inc.asm \
- 
+
 MAINSRC= $(MAINSRCDIR)/d3d9_utils.cpp \
 $(MAINSRCDIR)/h264dec.cpp \
 $(MAINSRCDIR)/read_config.cpp
@@ -119,7 +119,7 @@
 $(OBJDIR)/mb_copy.o \
 $(OBJDIR)/mc_luma.o \
 $(OBJDIR)/memzero.o \
-$(OBJDIR)/asm_inc.o 
+$(OBJDIR)/asm_inc.o
 endif
 
 OBJBIN=	$(OBJDIR)/d3d9_utils.o \
@@ -134,7 +134,7 @@
 
 dependencies:
 	@echo "" >dependencies
-	
+
 checkdir:
 	@echo 'checkdir..'
 	@if test ! -d $(BINDIR) ; \
@@ -154,7 +154,7 @@
 		mkdir -p $(OBJDIR) ; \
 	fi
 	@echo
-	
+
 release:
 	@echo 'release..'
 	@echo 'cp -f $(SHAREDLIB) $(OUTDIR)'
@@ -169,14 +169,14 @@
 	@rm -f $(OBJBIN)
 	@rm -f $(BINLIB)
 	@rm -f $(SHAREDLIB)
-	@rm -f $(BIN)    
+	@rm -f $(BIN)
 
 tags:
 	@echo update tag table
 	@etags $(CORESRCDIR)/*.c $(CORESRCDIR)/*.cpp $(PLUSSRCDIR)/*.cpp $(MAINSRCDIR)/*.cpp
-	
-	
-lib:   	$(OBJDEC) 
+
+
+lib:   	$(OBJDEC)
 	@echo '$(OBJDEC)'
 	@echo
 	@echo 'ar cr $(BINLIB) $(OBJDEC)'
@@ -197,15 +197,15 @@
 	@$(CXX)  -shared -Wl,-Bsymbolic -o $(SHAREDLIB) $(OBJDEC)  $(LIBS)
 	@echo '... done'
 	@echo
-	
 
+
 exe:	$(OBJBIN)
-	@echo	
+	@echo
 	@echo '$(OBJBIN)'
 	@echo
 	@echo '$(CXX) $(LIBS) $(OBJBIN) $(BINLIB) -o $(BIN)'
 	@echo 'creating binary "$(BIN)"'
-	@$(CXX) $(OBJBIN) $(BINLIB) -o $(BIN) $(LIBS) 
+	@$(CXX) $(OBJBIN) $(BINLIB) -o $(BIN) $(LIBS)
 	@echo '... done'
 	@echo
 
@@ -223,31 +223,31 @@
 
 $(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.c
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
 
 $(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
 	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
-		
+
 $(OBJDIR)/%.o$(SUFFIX): $(PLUSSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
-	
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
 $(OBJDIR)/%.o$(SUFFIX): $(ASMSRCDIR)/%.asm
 	@echo 'compiling object file "$@" ...'
-	@$(AS) $(ASFLAGS) -o $@ $<	
+	@$(AS) $(ASFLAGS) -o $@ $<
 
 #$(OBJDIR)/%.o$(SUFFIX): $(ASMCOMDIR)/%.asm
 #	@echo 'compiling object file "$@" ...'
 #	@$(AS) $(ASFLAGS) -o $@ $<
-	
+
 $(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
 
 $(OBJDIR)/%.o$(SUFFIX): $(COMMONSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
 	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
-	
+
 include $(DEPEND)
 
--- a/codec/build/linux/enc/makefile
+++ b/codec/build/linux/enc/makefile
@@ -26,8 +26,8 @@
 ASFLAGS= -f elf -DNOPREFIX -I ../../../encoder/core/asm/
 
 LIBS= -lstdc++ -ldl -lpthread -lm
-#-lm 
-CFLAGS=  $(INCLUDE) -m32 -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DWELS_SVC -DENCODER_CORE -DHAVE_CACHE_LINE_ALIGN -DWELS_TESTBED -DMT_ENABLED 
+#-lm
+CFLAGS=  $(INCLUDE) -m32 -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DWELS_SVC -DENCODER_CORE -DHAVE_CACHE_LINE_ALIGN -DWELS_TESTBED -DMT_ENABLED
 
 ifeq ($(DBG),1)
 #SUFFIX= .dbg
@@ -150,7 +150,7 @@
 $(OBJDIR)/satd_sad.o \
 $(OBJDIR)/score.o \
 $(OBJDIR)/asm_inc.o \
-$(OBJDIR)/vaa.o 
+$(OBJDIR)/vaa.o
 endif
 OBJBIN=	$(OBJDIR)/read_config.o \
 $(OBJDIR)/welsenc.o
@@ -163,7 +163,7 @@
 
 dependencies:
 	@echo "" >dependencies
-	
+
 checkdir:
 	@echo 'checkdir..'
 	@if test ! -d $(OUTDIR) ; \
@@ -195,9 +195,9 @@
 tags:
 	@echo update tag table
 	@etags $(THREADLIBSRCDIR)/*.cpp $(COMMSRCDIR)/*.cpp $(CORESRCDIR)/*.cpp $(PLUSSRCDIR)/*.cpp $(MAINSRCDIR)/*.cpp
-	
-	
-lib:   	$(OBJENC) 
+
+
+lib:   	$(OBJENC)
 	@echo '$(OBJENC)'
 	@echo
 	@echo 'ar cr $(BINLIB) $(OBJENC)'
@@ -218,7 +218,7 @@
 	@$(GCC)  -shared -Wl,-Bsymbolic -m32 -o $(SHAREDLIB) $(OBJENC)  $(LIBS)
 	@echo '... done'
 	@echo
-	
+
 release:
 	@echo 'release..'
 	@echo 'cp -f $(SHAREDLIB) $(OUTDIR)'
@@ -228,7 +228,7 @@
 	@echo
 
 exe:	$(OBJBIN)
-	@echo	
+	@echo
 	@echo '$(OBJBIN)'
 	@echo
 	@echo '$(GCC) $(LIBS) $(OBJBIN) $(BINLIB) -m32 -o $(BIN)'
@@ -251,24 +251,24 @@
 
 $(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
-	
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
 $(OBJDIR)/%.o$(SUFFIX): $(PLUSSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
 
 $(OBJDIR)/%.o$(SUFFIX): $(ASMSRCDIR)/%.asm
 	@echo 'compiling object file "$@" ...'
-	@$(AS) $(ASFLAGS) -o $@ $<	
-	
+	@$(AS) $(ASFLAGS) -o $@ $<
+
 $(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<	
-	
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
 $(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<	
-	
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
 $(OBJDIR)/%.o$(SUFFIX): $(COMMONSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
 	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
--- a/codec/decoder/core/asm/asm_inc.asm
+++ b/codec/decoder/core/asm/asm_inc.asm
@@ -43,7 +43,7 @@
 ; Options, for DEBUG
 ;***********************************************************************
 
-%if 1 
+%if 1
 	%define MOVDQ movdqa
 %else
 	%define MOVDQ movdqu
@@ -58,7 +58,7 @@
 BITS 32
 
 ;***********************************************************************
-; Macros 
+; Macros
 ;***********************************************************************
 
 %macro WELS_EXTERN 1
@@ -74,7 +74,7 @@
 	pxor        %2, %2
     psubw       %2, %1
     pmaxsw      %1, %2
-%endmacro 	
+%endmacro
 
 %macro MMX_XSwap  4
     movq		%4, %2
@@ -105,7 +105,7 @@
     SSE2_XSawp qdq, %5, %2, %3
 %endmacro
 
-;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4 
+;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4
 %macro SSE2_TransTwo4x4W 5
     SSE2_XSawp wd,  %1, %2, %5
     SSE2_XSawp wd,  %3, %4, %2
@@ -125,26 +125,26 @@
 	movdqa	%6, %9
 	movdqa	%9, %4
 	SSE2_XSawp bw,  %7, %6, %4
-	
-	SSE2_XSawp wd,  %1, %3, %6	
+
+	SSE2_XSawp wd,  %1, %3, %6
 	SSE2_XSawp wd,  %8, %2, %3
 	SSE2_XSawp wd,  %5, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %3	
+	movdqa	%9, %3
 	SSE2_XSawp wd,  %7, %4, %3
-	
-	SSE2_XSawp dq,  %1, %5, %4	
+
+	SSE2_XSawp dq,  %1, %5, %4
 	SSE2_XSawp dq,  %6, %2, %5
 	SSE2_XSawp dq,  %8, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %5		
+	movdqa	%9, %5
 	SSE2_XSawp dq,  %7, %3, %5
-	
+
 	SSE2_XSawp qdq,  %1, %8, %3
 	SSE2_XSawp qdq,  %4, %2, %8
 	SSE2_XSawp qdq,  %6, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %1		
+	movdqa	%9, %1
 	SSE2_XSawp qdq,  %7, %5, %1
 	movdqa	%5, %9
 %endmacro
@@ -170,9 +170,9 @@
 %macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
 	mov %3h, %3l
 	movd %1, e%3x		; i.e, 1% = eax (=b0)
-	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0	
-	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0	
-%endmacro  
+	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
+	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
 
 ;copy a dw into a xmm for 8 times
 %macro  SSE2_Copy8Times 2
--- a/codec/decoder/core/asm/block_add.asm
+++ b/codec/decoder/core/asm/block_add.asm
@@ -48,7 +48,7 @@
 ; Macros and other preprocessor constants
 ;*******************************************************************************
 
-%macro   BLOCK_ADD_16_SSE2   4 
+%macro   BLOCK_ADD_16_SSE2   4
 	movdqa    xmm0,       [%2]
 	movdqa    xmm1,       [%3]
     movdqa    xmm2,       [%3+10h]
@@ -65,7 +65,7 @@
 
 	lea          %2,      [%2+%4]
 	lea          %3,      [%3+%4*2]
-	lea          %1,      [%1+%4] 
+	lea          %1,      [%1+%4]
 %endmacro
 
 %macro    BLOCK_ADD_8_MMXEXT   4
@@ -106,7 +106,7 @@
 
 	lea          %2,      [%2+%4]
 	lea          %3,      [%3+%5*2]
-	lea          %1,      [%1+%4] 
+	lea          %1,      [%1+%4]
 %endmacro
 
 
@@ -130,24 +130,24 @@
 	lea          %1,      [%1+%4]
 %endmacro
 
-%macro    BLOCK_ADD_8_STRIDE_2_LINES_SSE2   5    
+%macro    BLOCK_ADD_8_STRIDE_2_LINES_SSE2   5
 	movdqa xmm1, [%3]
 	movq xmm0, [%2]
 	punpcklbw xmm0, xmm7
 	paddw xmm0, xmm1
 	packuswb xmm0, xmm7
-	movq [%1], xmm0	
-	
+	movq [%1], xmm0
+
 	movdqa xmm3, [%3+%5*2]
 	movq xmm2, [%2+%4]
 	punpcklbw xmm2, xmm7
 	paddw xmm2, xmm3
-	packuswb xmm2, xmm7	
-	movq [%1+%4], xmm2	
-	
+	packuswb xmm2, xmm7
+	movq [%1+%4], xmm2
+
 	lea %1, [%1+%4*2]
 	lea %2, [%2+%4*2]
-	lea %3, [%3+%5*4]	
+	lea %3, [%3+%5*4]
 %endmacro
 
 %macro   CHECK_DATA_16_ZERO_SSE4     3
@@ -159,7 +159,7 @@
 	por		   xmm0,	 xmm1
 	ptest      xmm7,     xmm0
 	cmovae     eax,      %3
-	
+
 	add        %1,       20h
 	add        ecx,      04h
 	mov        byte [%2+ebx],  al
@@ -170,12 +170,12 @@
     movdqa     xmm1,      [%1+%3]
     movdqa     xmm2,      [%1+%3*2]
     movdqa     xmm3,      [%1+%4]
-    
+
     mov        eax,       0h
     mov        ebx,       0h
     movdqa     xmm4,      xmm0
     movdqa     xmm5,      xmm2
-    
+
     punpcklqdq  xmm0,     xmm1
     punpckhqdq  xmm4,     xmm1
     punpcklqdq  xmm2,     xmm3
@@ -183,12 +183,12 @@
 
 	por			xmm0,	  xmm2
 	por			xmm4,	  xmm5
-    
+
     ptest       xmm7,     xmm0
     cmovae      eax,      %5
     ptest       xmm7,     xmm4
-    cmovae      ebx,      %5    
-    
+    cmovae      ebx,      %5
+
     mov     byte [%2],    al
     mov     byte [%2+1],  bl
 %endmacro
@@ -230,45 +230,45 @@
     movdqa     xmm0,      [%1]
     movdqa     xmm1,      [%1+10h]
     mov        ebx,       [ecx]
-    
+
     pcmpeqw    xmm0,      xmm7
     pcmpeqw    xmm1,      xmm7
     packsswb   xmm0,      xmm1
-    pmovmskb   edx,       xmm0    
+    pmovmskb   edx,       xmm0
     sub        edx,       0ffffh
-    
-    cmovb      eax,       ebp   
+
+    cmovb      eax,       ebp
     add        ecx,       4
     add        %1,        20h
     mov      byte [%2+ebx],    al
 %endmacro
-    
 
 
+
 %macro   CHECK_RS_4x4_BLOCK_2_ZERO_SSE2    5
     movdqa    xmm0,      [%1]
     movdqa    xmm1,      [%1 + %3]
     movdqa    xmm2,      [%1 + %3*2]
-    movdqa    xmm3,      [%1 + %4]    
-    
+    movdqa    xmm3,      [%1 + %4]
+
     movdqa    xmm4,       xmm0
     movdqa    xmm5,       xmm2
-    
+
     punpcklqdq   xmm0,    xmm1
     punpckhqdq   xmm4,    xmm1
     punpcklqdq   xmm2,    xmm3
     punpckhqdq   xmm5,    xmm3
-    
+
     pcmpeqw      xmm0,    xmm7
     pcmpeqw      xmm2,    xmm7
     pcmpeqw      xmm4,    xmm7
     pcmpeqw      xmm5,    xmm7
-    
+
     packsswb     xmm0,    xmm2
     packsswb     xmm4,    xmm5
     pmovmskb     eax,     xmm0
     pmovmskb     ebx,     xmm4
-    
+
     sub          eax,     0ffffh
     mov          eax,     0
     cmovb        eax,     %5
@@ -276,7 +276,7 @@
     mov          ebx,     0
     cmovb        ebx,     %5
     mov       byte [%2],    al
-    mov       byte [%2+1],  bl        
+    mov       byte [%2+1],  bl
 %endmacro
 
 ;*******************************************************************************
@@ -291,12 +291,12 @@
 
 ALIGN  16
 SubMbScanIdx:
-     dd    0x0,  0x1,  0x4,  0x5, 
+     dd    0x0,  0x1,  0x4,  0x5,
 	 dd    0x2,  0x3,  0x6,  0x7,
 	 dd    0x8,  0x9,  0xc,  0xd,
 	 dd    0xa,  0xb,  0xe,  0xf,
 	 dd    0x10, 0x11, 0x14, 0x15,
-	 dd    0x12, 0x13, 0x16, 0x17,     
+	 dd    0x12, 0x13, 0x16, 0x17,
 
 ;*******************************************************************************
 ; Code
@@ -312,10 +312,10 @@
 ;  void_t WelsResBlockZero16x16_sse2(int16_t* pBlock,int32_t iStride)
 ;*******************************************************************************
 WelsResBlockZero16x16_sse2:
-    push     esi	
+    push     esi
 
 	mov      esi,        [esp+08h]
-	mov      ecx,        [esp+0ch]	
+	mov      ecx,        [esp+0ch]
 	lea      ecx,        [ecx*2]
 	lea      eax,        [ecx*3]
 
@@ -375,7 +375,7 @@
 
 	movdqa   [esi+eax],     xmm7
 	movdqa   [esi+eax+10h],     xmm7
-    
+
     pop      esi
 	ret
 
@@ -386,7 +386,7 @@
 ;*******************************************************************************
 ;  void_t WelsResBlockZero8x8_sse2(int16_t * pBlock, int32_t iStride)
 ;*******************************************************************************
-WelsResBlockZero8x8_sse2: 
+WelsResBlockZero8x8_sse2:
 	  push      esi
 
       mov       esi,     [esp+08h]
@@ -407,7 +407,7 @@
 	  movdqa    [esi+ecx*2],   xmm7
 	  movdqa    [esi+eax],     xmm7
 
-	  
+
 	  pop       esi
 	  ret
 
--- a/codec/decoder/core/asm/cpuid.asm
+++ b/codec/decoder/core/asm/cpuid.asm
@@ -84,12 +84,12 @@
 ;   void WelsCPUId( int32_t index, int32_t *uiFeatureA, int32_t *uiFeatureB, int32_t *uiFeatureC, int32_t *uiFeatureD )
 ;****************************************************************************************************
 WelsCPUId:
-	push	ebx	
+	push	ebx
 	push	edi
-	
+
 	mov     eax, [esp+12]	; operating index
     cpuid					; cpuid
-	
+
 	; processing various information return
 	mov     edi, [esp+16]
     mov     [edi], eax
@@ -100,10 +100,10 @@
     mov     edi, [esp+28]
     mov     [edi], edx
 
-	pop		edi	
+	pop		edi
     pop     ebx
 	ret
-	
+
 WELS_EXTERN WelsCPUSupportAVX
 ; need call after cpuid=1 and eax, ecx flag got then
 ALIGN 16
@@ -139,7 +139,7 @@
 WelsCPUSupportFMA:
 	mov eax, [esp+4]
 	mov ecx, [esp+8]
-	
+
 	; refer to detection of FMA addressed in INTEL AVX manual document
 	and ecx, 018001000H
 	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
@@ -153,7 +153,7 @@
 	mov eax, 1
 	ret
 fma_not_supported:
-	mov eax, 0	
+	mov eax, 0
 	ret
 
 WELS_EXTERN WelsEmms
--- a/codec/decoder/core/asm/dct.asm
+++ b/codec/decoder/core/asm/dct.asm
@@ -1,129 +1,129 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        ?Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        ?Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  dct.asm
-;*
-;*  Abstract
-;*      WelsDctFourT4_sse2
-;*
-;*  History
-;*      8/4/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-%macro MMX_SumSubDiv2 3
-    movq    %3, %2
-    psraw   %3, $1
-    paddw   %3, %1
-    psraw   %1, $1
-    psubw   %1, %2
-%endmacro
-
-%macro MMX_SumSub 3
-	movq    %3, %2
-    psubw   %2, %1
-    paddw   %1, %3
-%endmacro
-
-%macro MMX_IDCT 6
-    MMX_SumSub      %4, %5, %6
-    MMX_SumSubDiv2  %3, %2, %1
-    MMX_SumSub		%1, %4, %6
-	MMX_SumSub		%3, %5, %6
-%endmacro
-
-
-%macro MMX_StoreDiff4P 5
-    movd       %2, %5
-    punpcklbw  %2, %4
-    paddw      %1, %3
-    psraw      %1, $6
-    paddsw     %1, %2
-    packuswb   %1, %2
-    movd       %5, %1
-%endmacro
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-WELS_EXTERN IdctResAddPred_mmx
-
-ALIGN 16
-;*******************************************************************************
-;   void_t __cdecl IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
-;*******************************************************************************
-
-IdctResAddPred_mmx:
-
-%define	pushsize	0
-%define pPred       esp+pushsize+4
-%define kiStride     esp+pushsize+8
-%define pRs         esp+pushsize+12
-
-	mov     eax, [pRs   ] 
-    mov     edx, [pPred ]   
-    mov     ecx, [kiStride]   
-    movq    mm0, [eax+ 0]
-    movq    mm1, [eax+ 8]
-    movq    mm2, [eax+16]
-    movq    mm3, [eax+24]
-
-	MMX_Trans4x4W        mm0, mm1, mm2, mm3, mm4
-	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
-    MMX_Trans4x4W        mm1, mm3, mm0, mm4, mm2
-	MMX_IDCT			mm3, mm0, mm4, mm2, mm1, mm6
-
-    WELS_Zero			mm7
-    WELS_DW32			mm6
-    
-    MMX_StoreDiff4P    mm3, mm0, mm6, mm7, [edx]
-    MMX_StoreDiff4P    mm4, mm0, mm6, mm7, [edx+ecx]
-    lea     edx, [edx+2*ecx]
-    MMX_StoreDiff4P    mm1, mm0, mm6, mm7, [edx]
-    MMX_StoreDiff4P    mm2, mm0, mm6, mm7, [edx+ecx]
-    
-%undef	pushsize
-%undef  pPred
-%undef  kiStride
-%undef  pRs
-	emms
-    ret
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        ?Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        ?Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  dct.asm
+;*
+;*  Abstract
+;*      WelsDctFourT4_sse2
+;*
+;*  History
+;*      8/4/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+%macro MMX_SumSubDiv2 3
+    movq    %3, %2
+    psraw   %3, $1
+    paddw   %3, %1
+    psraw   %1, $1
+    psubw   %1, %2
+%endmacro
+
+%macro MMX_SumSub 3
+	movq    %3, %2
+    psubw   %2, %1
+    paddw   %1, %3
+%endmacro
+
+%macro MMX_IDCT 6
+    MMX_SumSub      %4, %5, %6
+    MMX_SumSubDiv2  %3, %2, %1
+    MMX_SumSub		%1, %4, %6
+	MMX_SumSub		%3, %5, %6
+%endmacro
+
+
+%macro MMX_StoreDiff4P 5
+    movd       %2, %5
+    punpcklbw  %2, %4
+    paddw      %1, %3
+    psraw      %1, $6
+    paddsw     %1, %2
+    packuswb   %1, %2
+    movd       %5, %1
+%endmacro
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+WELS_EXTERN IdctResAddPred_mmx
+
+ALIGN 16
+;*******************************************************************************
+;   void_t __cdecl IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
+;*******************************************************************************
+
+IdctResAddPred_mmx:
+
+%define	pushsize	0
+%define pPred       esp+pushsize+4
+%define kiStride     esp+pushsize+8
+%define pRs         esp+pushsize+12
+
+	mov     eax, [pRs   ]
+    mov     edx, [pPred ]
+    mov     ecx, [kiStride]
+    movq    mm0, [eax+ 0]
+    movq    mm1, [eax+ 8]
+    movq    mm2, [eax+16]
+    movq    mm3, [eax+24]
+
+	MMX_Trans4x4W        mm0, mm1, mm2, mm3, mm4
+	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
+    MMX_Trans4x4W        mm1, mm3, mm0, mm4, mm2
+	MMX_IDCT			mm3, mm0, mm4, mm2, mm1, mm6
+
+    WELS_Zero			mm7
+    WELS_DW32			mm6
+
+    MMX_StoreDiff4P    mm3, mm0, mm6, mm7, [edx]
+    MMX_StoreDiff4P    mm4, mm0, mm6, mm7, [edx+ecx]
+    lea     edx, [edx+2*ecx]
+    MMX_StoreDiff4P    mm1, mm0, mm6, mm7, [edx]
+    MMX_StoreDiff4P    mm2, mm0, mm6, mm7, [edx+ecx]
+
+%undef	pushsize
+%undef  pPred
+%undef  kiStride
+%undef  pRs
+	emms
+    ret
--- a/codec/decoder/core/asm/deblock.asm
+++ b/codec/decoder/core/asm/deblock.asm
@@ -1,2113 +1,2113 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  deblock.asm
-;*
-;*  Abstract
-;*      edge loop
-;*
-;*  History
-;*      08/07/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-SECTION .text
-
-;********************************************************************************
-;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;                             int32_t iAlpha, int32_t iBeta)
-;********************************************************************************
-WELS_EXTERN   DeblockChromaEq4V_sse2
-
-ALIGN  16
-DeblockChromaEq4V_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,68h 
-  mov         edx,[ebp+10h]      ;  iStride
-  mov         eax,[ebp+8]        ;  pPixCb
-  mov         ecx,[ebp+0Ch]      ;  pPixCr
-  movq        xmm4,[ecx] 
-  movq        xmm5,[edx+ecx] 
-  push        esi  
-  push        edi  
-  lea         esi,[edx+edx] 
-  mov         edi,eax 
-  sub         edi,esi 
-  movq        xmm1,[edi] 
-  mov         edi,ecx 
-  sub         edi,esi 
-  movq        xmm2,[edi] 
-  punpcklqdq  xmm1,xmm2 
-  mov         esi,eax 
-  sub         esi,edx 
-  movq        xmm2,[esi] 
-  mov         edi,ecx 
-  sub         edi,edx 
-  movq        xmm3,[edi] 
-  punpcklqdq  xmm2,xmm3 
-  movq        xmm3,[eax] 
-  punpcklqdq  xmm3,xmm4 
-  movq        xmm4,[edx+eax] 
-  mov       edx, [ebp + 14h] 
-  punpcklqdq  xmm4,xmm5 
-  movd        xmm5,edx 
-  mov       edx, [ebp + 18h] 
-  pxor        xmm0,xmm0 
-  movdqa      xmm6,xmm5 
-  punpcklwd   xmm6,xmm5 
-  pshufd      xmm5,xmm6,0 
-  movd        xmm6,edx 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      xmm7,xmm1 
-  punpckhbw   xmm1,xmm0 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+40h],xmm1 
-  movdqa      [esp+60h],xmm7 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+10h],xmm7 
-  movdqa      xmm7,xmm3 
-  punpcklbw   xmm7,xmm0 
-  punpckhbw   xmm3,xmm0 
-  movdqa      [esp+50h],xmm7 
-  movdqa      xmm7,xmm4 
-  punpckhbw   xmm4,xmm0 
-  punpckhbw   xmm2,xmm0 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+30h],xmm3 
-  movdqa      xmm3,[esp+10h] 
-  movdqa      xmm1,xmm3 
-  psubw       xmm1,[esp+50h] 
-  pabsw       xmm1,xmm1 
-  movdqa      [esp+20h],xmm4 
-  movdqa      xmm0,xmm5 
-  pcmpgtw     xmm0,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  pand        xmm0,xmm4 
-  movdqa      xmm1,xmm7 
-  psubw       xmm1,[esp+50h] 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,[esp+30h] 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm5,xmm1 
-  movdqa      xmm1,[esp+40h] 
-  pand        xmm0,xmm4 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  movdqa      xmm1,[esp+20h] 
-  psubw       xmm1,[esp+30h] 
-  pand        xmm5,xmm4 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm6,xmm1 
-  pand        xmm5,xmm6 
-  mov         edx,2 
-  movsx       edx,dx 
-  movd        xmm1,edx 
-  movdqa      xmm4,xmm1 
-  punpcklwd   xmm4,xmm1 
-  pshufd      xmm1,xmm4,0 
-  movdqa      xmm4,[esp+60h] 
-  movdqa      xmm6,xmm4 
-  paddw       xmm6,xmm4 
-  paddw       xmm6,xmm3 
-  paddw       xmm6,xmm7 
-  movdqa      [esp+10h],xmm1 
-  paddw       xmm6,[esp+10h] 
-  psraw       xmm6,2 
-  movdqa      xmm4,xmm0 
-  pandn       xmm4,xmm3 
-  movdqa      xmm3,[esp+40h] 
-  movdqa      xmm1,xmm0 
-  pand        xmm1,xmm6 
-  por         xmm1,xmm4 
-  movdqa      xmm6,xmm3 
-  paddw       xmm6,xmm3 
-  movdqa      xmm3,[esp+10h] 
-  paddw       xmm6,xmm2 
-  paddw       xmm6,[esp+20h] 
-  paddw       xmm6,xmm3 
-  psraw       xmm6,2 
-  movdqa      xmm4,xmm5 
-  pand        xmm4,xmm6 
-  movdqa      xmm6,xmm5 
-  pandn       xmm6,xmm2 
-  por         xmm4,xmm6 
-  packuswb    xmm1,xmm4 
-  movdqa      xmm4,[esp+50h] 
-  movdqa      xmm6,xmm7 
-  paddw       xmm6,xmm7 
-  paddw       xmm6,xmm4 
-  paddw       xmm6,[esp+60h] 
-  paddw       xmm6,xmm3 
-  psraw       xmm6,2 
-  movdqa      xmm2,xmm0 
-  pand        xmm2,xmm6 
-  pandn       xmm0,xmm4 
-  por         xmm2,xmm0 
-  movdqa      xmm0,[esp+20h] 
-  movdqa      xmm6,xmm0 
-  paddw       xmm6,xmm0 
-  movdqa      xmm0,[esp+30h] 
-  paddw       xmm6,xmm0 
-  paddw       xmm6,[esp+40h] 
-  movdqa      xmm4,xmm5 
-  paddw       xmm6,xmm3 
-  movq        [esi],xmm1 
-  psraw       xmm6,2 
-  pand        xmm4,xmm6 
-  pandn       xmm5,xmm0 
-  por         xmm4,xmm5 
-  packuswb    xmm2,xmm4 
-  movq        [eax],xmm2 
-  psrldq      xmm1,8 
-  movq        [edi],xmm1 
-  pop         edi  
-  psrldq      xmm2,8 
-  movq        [ecx],xmm2 
-  pop         esi  
-  mov         esp,ebp 
-  pop         ebp  
-  ret              
-
-;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
-;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN  DeblockChromaLt4V_sse2
-
-DeblockChromaLt4V_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,0E4h 
-  push        ebx  
-  push        esi  
-  mov         esi, [ebp+1Ch]      ;  pTC
-  movsx       ebx, byte [esi+2] 
-  push        edi  
-  movsx       di,byte [esi+3] 
-  mov         word [esp+0Ch],bx 
-  movsx       bx,byte  [esi+1] 
-  movsx       esi,byte  [esi] 
-  mov         word  [esp+0Eh],si 
-  movzx       esi,di 
-  movd        xmm1,esi 
-  movzx       esi,di 
-  movd        xmm2,esi 
-  mov         si,word  [esp+0Ch] 
-  mov         edx, [ebp + 10h] 
-  mov         eax, [ebp + 08h] 
-  movzx       edi,si 
-  movzx       esi,si 
-  mov         ecx, [ebp + 0Ch] 
-  movd        xmm4,esi 
-  movzx       esi,bx 
-  movd        xmm5,esi 
-  movd        xmm3,edi 
-  movzx       esi,bx 
-  movd        xmm6,esi 
-  mov         si,word [esp+0Eh] 
-  movzx       edi,si 
-  movzx       esi,si 
-  punpcklwd   xmm6,xmm2 
-  pxor        xmm0,xmm0 
-  movdqa      [esp+40h],xmm0 
-  movd        xmm7,edi 
-  movd        xmm0,esi 
-  lea         esi,[edx+edx] 
-  mov         edi,eax 
-  sub         edi,esi 
-  punpcklwd   xmm5,xmm1 
-  movdqa      xmm1,[esp+40h] 
-  punpcklwd   xmm0,xmm4 
-  movq        xmm4,[edx+ecx] 
-  punpcklwd   xmm7,xmm3 
-  movq        xmm3,[eax] 
-  punpcklwd   xmm0,xmm6 
-  movq        xmm6,[edi] 
-  punpcklwd   xmm7,xmm5 
-  punpcklwd   xmm0,xmm7 
-  mov         edi,ecx 
-  sub         edi,esi 
-  movdqa      xmm2,xmm1 
-  psubw       xmm2,xmm0 
-  movdqa      [esp+60h],xmm2 
-  movq        xmm2, [edi] 
-  punpcklqdq  xmm6,xmm2 
-  mov         esi,eax 
-  sub         esi,edx 
-  movq        xmm7,[esi] 
-  mov         edi,ecx 
-  sub         edi,edx 
-  movq        xmm2,[edi] 
-  punpcklqdq  xmm7,xmm2 
-  movq        xmm2,[ecx] 
-  punpcklqdq  xmm3,xmm2 
-  movq        xmm2,[edx+eax] 
-  movsx       edx,word [ebp + 14h] 
-  punpcklqdq  xmm2,xmm4 
-  movdqa      [esp+0E0h],xmm2 
-  movd        xmm2,edx 
-  movsx       edx,word [ebp + 18h] 
-  movdqa      xmm4,xmm2 
-  punpcklwd   xmm4,xmm2 
-  movd        xmm2,edx 
-  movdqa      xmm5,xmm2 
-  punpcklwd   xmm5,xmm2 
-  pshufd      xmm2,xmm5,0 
-  movdqa      [esp+50h],xmm2 
-  movdqa      xmm2,xmm6 
-  punpcklbw   xmm2,xmm1 
-  movdqa      [esp+0D0h],xmm3 
-  pshufd      xmm4,xmm4,0 
-  movdqa      [esp+30h],xmm2 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+80h],xmm6 
-  movdqa      xmm6,[esp+0D0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+70h],xmm6 
-  movdqa      xmm6, [esp+0E0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa     [esp+90h],xmm6 
-  movdqa      xmm5, [esp+0E0h] 
-  movdqa      xmm2,xmm7 
-  punpckhbw   xmm7,xmm1 
-  punpcklbw   xmm5,xmm1 
-  movdqa       [esp+0A0h],xmm7 
-  punpcklbw   xmm3,xmm1 
-  mov         edx,4 
-  punpcklbw   xmm2,xmm1 
-  movsx       edx,dx 
-  movd        xmm6,edx 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      xmm7,[esp+30h] 
-  movdqa      [esp+20h],xmm6 
-  psubw       xmm7,xmm5 
-  movdqa      xmm6,xmm0 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  movdqa      [esp+40h],xmm6 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm2 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm7 
-  paddw       xmm6, [esp+20h] 
-  movdqa      xmm7, [esp+50h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm1,xmm6 
-  movdqa      [esp+10h],xmm0 
-  movdqa      xmm6, [esp+10h] 
-  pminsw      xmm6,xmm1 
-  movdqa      [esp+10h],xmm6 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm6,xmm4 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1, [esp+30h] 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm7,xmm1 
-  movdqa      xmm1,[esp+50h] 
-  pand        xmm6,xmm7 
-  movdqa      xmm7,[esp+50h] 
-  psubw       xmm5,xmm3 
-  pabsw       xmm5,xmm5 
-  pcmpgtw     xmm1,xmm5 
-  movdqa      xmm5,[esp+80h] 
-  psubw       xmm5,[esp+90h] 
-  pand        xmm6,xmm1 
-  pand        xmm6,[esp+40h] 
-  movdqa      xmm1,[esp+10h] 
-  pand        xmm1,xmm6 
-  movdqa      xmm6,[esp+70h] 
-  movdqa      [esp+30h],xmm1 
-  movdqa      xmm1,[esp+0A0h] 
-  psubw       xmm6,xmm1 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm5 
-  paddw       xmm6,[esp+20h] 
-  movdqa      xmm5,[esp+60h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm5,xmm6 
-  pminsw      xmm0,xmm5 
-  movdqa      xmm5,[esp+70h] 
-  movdqa      xmm6,xmm1 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm4,xmm6 
-  movdqa      xmm6,[esp+80h] 
-  psubw       xmm6,xmm1 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+90h] 
-  pand        xmm4,xmm7 
-  movdqa      xmm7,[esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  pand        xmm4,xmm7 
-  pand        xmm4,[esp+40h] 
-  pand        xmm0,xmm4 
-  movdqa      xmm4,[esp+30h] 
-  paddw       xmm2,xmm4 
-  paddw       xmm1,xmm0 
-  packuswb    xmm2,xmm1 
-  movq        [esi],xmm2 
-  psubw       xmm3,xmm4 
-  psubw       xmm5,xmm0 
-  packuswb    xmm3,xmm5 
-  movq        [eax],xmm3 
-  psrldq      xmm2,8 
-  movq        [edi],xmm2 
-  pop         edi  
-  pop         esi  
-  psrldq      xmm3,8 
-  movq        [ecx],xmm3 
-  pop         ebx  
-  mov         esp,ebp 
-  pop         ebp  
-  ret    
-  
-;***************************************************************************
-;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
-;          int32_t iAlpha, int32_t iBeta)
-;***************************************************************************
-
-WELS_EXTERN     DeblockChromaEq4H_sse2
-
-ALIGN  16
-  
-DeblockChromaEq4H_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,0C8h  
-  mov         ecx,dword [ebp+8] 
-  mov         edx,dword [ebp+0Ch] 
-  mov         eax,dword [ebp+10h] 
-  sub         ecx,2 
-  sub         edx,2 
-  push        esi  
-  lea         esi,[eax+eax*2] 
-  mov         dword [esp+18h],ecx 
-  mov         dword [esp+4],edx 
-  lea         ecx,[ecx+eax*4] 
-  lea         edx,[edx+eax*4] 
-  lea         eax,[esp+7Ch] 
-  push        edi  
-  mov         dword [esp+14h],esi 
-  mov         dword [esp+18h],ecx 
-  mov         dword [esp+0Ch],edx 
-  mov         dword [esp+10h],eax 
-  mov         esi,dword [esp+1Ch] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+14h] 
-  movd        xmm0,dword [esi] 
-  movd        xmm1,dword [esi+ecx] 
-  movd        xmm2,dword [esi+ecx*2] 
-  movd        xmm3,dword [esi+edx] 
-  mov         esi,dword  [esp+8] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [esi+ecx] 
-  movd        xmm6,dword [esi+ecx*2] 
-  movd        xmm7,dword [esi+edx] 
-  punpckldq   xmm0,xmm4 
-  punpckldq   xmm1,xmm5 
-  punpckldq   xmm2,xmm6 
-  punpckldq   xmm3,xmm7 
-  mov         esi,dword [esp+18h] 
-  mov         edi,dword [esp+0Ch] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [edi] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm0,xmm4 
-  movd        xmm4,dword [esi+ecx] 
-  movd        xmm5,dword [edi+ecx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm1,xmm4 
-  movd        xmm4,dword [esi+ecx*2] 
-  movd        xmm5,dword [edi+ecx*2] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm2,xmm4 
-  movd        xmm4,dword [esi+edx] 
-  movd        xmm5,dword [edi+edx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm3,xmm4 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         edi,dword [esp+10h] 
-  movdqa      [edi],xmm0 
-  movdqa      [edi+10h],xmm5 
-  movdqa      [edi+20h],xmm1 
-  movdqa      [edi+30h],xmm6 
-  movsx       ecx,word [ebp+14h] 
-  movsx       edx,word [ebp+18h] 
-  movdqa      xmm6,[esp+80h] 
-  movdqa      xmm4,[esp+90h] 
-  movdqa      xmm5,[esp+0A0h] 
-  movdqa      xmm7,[esp+0B0h] 
-  pxor        xmm0,xmm0 
-  movd        xmm1,ecx 
-  movdqa      xmm2,xmm1 
-  punpcklwd   xmm2,xmm1 
-  pshufd      xmm1,xmm2,0 
-  movd        xmm2,edx 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm2,xmm3,0 
-  movdqa      xmm3,xmm6 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+60h],xmm6 
-  movdqa      xmm6,[esp+90h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+30h],xmm6 
-  movdqa      xmm6,[esp+0A0h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+40h],xmm6 
-  movdqa      xmm6,[esp+0B0h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+70h],xmm6 
-  punpcklbw   xmm7,xmm0 
-  punpcklbw   xmm4,xmm0 
-  punpcklbw   xmm5,xmm0 
-  punpcklbw   xmm3,xmm0 
-  movdqa      [esp+50h],xmm7 
-  movdqa      xmm6,xmm4 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  movdqa      xmm0,xmm1 
-  pcmpgtw     xmm0,xmm6 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm4 
-  pabsw       xmm6,xmm6 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pand        xmm0,xmm7 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+30h] 
-  psubw       xmm6,[esp+40h] 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm1,xmm6 
-  movdqa      xmm6,[esp+60h] 
-  psubw       xmm6,[esp+30h] 
-  pabsw       xmm6,xmm6 
-  pand        xmm0,xmm7 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+70h] 
-  psubw       xmm6,[esp+40h] 
-  pabsw       xmm6,xmm6 
-  pand        xmm1,xmm7 
-  pcmpgtw     xmm2,xmm6 
-  pand        xmm1,xmm2 
-  mov         eax,2 
-  movsx       ecx,ax 
-  movd        xmm2,ecx 
-  movdqa      xmm6,xmm2 
-  punpcklwd   xmm6,xmm2 
-  pshufd      xmm2,xmm6,0 
-  movdqa      [esp+20h],xmm2 
-  movdqa      xmm2,xmm3 
-  paddw       xmm2,xmm3 
-  paddw       xmm2,xmm4 
-  paddw       xmm2,[esp+50h] 
-  paddw       xmm2,[esp+20h] 
-  psraw       xmm2,2 
-  movdqa      xmm6,xmm0 
-  pand        xmm6,xmm2 
-  movdqa      xmm2,xmm0 
-  pandn       xmm2,xmm4 
-  por         xmm6,xmm2 
-  movdqa      xmm2,[esp+60h] 
-  movdqa      xmm7,xmm2 
-  paddw       xmm7,xmm2 
-  paddw       xmm7,[esp+30h] 
-  paddw       xmm7,[esp+70h] 
-  paddw       xmm7,[esp+20h] 
-  movdqa      xmm4,xmm1 
-  movdqa      xmm2,xmm1 
-  pandn       xmm2,[esp+30h] 
-  psraw       xmm7,2 
-  pand        xmm4,xmm7 
-  por         xmm4,xmm2 
-  movdqa      xmm2,[esp+50h] 
-  packuswb    xmm6,xmm4 
-  movdqa      [esp+90h],xmm6 
-  movdqa      xmm6,xmm2 
-  paddw       xmm6,xmm2 
-  movdqa      xmm2,[esp+20h] 
-  paddw       xmm6,xmm5 
-  paddw       xmm6,xmm3 
-  movdqa      xmm4,xmm0 
-  pandn       xmm0,xmm5 
-  paddw       xmm6,xmm2 
-  psraw       xmm6,2 
-  pand        xmm4,xmm6 
-  por         xmm4,xmm0 
-  movdqa      xmm0,[esp+70h] 
-  movdqa      xmm5,xmm0 
-  paddw       xmm5,xmm0 
-  movdqa      xmm0,[esp+40h] 
-  paddw       xmm5,xmm0 
-  paddw       xmm5,[esp+60h] 
-  movdqa      xmm3,xmm1 
-  paddw       xmm5,xmm2 
-  psraw       xmm5,2 
-  pand        xmm3,xmm5 
-  pandn       xmm1,xmm0 
-  por         xmm3,xmm1 
-  packuswb    xmm4,xmm3 
-  movdqa      [esp+0A0h],xmm4 
-  mov         esi,dword [esp+10h] 
-  movdqa      xmm0,[esi] 
-  movdqa      xmm1,[esi+10h] 
-  movdqa      xmm2,[esi+20h] 
-  movdqa      xmm3,[esi+30h] 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         esi,dword [esp+1Ch] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+14h] 
-  mov         edi,dword [esp+8] 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         esi,dword [esp+18h] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         edi,dword [esp+0Ch] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  pop         edi  
-  pop         esi  
-  mov         esp,ebp 
-  pop         ebp  
-  ret              
-  
-;*******************************************************************************
-;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
-;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-  
-WELS_EXTERN  DeblockChromaLt4H_sse2
-  
-ALIGN  16
-
-DeblockChromaLt4H_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,108h   
-  mov         ecx,dword [ebp+8] 
-  mov         edx,dword [ebp+0Ch] 
-  mov         eax,dword [ebp+10h] 
-  sub         ecx,2 
-  sub         edx,2 
-  push        esi  
-  lea         esi,[eax+eax*2] 
-  mov         dword [esp+10h],ecx 
-  mov         dword [esp+4],edx 
-  lea         ecx,[ecx+eax*4] 
-  lea         edx,[edx+eax*4] 
-  lea         eax,[esp+6Ch] 
-  push        edi  
-  mov         dword [esp+0Ch],esi 
-  mov         dword [esp+18h],ecx 
-  mov         dword [esp+10h],edx 
-  mov         dword [esp+1Ch],eax 
-  mov         esi,dword [esp+14h] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+0Ch] 
-  movd        xmm0,dword [esi] 
-  movd        xmm1,dword [esi+ecx] 
-  movd        xmm2,dword [esi+ecx*2] 
-  movd        xmm3,dword [esi+edx] 
-  mov         esi,dword [esp+8] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [esi+ecx] 
-  movd        xmm6,dword [esi+ecx*2] 
-  movd        xmm7,dword [esi+edx] 
-  punpckldq   xmm0,xmm4 
-  punpckldq   xmm1,xmm5 
-  punpckldq   xmm2,xmm6 
-  punpckldq   xmm3,xmm7 
-  mov         esi,dword [esp+18h] 
-  mov         edi,dword [esp+10h] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [edi] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm0,xmm4 
-  movd        xmm4,dword [esi+ecx] 
-  movd        xmm5,dword [edi+ecx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm1,xmm4 
-  movd        xmm4,dword [esi+ecx*2] 
-  movd        xmm5,dword [edi+ecx*2] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm2,xmm4 
-  movd        xmm4,dword [esi+edx] 
-  movd        xmm5,dword [edi+edx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm3,xmm4 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         edi,dword [esp+1Ch] 
-  movdqa      [edi],xmm0 
-  movdqa      [edi+10h],xmm5 
-  movdqa      [edi+20h],xmm1 
-  movdqa      [edi+30h],xmm6 
-  mov         eax,dword [ebp+1Ch] 
-  movsx       cx,byte [eax+3] 
-  movsx       dx,byte [eax+2] 
-  movsx       si,byte [eax+1] 
-  movsx       ax,byte [eax] 
-  movzx       edi,cx 
-  movzx       ecx,cx 
-  movd        xmm2,ecx 
-  movzx       ecx,dx 
-  movzx       edx,dx 
-  movd        xmm3,ecx 
-  movd        xmm4,edx 
-  movzx       ecx,si 
-  movzx       edx,si 
-  movd        xmm5,ecx 
-  pxor        xmm0,xmm0 
-  movd        xmm6,edx 
-  movzx       ecx,ax 
-  movdqa      [esp+60h],xmm0 
-  movzx       edx,ax 
-  movsx       eax,word [ebp+14h] 
-  punpcklwd   xmm6,xmm2 
-  movd        xmm1,edi 
-  movd        xmm7,ecx 
-  movsx       ecx,word [ebp+18h] 
-  movd        xmm0,edx 
-  punpcklwd   xmm7,xmm3 
-  punpcklwd   xmm5,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  punpcklwd   xmm7,xmm5 
-  movdqa      xmm5,[esp+0A0h] 
-  punpcklwd   xmm0,xmm4 
-  punpcklwd   xmm0,xmm6 
-  movdqa      xmm6, [esp+70h] 
-  punpcklwd   xmm0,xmm7 
-  movdqa      xmm7,[esp+80h] 
-  movdqa      xmm2,xmm1 
-  psubw       xmm2,xmm0 
-  movdqa      [esp+0D0h],xmm2 
-  movd        xmm2,eax 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm4,xmm3,0 
-  movd        xmm2,ecx 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm2,xmm3,0 
-  movdqa      xmm3, [esp+90h] 
-  movdqa      [esp+50h],xmm2 
-  movdqa      xmm2,xmm6 
-  punpcklbw   xmm2,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+40h],xmm2 
-  movdqa      [esp+0B0h],xmm6 
-  movdqa      xmm6,[esp+90h] 
-  movdqa      xmm2,xmm7 
-  punpckhbw   xmm7,xmm1 
-  punpckhbw   xmm6,xmm1 
-  punpcklbw   xmm2,xmm1 
-  punpcklbw   xmm3,xmm1 
-  punpcklbw   xmm5,xmm1 
-  movdqa      [esp+0F0h],xmm7 
-  movdqa      [esp+0C0h],xmm6 
-  movdqa      xmm6, [esp+0A0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+0E0h],xmm6 
-  mov         edx,4 
-  movsx       eax,dx 
-  movd        xmm6,eax 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      [esp+30h],xmm6 
-  movdqa      xmm7, [esp+40h] 
-  psubw       xmm7,xmm5 
-  movdqa      xmm6,xmm0 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      [esp+60h],xmm6 
-  movdqa      xmm1, [esp+0D0h] 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm2 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm7 
-  paddw       xmm6,[esp+30h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm1,xmm6 
-  movdqa      xmm7,[esp+50h] 
-  movdqa      [esp+20h],xmm0 
-  movdqa      xmm6, [esp+20h] 
-  pminsw      xmm6,xmm1 
-  movdqa      [esp+20h],xmm6 
-  movdqa      xmm6,xmm4 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1, [esp+40h] 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm7,xmm1 
-  movdqa      xmm1, [esp+50h] 
-  pand        xmm6,xmm7 
-  movdqa      xmm7, [esp+50h] 
-  psubw       xmm5,xmm3 
-  pabsw       xmm5,xmm5 
-  pcmpgtw     xmm1,xmm5 
-  movdqa      xmm5, [esp+0B0h] 
-  psubw       xmm5,[esp+0E0h] 
-  pand        xmm6,xmm1 
-  pand        xmm6, [esp+60h] 
-  movdqa      xmm1, [esp+20h] 
-  pand        xmm1,xmm6 
-  movdqa      xmm6, [esp+0C0h] 
-  movdqa      [esp+40h],xmm1 
-  movdqa      xmm1, [esp+0F0h] 
-  psubw       xmm6,xmm1 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm5 
-  paddw       xmm6, [esp+30h] 
-  movdqa      xmm5, [esp+0D0h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm5,xmm6 
-  pminsw      xmm0,xmm5 
-  movdqa      xmm5,[esp+0C0h] 
-  movdqa      xmm6,xmm1 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm4,xmm6 
-  movdqa      xmm6,[esp+0B0h] 
-  psubw       xmm6,xmm1 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6, [esp+0E0h] 
-  pand        xmm4,xmm7 
-  movdqa      xmm7, [esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  pand        xmm4,xmm7 
-  pand        xmm4,[esp+60h] 
-  pand        xmm0,xmm4 
-  movdqa      xmm4, [esp+40h] 
-  paddw       xmm2,xmm4 
-  paddw       xmm1,xmm0 
-  psubw       xmm3,xmm4 
-  psubw       xmm5,xmm0 
-  packuswb    xmm2,xmm1 
-  packuswb    xmm3,xmm5 
-  movdqa      [esp+80h],xmm2 
-  movdqa      [esp+90h],xmm3 
-  mov         esi,dword [esp+1Ch] 
-  movdqa      xmm0, [esi] 
-  movdqa      xmm1, [esi+10h] 
-  movdqa      xmm2, [esi+20h] 
-  movdqa      xmm3, [esi+30h] 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         esi,dword [esp+14h] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+0Ch] 
-  mov         edi,dword [esp+8] 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         esi,dword [esp+18h] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         edi,dword [esp+10h] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6  
-  pop         edi  
-  pop         esi   
-  mov         esp,ebp 
-  pop         ebp  
-  ret     
-  
-  
-  
-;*******************************************************************************
-;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
-;                                 int32_t iBeta, int8_t * pTC)
-;*******************************************************************************
-  
-
-WELS_EXTERN  DeblockLumaLt4V_sse2
-  
-ALIGN  16
-
-DeblockLumaLt4V_sse2:
-    push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H
-	sub	esp, 420				; 000001a4H
-	mov	eax, dword [ebp+8]
-	mov	ecx, dword [ebp+12]
-
-	pxor	xmm0, xmm0
-	push	ebx
-	mov	edx, dword [ebp+24]
-	movdqa	[esp+424-384], xmm0
-	push	esi
-
-	lea	esi, [ecx+ecx*2]
-	push	edi
-	mov	edi, eax
-	sub	edi, esi
-	movdqa	xmm0, [edi]
-
-	lea	esi, [ecx+ecx]
-	movdqa	[esp+432-208], xmm0
-	mov	edi, eax
-	sub	edi, esi
-	movdqa	xmm0, [edi]
-	movdqa	[esp+448-208], xmm0
-
-	mov	ebx, eax
-	sub	ebx, ecx
-	movdqa	xmm0, [ebx]
-	movdqa	[esp+464-208], xmm0
-
-	movdqa	xmm0, [eax]
-
-	add	ecx, eax
-	movdqa	[esp+480-208], xmm0
-	movdqa	xmm0, [ecx]
-	mov	dword [esp+432-404], ecx
-
-	movsx	ecx, word [ebp+16]
-	movdqa	[esp+496-208], xmm0
-	movdqa	xmm0, [esi+eax]
-
-	movsx	si, byte [edx]
-	movdqa	[esp+512-208], xmm0
-	movd	xmm0, ecx
-	movsx	ecx, word [ebp+20]
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	pshufd	xmm0, xmm1, 0
-	movdqa	[esp+432-112], xmm0
-	movd	xmm0, ecx
-	movsx	cx, byte [edx+1]
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	mov	dword [esp+432-408], ebx
-	movzx	ebx, cx
-	pshufd	xmm0, xmm1, 0
-	movd	xmm1, ebx
-	movzx	ebx, cx
-	movd	xmm2, ebx
-	movzx	ebx, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, si
-	movd	xmm5, ecx
-	movzx	ecx, si
-	movd	xmm6, ecx
-	movzx	ecx, si
-	movd	xmm7, ecx
-	movzx	ecx, si
-	movdqa	[esp+432-336], xmm0
-	movd	xmm0, ecx
-
-	movsx	cx, byte [edx+3]
-	movsx	dx, byte [edx+2]
-	movd	xmm3, ebx
-	punpcklwd xmm0, xmm4
-	movzx	esi, cx
-	punpcklwd xmm6, xmm2
-	punpcklwd xmm5, xmm1
-	punpcklwd xmm0, xmm6
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	punpcklwd xmm0, xmm7
-	movdqa	[esp+432-400], xmm0
-	movd	xmm0, esi
-	movzx	esi, cx
-	movd	xmm2, esi
-	movzx	esi, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, dx
-	movd	xmm3, esi
-	movd	xmm5, ecx
-	punpcklwd xmm5, xmm0
-
-	movdqa	xmm0, [esp+432-384]
-	movzx	ecx, dx
-	movd	xmm6, ecx
-	movzx	ecx, dx
-	movzx	edx, dx
-	punpcklwd xmm6, xmm2
-	movd	xmm7, ecx
-	movd	xmm1, edx
-
-	movdqa	xmm2, [esp+448-208]
-	punpcklbw xmm2, xmm0
-
-	mov	ecx, 4
-	movsx	edx, cx
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	movdqa	xmm5, [esp+496-208]
-	movdqa	xmm3, [esp+464-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-240], xmm5
-	movdqa	xmm5, [esp+512-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-352], xmm5
-	punpcklwd xmm1, xmm4
-	movdqa	xmm4, [esp+432-208]
-	punpcklwd xmm1, xmm6
-	movdqa	xmm6, [esp+480-208]
-	punpcklwd xmm1, xmm7
-	punpcklbw xmm6, xmm0
-	punpcklbw xmm3, xmm0
-	punpcklbw xmm4, xmm0
-	movdqa	xmm7, xmm3
-	psubw	xmm7, xmm4
-	pabsw	xmm7, xmm7
-	movdqa	[esp+432-272], xmm4
-	movdqa	xmm4, [esp+432-336]
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-352]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5
-	movdqa	xmm5, xmm3
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5
-	movdqa	xmm5, [esp+432-400]
-	psubw	xmm5, [esp+432-288]
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5
-	movdqa	xmm5, xmm6
-	psubw	xmm5, xmm3
-	movdqa	[esp+432-32], xmm6
-	psubw	xmm6, [esp+432-240]
-	movdqa	xmm7, xmm5
-	movdqa	[esp+432-384], xmm5
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6
-
-	pand	xmm5, xmm7
-	movdqa	xmm6, xmm3
-	psubw	xmm6, xmm2
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-400]
-	pand	xmm5, xmm7
-	movdqa	xmm7, xmm6
-	pcmpeqw	xmm6, xmm0
-	pcmpgtw	xmm7, xmm0
-	por	xmm7, xmm6
-	pand	xmm5, xmm7
-	movdqa	[esp+432-320], xmm5
-	movd	xmm5, edx
-	movdqa	xmm6, xmm5
-	punpcklwd xmm6, xmm5
-	pshufd	xmm5, xmm6, 0
-	movdqa	[esp+432-336], xmm5
-	movdqa	xmm5, [esp+432-224]
-	movdqa	[esp+432-368], xmm5
-	movdqa	xmm6, xmm0
-	psubw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-384]
-	psllw	xmm5, 2
-	movdqa	xmm7, xmm2
-	psubw	xmm7, [esp+432-240]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	psraw	xmm7, 3
-	pmaxsw	xmm6, xmm7
-	pminsw	xmm5, xmm6
-
-	pand	xmm5, [esp+432-320]
-	movdqa	xmm6, [esp+432-400]
-	movdqa	[esp+432-64], xmm5
-	movdqa	[esp+432-384], xmm6
-	movdqa	xmm5, xmm0
-	psubw	xmm5, xmm6
-	movdqa	[esp+432-368], xmm5
-	movdqa	xmm6, xmm5
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm2
-	paddw	xmm7, xmm2
-	psubw	xmm5, xmm7
-	psraw	xmm5, 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-384]
-	pminsw	xmm5, xmm6
-
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-288]
-	movdqa	xmm6, [esp+432-240]
-	movdqa	[esp+432-96], xmm5
-	movdqa	xmm5, [esp+432-352]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm6
-	paddw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
-	psubw	xmm5, xmm7
-
-	movdqa	xmm7, [esp+496-208]
-	psraw	xmm5, 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-400]
-	pminsw	xmm5, xmm6
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-256]
-	movdqa	xmm6, [esp+448-208]
-	punpckhbw xmm7, xmm0
-	movdqa	[esp+432-352], xmm7
-
-	movdqa	xmm7, [esp+512-208]
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-48], xmm5
-	movdqa	xmm5, [esp+432-208]
-	movdqa	[esp+432-368], xmm6
-	movdqa	xmm6, [esp+464-208]
-	punpckhbw xmm7, xmm0
-	punpckhbw xmm5, xmm0
-	movdqa	[esp+432-384], xmm7
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-400], xmm6
-
-	movdqa	xmm7, [esp+432-400]
-	movdqa	xmm6, [esp+480-208]
-	psubw	xmm7, xmm5
-	movdqa	[esp+432-16], xmm5
-	pabsw	xmm7, xmm7
-	punpckhbw xmm6, xmm0
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5
-
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-384]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5
-
-	movdqa	xmm5, [esp+432-400]
-	movdqa	[esp+432-80], xmm6
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5
-
-	movdqa	xmm5, xmm1
-	psubw	xmm5, [esp+432-288]
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5
-	movdqa	xmm5, xmm6
-	psubw	xmm5, [esp+432-400]
-	psubw	xmm6, [esp+432-352]
-	movdqa	[esp+432-272], xmm5
-	movdqa	xmm7, xmm5
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7
-	movdqa	xmm7, xmm4
-	pabsw	xmm6, xmm6
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
-
-	pand	xmm5, xmm7
-	movdqa	xmm7, [esp+432-400]
-	psubw	xmm7, xmm6
-	psubw	xmm6, [esp+432-352]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-	pand	xmm5, xmm4
-
-	paddw	xmm2, [esp+432-96]
-	movdqa	xmm4, xmm1
-	pcmpgtw	xmm4, xmm0
-	movdqa	xmm7, xmm1
-	pcmpeqw	xmm7, xmm0
-	por	xmm4, xmm7
-	pand	xmm5, xmm4
-	movdqa	xmm4, [esp+432-224]
-	movdqa	[esp+432-320], xmm5
-	movdqa	xmm5, [esp+432-272]
-	movdqa	xmm7, xmm0
-	psubw	xmm7, xmm4
-	psubw	xmm0, xmm1
-	psllw	xmm5, 2
-	paddw	xmm6, xmm5
-	paddw	xmm6, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	movdqa	[esp+432-336], xmm0
-	psraw	xmm6, 3
-	pmaxsw	xmm7, xmm6
-	pminsw	xmm4, xmm7
-	pand	xmm4, [esp+432-320]
-	movdqa	xmm6, xmm0
-	movdqa	xmm0, [esp+432-16]
-	paddw	xmm0, [esp+432-304]
-	movdqa	[esp+432-272], xmm4
-	movdqa	xmm4, [esp+432-368]
-	paddw	xmm4, xmm4
-	psubw	xmm0, xmm4
-
-	movdqa	xmm4, [esp+432-64]
-	psraw	xmm0, 1
-	pmaxsw	xmm6, xmm0
-	movdqa	xmm0, [esp+432-400]
-	movdqa	xmm7, xmm1
-	pminsw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-320]
-	pand	xmm7, xmm6
-	pand	xmm7, [esp+432-288]
-	paddw	xmm5, xmm7
-	packuswb xmm2, xmm5
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm0, xmm5
-	paddw	xmm3, xmm4
-	packuswb xmm3, xmm0
-
-	movdqa	xmm0, [esp+432-32]
-	psubw	xmm0, xmm4
-	movdqa	xmm4, [esp+432-80]
-	psubw	xmm4, xmm5
-
-	movdqa	xmm5, [esp+432-240]
-	paddw	xmm5, [esp+432-48]
-	packuswb xmm0, xmm4
-	movdqa	xmm4, [esp+432-384]
-	paddw	xmm4, [esp+432-304]
-	movdqa	[esp+480-208], xmm0
-	movdqa	xmm0, [esp+432-352]
-	movdqa	xmm7, xmm0
-	paddw	xmm0, xmm0
-
-	mov	ecx, dword [esp+432-408]
-
-	mov	edx, dword [esp+432-404]
-	psubw	xmm4, xmm0
-	movdqa	xmm0, [esp+432-336]
-	movdqa	[edi], xmm2
-	psraw	xmm4, 1
-	pmaxsw	xmm0, xmm4
-	pminsw	xmm1, xmm0
-	movdqa	xmm0, [esp+480-208]
-
-	pop	edi
-	pand	xmm1, xmm6
-	pand	xmm1, [esp+428-256]
-	movdqa	[ecx], xmm3
-	paddw	xmm7, xmm1
-	pop	esi
-	packuswb xmm5, xmm7
-	movdqa	[eax], xmm0
-	movdqa	[edx], xmm5
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
-
-
-;*******************************************************************************
-;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
-;                                 int32_t iBeta)
-;*******************************************************************************
-
-WELS_EXTERN  DeblockLumaEq4V_sse2
-  
-ALIGN  16
-
-DeblockLumaEq4V_sse2:
-
-	push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H
-	sub	esp, 628				; 00000274H
-	mov	eax, dword [ebp+8]
-	mov	ecx, dword [ebp+12]
-	push	ebx
-	push	esi
-
-	lea	edx, [ecx*4]
-	pxor	xmm0, xmm0
-	movdqa	xmm2, xmm0
-
-	movdqa	xmm0, [ecx+eax]
-	mov	esi, eax
-	sub	esi, edx
-	movdqa	xmm3, [esi]
-	movdqa	xmm5, [eax]
-	push	edi
-	lea	edi, [ecx+ecx]
-	lea	ebx, [ecx+ecx*2]
-	mov	dword [esp+640-600], edi
-	mov	esi, eax
-	sub	esi, edi
-	movdqa	xmm1, [esi]
-	movdqa	 [esp+720-272], xmm0
-	mov	edi, eax
-	sub	edi, ecx
-	movdqa	xmm4, [edi]
-	add	ecx, eax
-	mov	dword [esp+640-596], ecx
-
-	mov	ecx, dword [esp+640-600]
-	movdqa	xmm0, [ecx+eax]
-	movdqa	 [esp+736-272], xmm0
-
-	movdqa	xmm0, [eax+ebx]
-	mov	edx, eax
-	sub	edx, ebx
-
-	movsx	ebx, word [ebp+16]
-	movdqa	xmm6, [edx]
-	add	ecx, eax
-	movdqa	 [esp+752-272], xmm0
-	movd	xmm0, ebx
-
-	movsx	ebx, word [ebp+20]
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
-	movdqa	 [esp+640-320], xmm0
-	movd	xmm0, ebx
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
-
-	movdqa	xmm7, [esp+736-272]
-	punpcklbw xmm7, xmm2
-	movdqa	 [esp+640-416], xmm7
-	movdqa	 [esp+640-512], xmm0
-	movdqa	xmm0, xmm1
-	movdqa	 [esp+672-272], xmm1
-	movdqa	xmm1, xmm4
-	movdqa	 [esp+704-272], xmm5
-	punpcklbw xmm5, xmm2
-	punpcklbw xmm1, xmm2
-
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7
-	punpcklbw xmm0, xmm2
-	movdqa	 [esp+688-272], xmm4
-	movdqa	xmm4, [esp+720-272]
-	movdqa	 [esp+640-480], xmm0
-
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm0
-
-	movdqa	xmm0, [esp+640-512]
-	pabsw	xmm7, xmm7
-	punpcklbw xmm4, xmm2
-	pcmpgtw	xmm0, xmm7
-	movdqa	 [esp+640-384], xmm4
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm4
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+656-272], xmm6
-	punpcklbw xmm6, xmm2
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-48], xmm2
-	movdqa	 [esp+640-368], xmm6
-	movdqa	 [esp+640-144], xmm1
-	movdqa	 [esp+640-400], xmm5
-	pcmpgtw	xmm4, xmm7
-	pand	xmm0, xmm4
-	movdqa	xmm4, [esp+640-320]
-	pcmpgtw	xmm4, [esp+640-560]
-	pand	xmm0, xmm4
-
-	mov	ebx, 2
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, [esp+640-320]
-	psraw	xmm4, 2
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm7
-	movdqa	 [esp+640-576], xmm4
-	pcmpgtw	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4
-
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+640-624], xmm7
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm6
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-544], xmm4
-	movdqa	xmm4, [esp+640-512]
-	movdqa	xmm7, xmm5
-	psubw	xmm7, [esp+640-416]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4
-
-	movdqa	xmm4, [esp+640-544]
-	pandn	xmm4, xmm6
-	movdqa	 [esp+640-16], xmm4
-	mov	ebx, 4
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, xmm3
-	punpcklbw xmm4, xmm2
-	psllw	xmm4, 1
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, [esp+640-480]
-
-	movdqa	xmm6, [esp+640-560]
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm1
-	movdqa	 [esp+640-592], xmm7
-	paddw	xmm4, xmm5
-	paddw	xmm4, xmm7
-	movdqa	xmm7, [esp+640-416]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-80], xmm6
-	movdqa	xmm6, [esp+752-272]
-	punpcklbw xmm6, xmm2
-	psllw	xmm6, 1
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-384]
-
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm6, xmm5
-	paddw	xmm6, xmm1
-	paddw	xmm6, [esp+640-592]
-	psraw	xmm6, 3
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-112], xmm6
-	movdqa	xmm6, [esp+640-544]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-336], xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-528], xmm6
-	movdqa	xmm6, [esp+640-368]
-	paddw	xmm6, xmm7
-	movdqa	xmm7, xmm1
-	psraw	xmm4, 3
-	pand	xmm4, [esp+640-544]
-	paddw	xmm7, xmm5
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-
-	paddw	xmm5, xmm1
-	psraw	xmm6, 2
-	pand	xmm7, xmm6
-
-	movdqa	xmm6, [esp+640-384]
-	movdqa	 [esp+640-64], xmm7
-	movdqa	xmm7, [esp+640-560]
-	pandn	xmm7, xmm6
-	movdqa	 [esp+640-304], xmm7
-	movdqa	xmm7, [esp+640-560]
-	movdqa	 [esp+640-528], xmm7
-	movdqa	xmm7, [esp+640-416]
-	paddw	xmm7, xmm6
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pand	xmm5, xmm7
-	movdqa	 [esp+640-32], xmm5
-
-	movdqa	xmm5, [esp+640-544]
-	movdqa	 [esp+640-528], xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	xmm7, xmm5
-	paddw	xmm7, xmm5
-	movdqa	xmm5, xmm1
-	paddw	xmm5, xmm6
-	paddw	xmm6, [esp+640-592]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pandn	xmm5, xmm7
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm7, xmm1
-	paddw	xmm7, [esp+640-400]
-	movdqa	xmm1, [esp+640-544]
-	movdqa	 [esp+640-352], xmm5
-	movdqa	xmm5, [esp+640-368]
-	psllw	xmm7, 1
-	paddw	xmm7, xmm6
-	paddw	xmm5, xmm7
-
-	movdqa	xmm7, [esp+640-400]
-	psraw	xmm5, 3
-	pand	xmm1, xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	 [esp+640-96], xmm1
-	movdqa	xmm1, [esp+640-560]
-	movdqa	 [esp+640-528], xmm1
-	movdqa	xmm1, [esp+640-384]
-	movdqa	xmm6, xmm1
-	paddw	xmm6, xmm1
-	paddw	xmm1, [esp+640-400]
-	paddw	xmm1, [esp+640-144]
-	paddw	xmm7, xmm5
-	paddw	xmm5, [esp+640-592]
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-	psraw	xmm6, 2
-	psllw	xmm1, 1
-	paddw	xmm1, xmm5
-
-	movdqa	xmm5, [esp+656-272]
-	pandn	xmm7, xmm6
-	movdqa	xmm6, [esp+640-416]
-	paddw	xmm6, xmm1
-	movdqa	xmm1, [esp+640-560]
-	psraw	xmm6, 3
-	pand	xmm1, xmm6
-
-	movdqa	xmm6, [esp+704-272]
-	movdqa	 [esp+640-128], xmm1
-	movdqa	xmm1, [esp+672-272]
-	punpckhbw xmm1, xmm2
-	movdqa	 [esp+640-448], xmm1
-	movdqa	xmm1, [esp+688-272]
-	punpckhbw xmm1, xmm2
-	punpckhbw xmm6, xmm2
-	movdqa	 [esp+640-288], xmm7
-	punpckhbw xmm5, xmm2
-	movdqa	 [esp+640-496], xmm1
-	movdqa	 [esp+640-432], xmm6
-
-	movdqa	xmm7, [esp+720-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-464], xmm7
-
-	movdqa	xmm7, [esp+736-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-528], xmm7
-
-	movdqa	xmm7, xmm6
-
-	psubw	xmm6, [esp+640-464]
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7
-	por	xmm4, [esp+640-16]
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm1
-	psubw	xmm7, [esp+640-448]
-
-	movdqa	xmm1, [esp+640-512]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm1, xmm7
-	movdqa	xmm7, [esp+640-512]
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+640-320]
-	pand	xmm1, xmm7
-	movdqa	xmm7, [esp+640-560]
-	pcmpgtw	xmm6, xmm7
-	pand	xmm1, xmm6
-
-	movdqa	xmm6, [esp+640-576]
-	pcmpgtw	xmm6, xmm7
-
-	movdqa	xmm7, [esp+640-496]
-	punpckhbw xmm3, xmm2
-	movdqa	 [esp+640-560], xmm6
-	movdqa	xmm6, [esp+640-512]
-	psubw	xmm7, xmm5
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
-
-	pand	xmm6, [esp+640-560]
-	movdqa	xmm7, [esp+640-432]
-	psubw	xmm7, [esp+640-528]
-
-	psllw	xmm3, 1
-	movdqa	 [esp+640-544], xmm6
-	movdqa	xmm6, [esp+640-512]
-
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, [esp+640-448]
-	paddw	xmm3, [esp+640-496]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-560], xmm6
-
-	movdqa	xmm6, xmm0
-	pand	xmm6, xmm4
-	movdqa	xmm4, xmm0
-	pandn	xmm4, [esp+640-368]
-	por	xmm6, xmm4
-	movdqa	xmm4, [esp+640-432]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm3, 3
-	pand	xmm3, xmm2
-	pandn	xmm2, xmm5
-	por	xmm3, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm3
-	movdqa	xmm3, [esp+640-64]
-	por	xmm3, [esp+640-336]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm5
-	por	xmm7, xmm2
-
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-480]
-	por	xmm2, xmm3
-	packuswb xmm6, xmm7
-	movdqa	 [esp+640-336], xmm2
-	movdqa	 [esp+656-272], xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	xmm2, xmm5
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm3, xmm1
-	movdqa	xmm7, [esp+640-496]
-	paddw	xmm7, xmm4
-	paddw	xmm2, xmm7
-	paddw	xmm2, [esp+640-624]
-	movdqa	xmm7, [esp+640-544]
-	psraw	xmm2, 2
-	pand	xmm6, xmm2
-	movdqa	xmm2, [esp+640-448]
-	pandn	xmm7, xmm2
-	por	xmm6, xmm7
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm1
-	pandn	xmm6, xmm2
-	paddw	xmm2, [esp+640-496]
-	paddw	xmm2, xmm4
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-336]
-	packuswb xmm6, xmm3
-	psllw	xmm2, 1
-	movdqa	 [esp+672-272], xmm6
-	movdqa	xmm6, [esp+640-96]
-	por	xmm6, [esp+640-352]
-
-	movdqa	xmm3, xmm0
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm0
-	pandn	xmm6, [esp+640-144]
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-352], xmm3
-	movdqa	xmm3, [esp+640-464]
-	paddw	xmm3, [esp+640-592]
-	paddw	xmm2, xmm3
-	movdqa	xmm3, [esp+640-448]
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-496]
-	psraw	xmm5, 3
-	pand	xmm6, xmm5
-	movdqa	xmm5, [esp+640-464]
-	paddw	xmm2, xmm5
-	paddw	xmm5, [esp+640-432]
-	movdqa	xmm4, xmm3
-	paddw	xmm4, xmm3
-	paddw	xmm4, xmm2
-	paddw	xmm4, [esp+640-624]
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm4, 2
-	pandn	xmm2, xmm4
-	por	xmm6, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-496]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm6
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-352]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+688-272], xmm2
-	movdqa	xmm2, [esp+640-128]
-	por	xmm2, [esp+640-288]
-
-	movdqa	xmm4, xmm0
-	pand	xmm4, xmm2
-	paddw	xmm5, xmm6
-	movdqa	xmm2, xmm0
-	pandn	xmm2, [esp+640-400]
-	por	xmm4, xmm2
-	movdqa	xmm2, [esp+640-528]
-	psllw	xmm5, 1
-	paddw	xmm5, xmm3
-	movdqa	xmm3, [esp+640-560]
-	paddw	xmm2, xmm5
-	psraw	xmm2, 3
-	movdqa	 [esp+640-288], xmm4
-	movdqa	xmm4, [esp+640-560]
-	pand	xmm4, xmm2
-	movdqa	xmm2, [esp+640-464]
-	movdqa	xmm5, xmm2
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-432]
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm7, xmm1
-	paddw	xmm5, xmm2
-	paddw	xmm5, [esp+640-624]
-	movdqa	xmm6, [esp+640-560]
-	psraw	xmm5, 2
-	pandn	xmm3, xmm5
-	por	xmm4, xmm3
-	movdqa	xmm3, [esp+640-32]
-	por	xmm3, [esp+640-304]
-	pand	xmm7, xmm4
-	movdqa	xmm4, [esp+640-432]
-	movdqa	xmm5, [esp+640-464]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm4
-	paddw	xmm4, [esp+640-496]
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-288]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+704-272], xmm2
-
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-384]
-	por	xmm2, xmm3
-	movdqa	 [esp+640-304], xmm2
-	movdqa	xmm2, [esp+640-528]
-	movdqa	xmm3, xmm2
-	paddw	xmm3, [esp+640-464]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-624]
-	psraw	xmm3, 2
-	pand	xmm6, xmm3
-	movdqa	xmm3, [esp+640-560]
-	movdqa	xmm4, xmm3
-	pandn	xmm4, xmm5
-	por	xmm6, xmm4
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-304]
-	movdqa	xmm4, xmm1
-	pandn	xmm4, xmm5
-	por	xmm7, xmm4
-
-	movdqa	xmm4, xmm0
-	pandn	xmm0, [esp+640-416]
-	packuswb xmm6, xmm7
-	movdqa	xmm7, [esp+640-112]
-	por	xmm7, [esp+640-80]
-	pand	xmm4, xmm7
-	por	xmm4, xmm0
-	movdqa	xmm0, [esp+752-272]
-	punpckhbw xmm0, [esp+640-48]
-	psllw	xmm0, 1
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm5
-	paddw	xmm0, [esp+640-432]
-	paddw	xmm0, [esp+640-496]
-	paddw	xmm0, [esp+640-592]
-	psraw	xmm0, 3
-	pand	xmm0, xmm3
-	movdqa	xmm7, xmm1
-	pandn	xmm3, xmm2
-	por	xmm0, xmm3
-	pand	xmm7, xmm0
-
-	movdqa	xmm0, [esp+656-272]
-	movdqa	 [edx], xmm0
-
-	movdqa	xmm0, [esp+672-272]
-
-	mov	edx, dword [esp+640-596]
-	movdqa	 [esi], xmm0
-	movdqa	xmm0, [esp+688-272]
-	movdqa	 [edi], xmm0
-	movdqa	xmm0, [esp+704-272]
-
-	pop	edi
-	pandn	xmm1, xmm2
-	movdqa	 [eax], xmm0
-	por	xmm7, xmm1
-	pop	esi
-	packuswb xmm4, xmm7
-	movdqa	 [edx], xmm6
-	movdqa	 [ecx], xmm4
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
-  
-    
-;********************************************************************************
-;
-;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);     
-;
-;********************************************************************************
-
-WELS_EXTERN  DeblockLumaTransposeH2V_sse2
-
-ALIGN  16
-
-DeblockLumaTransposeH2V_sse2:
-    push    ebp
-    push    ebx
-    mov     ebp,   esp
-    and     esp,0FFFFFFF0h
-    sub     esp,   10h    
-    
-    mov     eax,   [ebp + 0Ch]  
-    mov     ecx,   [ebp + 10h]
-    lea     edx,   [eax + ecx * 8]
-    lea     ebx,   [ecx*3]
-    
-    movq    xmm0,  [eax] 
-    movq    xmm7,  [edx]
-    punpcklqdq   xmm0,  xmm7  
-    movq    xmm1,  [eax + ecx]
-    movq    xmm7,  [edx + ecx]
-    punpcklqdq   xmm1,  xmm7
-    movq    xmm2,  [eax + ecx*2] 
-    movq    xmm7,  [edx + ecx*2]
-    punpcklqdq   xmm2,  xmm7
-    movq    xmm3,  [eax + ebx]
-    movq    xmm7,  [edx + ebx]
-    punpcklqdq   xmm3,  xmm7
-    
-    lea     eax,   [eax + ecx * 4]
-    lea     edx,   [edx + ecx * 4]
-    movq    xmm4,  [eax] 
-    movq    xmm7,  [edx]
-    punpcklqdq   xmm4,  xmm7  
-    movq    xmm5,  [eax + ecx]
-    movq    xmm7,  [edx + ecx]
-    punpcklqdq   xmm5,  xmm7
-    movq    xmm6,  [eax + ecx*2] 
-    movq    xmm7,  [edx + ecx*2]
-    punpcklqdq   xmm6,  xmm7
-    
-    movdqa  [esp],   xmm0
-    movq    xmm7,  [eax + ebx]
-    movq    xmm0,  [edx + ebx]
-    punpcklqdq   xmm7,  xmm0
-    movdqa  xmm0,   [esp]
-    
-    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
-    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-    
-    mov    eax,   [ebp + 14h]
-    movdqa  [eax],    xmm4 
-    movdqa  [eax + 10h],  xmm2
-    movdqa  [eax + 20h],  xmm3
-    movdqa  [eax + 30h],  xmm7
-    movdqa  [eax + 40h],  xmm5
-    movdqa  [eax + 50h],  xmm1
-    movdqa  [eax + 60h],  xmm6
-    movdqa  [eax + 70h],  xmm0   
-    
-    mov     esp,   ebp
-    pop     ebx
-    pop     ebp
-    ret
-    
-    
-    
-;*******************************************************************************************
-;
-;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
-;
-;*******************************************************************************************
-
-WELS_EXTERN   DeblockLumaTransposeV2H_sse2
-
-ALIGN  16
-
-DeblockLumaTransposeV2H_sse2:
-    push     ebp
-    mov      ebp,   esp
-    
-    and     esp,  0FFFFFFF0h
-    sub     esp,   10h  
-    
-    mov      eax,   [ebp + 10h]  
-    mov      ecx,   [ebp + 0Ch]
-    mov      edx,   [ebp + 08h]
-      
-    movdqa   xmm0,  [eax]
-    movdqa   xmm1,  [eax + 10h]
-    movdqa   xmm2,  [eax + 20h]
-    movdqa   xmm3,	[eax + 30h]
-    movdqa   xmm4,	[eax + 40h]
-    movdqa   xmm5,	[eax + 50h]
-    movdqa   xmm6,	[eax + 60h]
-    movdqa   xmm7,	[eax + 70h]
-    
-    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
-    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-    
-    lea      eax,   [ecx * 3]
-    
-    movq     [edx],  xmm4 
-    movq     [edx + ecx],  xmm2
-    movq     [edx + ecx*2],  xmm3
-    movq     [edx + eax],  xmm7
-    
-    lea      edx,   [edx + ecx*4]
-    movq     [edx],  xmm5 
-    movq     [edx + ecx],  xmm1
-    movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0    
-    
-    psrldq    xmm4,   8
-    psrldq    xmm2,   8
-    psrldq    xmm3,   8
-    psrldq    xmm7,   8
-    psrldq    xmm5,   8
-    psrldq    xmm1,   8
-    psrldq    xmm6,   8
-    psrldq    xmm0,   8
-    
-    lea       edx,  [edx + ecx*4]
-    movq     [edx],  xmm4 
-    movq     [edx + ecx],  xmm2
-    movq     [edx + ecx*2],  xmm3
-    movq     [edx + eax],  xmm7
-    
-    lea      edx,   [edx + ecx*4]
-    movq     [edx],  xmm5 
-    movq     [edx + ecx],  xmm1
-    movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0   
-    
-    
-    mov      esp,   ebp
-    pop      ebp
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  deblock.asm
+;*
+;*  Abstract
+;*      edge loop
+;*
+;*  History
+;*      08/07/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata pData
+%else
+SECTION .rodata align=16
+%endif
+
+SECTION .text
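+;  Note: the routines below follow the 32-bit cdecl convention. After the
+;  usual prologue the arguments sit at [ebp+8], [ebp+0Ch], ... and esp is
+;  aligned down to a 16-byte boundary so movdqa can be used on the stack
+;  temporaries.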
+
+;********************************************************************************
+;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                             int32_t iAlpha, int32_t iBeta)
+;********************************************************************************
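+;  Strong (bS==4) chroma filter across a horizontal edge. Cb and Cr are
+;  packed side by side (8+8 columns per xmm register) and widened to 16-bit.
+;  Where the edge is active, i.e.
+;      |p0-q0| < iAlpha  &&  |p1-p0| < iBeta  &&  |q1-q0| < iBeta,
+;  the boundary samples are replaced by
+;      p0' = (2*p1 + p0 + q1 + 2) >> 2
+;      q0' = (2*q1 + q0 + p1 + 2) >> 2
+;  while inactive columns keep their original values via the pand/pandn/por
+;  masking below.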
+WELS_EXTERN   DeblockChromaEq4V_sse2
+
+ALIGN  16
+DeblockChromaEq4V_sse2:
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,68h
+  mov         edx,[ebp+10h]      ;  iStride
+  mov         eax,[ebp+8]        ;  pPixCb
+  mov         ecx,[ebp+0Ch]      ;  pPixCr
+  movq        xmm4,[ecx]
+  movq        xmm5,[edx+ecx]
+  push        esi
+  push        edi
+  lea         esi,[edx+edx]
+  mov         edi,eax
+  sub         edi,esi
+  movq        xmm1,[edi]
+  mov         edi,ecx
+  sub         edi,esi
+  movq        xmm2,[edi]
+  punpcklqdq  xmm1,xmm2
+  mov         esi,eax
+  sub         esi,edx
+  movq        xmm2,[esi]
+  mov         edi,ecx
+  sub         edi,edx
+  movq        xmm3,[edi]
+  punpcklqdq  xmm2,xmm3
+  movq        xmm3,[eax]
+  punpcklqdq  xmm3,xmm4
+  movq        xmm4,[edx+eax]
+  mov       edx, [ebp + 14h]
+  punpcklqdq  xmm4,xmm5
+  movd        xmm5,edx
+  mov       edx, [ebp + 18h]
+  pxor        xmm0,xmm0
+  movdqa      xmm6,xmm5
+  punpcklwd   xmm6,xmm5
+  pshufd      xmm5,xmm6,0
+  movd        xmm6,edx
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      xmm7,xmm1
+  punpckhbw   xmm1,xmm0
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+40h],xmm1
+  movdqa      [esp+60h],xmm7
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+10h],xmm7
+  movdqa      xmm7,xmm3
+  punpcklbw   xmm7,xmm0
+  punpckhbw   xmm3,xmm0
+  movdqa      [esp+50h],xmm7
+  movdqa      xmm7,xmm4
+  punpckhbw   xmm4,xmm0
+  punpckhbw   xmm2,xmm0
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+30h],xmm3
+  movdqa      xmm3,[esp+10h]
+  movdqa      xmm1,xmm3
+  psubw       xmm1,[esp+50h]
+  pabsw       xmm1,xmm1
+  movdqa      [esp+20h],xmm4
+  movdqa      xmm0,xmm5
+  pcmpgtw     xmm0,xmm1
+  movdqa      xmm1,[esp+60h]
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  pand        xmm0,xmm4
+  movdqa      xmm1,xmm7
+  psubw       xmm1,[esp+50h]
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  movdqa      xmm1,xmm2
+  psubw       xmm1,[esp+30h]
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm5,xmm1
+  movdqa      xmm1,[esp+40h]
+  pand        xmm0,xmm4
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  movdqa      xmm1,[esp+20h]
+  psubw       xmm1,[esp+30h]
+  pand        xmm5,xmm4
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm6,xmm1
+  pand        xmm5,xmm6
+  mov         edx,2
+  movsx       edx,dx
+  movd        xmm1,edx
+  movdqa      xmm4,xmm1
+  punpcklwd   xmm4,xmm1
+  pshufd      xmm1,xmm4,0
+  movdqa      xmm4,[esp+60h]
+  movdqa      xmm6,xmm4
+  paddw       xmm6,xmm4
+  paddw       xmm6,xmm3
+  paddw       xmm6,xmm7
+  movdqa      [esp+10h],xmm1
+  paddw       xmm6,[esp+10h]
+  psraw       xmm6,2
+  movdqa      xmm4,xmm0
+  pandn       xmm4,xmm3
+  movdqa      xmm3,[esp+40h]
+  movdqa      xmm1,xmm0
+  pand        xmm1,xmm6
+  por         xmm1,xmm4
+  movdqa      xmm6,xmm3
+  paddw       xmm6,xmm3
+  movdqa      xmm3,[esp+10h]
+  paddw       xmm6,xmm2
+  paddw       xmm6,[esp+20h]
+  paddw       xmm6,xmm3
+  psraw       xmm6,2
+  movdqa      xmm4,xmm5
+  pand        xmm4,xmm6
+  movdqa      xmm6,xmm5
+  pandn       xmm6,xmm2
+  por         xmm4,xmm6
+  packuswb    xmm1,xmm4
+  movdqa      xmm4,[esp+50h]
+  movdqa      xmm6,xmm7
+  paddw       xmm6,xmm7
+  paddw       xmm6,xmm4
+  paddw       xmm6,[esp+60h]
+  paddw       xmm6,xmm3
+  psraw       xmm6,2
+  movdqa      xmm2,xmm0
+  pand        xmm2,xmm6
+  pandn       xmm0,xmm4
+  por         xmm2,xmm0
+  movdqa      xmm0,[esp+20h]
+  movdqa      xmm6,xmm0
+  paddw       xmm6,xmm0
+  movdqa      xmm0,[esp+30h]
+  paddw       xmm6,xmm0
+  paddw       xmm6,[esp+40h]
+  movdqa      xmm4,xmm5
+  paddw       xmm6,xmm3
+  movq        [esi],xmm1
+  psraw       xmm6,2
+  pand        xmm4,xmm6
+  pandn       xmm5,xmm0
+  por         xmm4,xmm5
+  packuswb    xmm2,xmm4
+  movq        [eax],xmm2
+  psrldq      xmm1,8
+  movq        [edi],xmm1
+  pop         edi
+  psrldq      xmm2,8
+  movq        [ecx],xmm2
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+;*******************************************************************************
+; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
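+;  Normal (bS<4) chroma filter across a horizontal edge. pTC supplies one tc
+;  per 2-pixel group; lanes whose tc is not positive are masked off. Active
+;  lanes are adjusted by the clipped delta
+;      delta = Clip3(-tc, tc, (((q0-p0) << 2) + (p1-q1) + 4) >> 3)
+;      p0' = p0 + delta,  q0' = q0 - delta
+;  with the final packuswb providing the saturation to [0,255].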
+
+WELS_EXTERN  DeblockChromaLt4V_sse2
+
+DeblockChromaLt4V_sse2:
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,0E4h
+  push        ebx
+  push        esi
+  mov         esi, [ebp+1Ch]      ;  pTC
+  movsx       ebx, byte [esi+2]
+  push        edi
+  movsx       di,byte [esi+3]
+  mov         word [esp+0Ch],bx
+  movsx       bx,byte  [esi+1]
+  movsx       esi,byte  [esi]
+  mov         word  [esp+0Eh],si
+  movzx       esi,di
+  movd        xmm1,esi
+  movzx       esi,di
+  movd        xmm2,esi
+  mov         si,word  [esp+0Ch]
+  mov         edx, [ebp + 10h]
+  mov         eax, [ebp + 08h]
+  movzx       edi,si
+  movzx       esi,si
+  mov         ecx, [ebp + 0Ch]
+  movd        xmm4,esi
+  movzx       esi,bx
+  movd        xmm5,esi
+  movd        xmm3,edi
+  movzx       esi,bx
+  movd        xmm6,esi
+  mov         si,word [esp+0Eh]
+  movzx       edi,si
+  movzx       esi,si
+  punpcklwd   xmm6,xmm2
+  pxor        xmm0,xmm0
+  movdqa      [esp+40h],xmm0
+  movd        xmm7,edi
+  movd        xmm0,esi
+  lea         esi,[edx+edx]
+  mov         edi,eax
+  sub         edi,esi
+  punpcklwd   xmm5,xmm1
+  movdqa      xmm1,[esp+40h]
+  punpcklwd   xmm0,xmm4
+  movq        xmm4,[edx+ecx]
+  punpcklwd   xmm7,xmm3
+  movq        xmm3,[eax]
+  punpcklwd   xmm0,xmm6
+  movq        xmm6,[edi]
+  punpcklwd   xmm7,xmm5
+  punpcklwd   xmm0,xmm7
+  mov         edi,ecx
+  sub         edi,esi
+  movdqa      xmm2,xmm1
+  psubw       xmm2,xmm0
+  movdqa      [esp+60h],xmm2
+  movq        xmm2, [edi]
+  punpcklqdq  xmm6,xmm2
+  mov         esi,eax
+  sub         esi,edx
+  movq        xmm7,[esi]
+  mov         edi,ecx
+  sub         edi,edx
+  movq        xmm2,[edi]
+  punpcklqdq  xmm7,xmm2
+  movq        xmm2,[ecx]
+  punpcklqdq  xmm3,xmm2
+  movq        xmm2,[edx+eax]
+  movsx       edx,word [ebp + 14h]
+  punpcklqdq  xmm2,xmm4
+  movdqa      [esp+0E0h],xmm2
+  movd        xmm2,edx
+  movsx       edx,word [ebp + 18h]
+  movdqa      xmm4,xmm2
+  punpcklwd   xmm4,xmm2
+  movd        xmm2,edx
+  movdqa      xmm5,xmm2
+  punpcklwd   xmm5,xmm2
+  pshufd      xmm2,xmm5,0
+  movdqa      [esp+50h],xmm2
+  movdqa      xmm2,xmm6
+  punpcklbw   xmm2,xmm1
+  movdqa      [esp+0D0h],xmm3
+  pshufd      xmm4,xmm4,0
+  movdqa      [esp+30h],xmm2
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+80h],xmm6
+  movdqa      xmm6,[esp+0D0h]
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+70h],xmm6
+  movdqa      xmm6, [esp+0E0h]
+  punpckhbw   xmm6,xmm1
+  movdqa     [esp+90h],xmm6
+  movdqa      xmm5, [esp+0E0h]
+  movdqa      xmm2,xmm7
+  punpckhbw   xmm7,xmm1
+  punpcklbw   xmm5,xmm1
+  movdqa       [esp+0A0h],xmm7
+  punpcklbw   xmm3,xmm1
+  mov         edx,4
+  punpcklbw   xmm2,xmm1
+  movsx       edx,dx
+  movd        xmm6,edx
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      xmm7,[esp+30h]
+  movdqa      [esp+20h],xmm6
+  psubw       xmm7,xmm5
+  movdqa      xmm6,xmm0
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1,[esp+60h]
+  movdqa      [esp+40h],xmm6
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm2
+  psllw       xmm6,2
+  paddw       xmm6,xmm7
+  paddw       xmm6, [esp+20h]
+  movdqa      xmm7, [esp+50h]
+  psraw       xmm6,3
+  pmaxsw      xmm1,xmm6
+  movdqa      [esp+10h],xmm0
+  movdqa      xmm6, [esp+10h]
+  pminsw      xmm6,xmm1
+  movdqa      [esp+10h],xmm6
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  movdqa      xmm6,xmm4
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1, [esp+30h]
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm7,xmm1
+  movdqa      xmm1,[esp+50h]
+  pand        xmm6,xmm7
+  movdqa      xmm7,[esp+50h]
+  psubw       xmm5,xmm3
+  pabsw       xmm5,xmm5
+  pcmpgtw     xmm1,xmm5
+  movdqa      xmm5,[esp+80h]
+  psubw       xmm5,[esp+90h]
+  pand        xmm6,xmm1
+  pand        xmm6,[esp+40h]
+  movdqa      xmm1,[esp+10h]
+  pand        xmm1,xmm6
+  movdqa      xmm6,[esp+70h]
+  movdqa      [esp+30h],xmm1
+  movdqa      xmm1,[esp+0A0h]
+  psubw       xmm6,xmm1
+  psllw       xmm6,2
+  paddw       xmm6,xmm5
+  paddw       xmm6,[esp+20h]
+  movdqa      xmm5,[esp+60h]
+  psraw       xmm6,3
+  pmaxsw      xmm5,xmm6
+  pminsw      xmm0,xmm5
+  movdqa      xmm5,[esp+70h]
+  movdqa      xmm6,xmm1
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm4,xmm6
+  movdqa      xmm6,[esp+80h]
+  psubw       xmm6,xmm1
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+90h]
+  pand        xmm4,xmm7
+  movdqa      xmm7,[esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  pand        xmm4,xmm7
+  pand        xmm4,[esp+40h]
+  pand        xmm0,xmm4
+  movdqa      xmm4,[esp+30h]
+  paddw       xmm2,xmm4
+  paddw       xmm1,xmm0
+  packuswb    xmm2,xmm1
+  movq        [esi],xmm2
+  psubw       xmm3,xmm4
+  psubw       xmm5,xmm0
+  packuswb    xmm3,xmm5
+  movq        [eax],xmm3
+  psrldq      xmm2,8
+  movq        [edi],xmm2
+  pop         edi
+  pop         esi
+  psrldq      xmm3,8
+  movq        [ecx],xmm3
+  pop         ebx
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+;***************************************************************************
+;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;          int32_t iAlpha, int32_t iBeta)
+;***************************************************************************
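+;  Vertical-edge variant of the strong chroma filter. The four samples
+;  straddling the edge (p1 p0 q0 q1) are gathered from 16 rows (8 Cb + 8 Cr)
+;  with movd/punpckldq, transposed so each xmm register holds one sample
+;  position across all rows, run through the same arithmetic as
+;  DeblockChromaEq4V_sse2, then transposed back and scattered with movd
+;  stores.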
+
+WELS_EXTERN     DeblockChromaEq4H_sse2
+
+ALIGN  16
+
+DeblockChromaEq4H_sse2:
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,0C8h
+  mov         ecx,dword [ebp+8]
+  mov         edx,dword [ebp+0Ch]
+  mov         eax,dword [ebp+10h]
+  sub         ecx,2
+  sub         edx,2
+  push        esi
+  lea         esi,[eax+eax*2]
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+4],edx
+  lea         ecx,[ecx+eax*4]
+  lea         edx,[edx+eax*4]
+  lea         eax,[esp+7Ch]
+  push        edi
+  mov         dword [esp+14h],esi
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+0Ch],edx
+  mov         dword [esp+10h],eax
+  mov         esi,dword [esp+1Ch]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+14h]
+  movd        xmm0,dword [esi]
+  movd        xmm1,dword [esi+ecx]
+  movd        xmm2,dword [esi+ecx*2]
+  movd        xmm3,dword [esi+edx]
+  mov         esi,dword  [esp+8]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [esi+ecx]
+  movd        xmm6,dword [esi+ecx*2]
+  movd        xmm7,dword [esi+edx]
+  punpckldq   xmm0,xmm4
+  punpckldq   xmm1,xmm5
+  punpckldq   xmm2,xmm6
+  punpckldq   xmm3,xmm7
+  mov         esi,dword [esp+18h]
+  mov         edi,dword [esp+0Ch]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [edi]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm0,xmm4
+  movd        xmm4,dword [esi+ecx]
+  movd        xmm5,dword [edi+ecx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm1,xmm4
+  movd        xmm4,dword [esi+ecx*2]
+  movd        xmm5,dword [edi+ecx*2]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm2,xmm4
+  movd        xmm4,dword [esi+edx]
+  movd        xmm5,dword [edi+edx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm3,xmm4
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         edi,dword [esp+10h]
+  movdqa      [edi],xmm0
+  movdqa      [edi+10h],xmm5
+  movdqa      [edi+20h],xmm1
+  movdqa      [edi+30h],xmm6
+  movsx       ecx,word [ebp+14h]
+  movsx       edx,word [ebp+18h]
+  movdqa      xmm6,[esp+80h]
+  movdqa      xmm4,[esp+90h]
+  movdqa      xmm5,[esp+0A0h]
+  movdqa      xmm7,[esp+0B0h]
+  pxor        xmm0,xmm0
+  movd        xmm1,ecx
+  movdqa      xmm2,xmm1
+  punpcklwd   xmm2,xmm1
+  pshufd      xmm1,xmm2,0
+  movd        xmm2,edx
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm2,xmm3,0
+  movdqa      xmm3,xmm6
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+60h],xmm6
+  movdqa      xmm6,[esp+90h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+30h],xmm6
+  movdqa      xmm6,[esp+0A0h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+40h],xmm6
+  movdqa      xmm6,[esp+0B0h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+70h],xmm6
+  punpcklbw   xmm7,xmm0
+  punpcklbw   xmm4,xmm0
+  punpcklbw   xmm5,xmm0
+  punpcklbw   xmm3,xmm0
+  movdqa      [esp+50h],xmm7
+  movdqa      xmm6,xmm4
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  movdqa      xmm0,xmm1
+  pcmpgtw     xmm0,xmm6
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm4
+  pabsw       xmm6,xmm6
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pand        xmm0,xmm7
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+30h]
+  psubw       xmm6,[esp+40h]
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm1,xmm6
+  movdqa      xmm6,[esp+60h]
+  psubw       xmm6,[esp+30h]
+  pabsw       xmm6,xmm6
+  pand        xmm0,xmm7
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+70h]
+  psubw       xmm6,[esp+40h]
+  pabsw       xmm6,xmm6
+  pand        xmm1,xmm7
+  pcmpgtw     xmm2,xmm6
+  pand        xmm1,xmm2
+  mov         eax,2
+  movsx       ecx,ax
+  movd        xmm2,ecx
+  movdqa      xmm6,xmm2
+  punpcklwd   xmm6,xmm2
+  pshufd      xmm2,xmm6,0
+  movdqa      [esp+20h],xmm2
+  movdqa      xmm2,xmm3
+  paddw       xmm2,xmm3
+  paddw       xmm2,xmm4
+  paddw       xmm2,[esp+50h]
+  paddw       xmm2,[esp+20h]
+  psraw       xmm2,2
+  movdqa      xmm6,xmm0
+  pand        xmm6,xmm2
+  movdqa      xmm2,xmm0
+  pandn       xmm2,xmm4
+  por         xmm6,xmm2
+  movdqa      xmm2,[esp+60h]
+  movdqa      xmm7,xmm2
+  paddw       xmm7,xmm2
+  paddw       xmm7,[esp+30h]
+  paddw       xmm7,[esp+70h]
+  paddw       xmm7,[esp+20h]
+  movdqa      xmm4,xmm1
+  movdqa      xmm2,xmm1
+  pandn       xmm2,[esp+30h]
+  psraw       xmm7,2
+  pand        xmm4,xmm7
+  por         xmm4,xmm2
+  movdqa      xmm2,[esp+50h]
+  packuswb    xmm6,xmm4
+  movdqa      [esp+90h],xmm6
+  movdqa      xmm6,xmm2
+  paddw       xmm6,xmm2
+  movdqa      xmm2,[esp+20h]
+  paddw       xmm6,xmm5
+  paddw       xmm6,xmm3
+  movdqa      xmm4,xmm0
+  pandn       xmm0,xmm5
+  paddw       xmm6,xmm2
+  psraw       xmm6,2
+  pand        xmm4,xmm6
+  por         xmm4,xmm0
+  movdqa      xmm0,[esp+70h]
+  movdqa      xmm5,xmm0
+  paddw       xmm5,xmm0
+  movdqa      xmm0,[esp+40h]
+  paddw       xmm5,xmm0
+  paddw       xmm5,[esp+60h]
+  movdqa      xmm3,xmm1
+  paddw       xmm5,xmm2
+  psraw       xmm5,2
+  pand        xmm3,xmm5
+  pandn       xmm1,xmm0
+  por         xmm3,xmm1
+  packuswb    xmm4,xmm3
+  movdqa      [esp+0A0h],xmm4
+  mov         esi,dword [esp+10h]
+  movdqa      xmm0,[esi]
+  movdqa      xmm1,[esi+10h]
+  movdqa      xmm2,[esi+20h]
+  movdqa      xmm3,[esi+30h]
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         esi,dword [esp+1Ch]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+14h]
+  mov         edi,dword [esp+8]
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         esi,dword [esp+18h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         edi,dword [esp+0Ch]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  pop         edi
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+;*******************************************************************************
+;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
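+;  Vertical-edge variant of the bS<4 chroma filter: gather and transpose the
+;  p1 p0 q0 q1 columns as in DeblockChromaEq4H_sse2, apply the tc-clipped
+;  delta of DeblockChromaLt4V_sse2, then transpose the result back to memory.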
+
+WELS_EXTERN  DeblockChromaLt4H_sse2
+
+ALIGN  16
+
+DeblockChromaLt4H_sse2:
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,108h
+  mov         ecx,dword [ebp+8]
+  mov         edx,dword [ebp+0Ch]
+  mov         eax,dword [ebp+10h]
+  sub         ecx,2
+  sub         edx,2
+  push        esi
+  lea         esi,[eax+eax*2]
+  mov         dword [esp+10h],ecx
+  mov         dword [esp+4],edx
+  lea         ecx,[ecx+eax*4]
+  lea         edx,[edx+eax*4]
+  lea         eax,[esp+6Ch]
+  push        edi
+  mov         dword [esp+0Ch],esi
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+10h],edx
+  mov         dword [esp+1Ch],eax
+  mov         esi,dword [esp+14h]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+0Ch]
+  movd        xmm0,dword [esi]
+  movd        xmm1,dword [esi+ecx]
+  movd        xmm2,dword [esi+ecx*2]
+  movd        xmm3,dword [esi+edx]
+  mov         esi,dword [esp+8]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [esi+ecx]
+  movd        xmm6,dword [esi+ecx*2]
+  movd        xmm7,dword [esi+edx]
+  punpckldq   xmm0,xmm4
+  punpckldq   xmm1,xmm5
+  punpckldq   xmm2,xmm6
+  punpckldq   xmm3,xmm7
+  mov         esi,dword [esp+18h]
+  mov         edi,dword [esp+10h]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [edi]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm0,xmm4
+  movd        xmm4,dword [esi+ecx]
+  movd        xmm5,dword [edi+ecx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm1,xmm4
+  movd        xmm4,dword [esi+ecx*2]
+  movd        xmm5,dword [edi+ecx*2]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm2,xmm4
+  movd        xmm4,dword [esi+edx]
+  movd        xmm5,dword [edi+edx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm3,xmm4
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         edi,dword [esp+1Ch]
+  movdqa      [edi],xmm0
+  movdqa      [edi+10h],xmm5
+  movdqa      [edi+20h],xmm1
+  movdqa      [edi+30h],xmm6
+  mov         eax,dword [ebp+1Ch]
+  movsx       cx,byte [eax+3]
+  movsx       dx,byte [eax+2]
+  movsx       si,byte [eax+1]
+  movsx       ax,byte [eax]
+  movzx       edi,cx
+  movzx       ecx,cx
+  movd        xmm2,ecx
+  movzx       ecx,dx
+  movzx       edx,dx
+  movd        xmm3,ecx
+  movd        xmm4,edx
+  movzx       ecx,si
+  movzx       edx,si
+  movd        xmm5,ecx
+  pxor        xmm0,xmm0
+  movd        xmm6,edx
+  movzx       ecx,ax
+  movdqa      [esp+60h],xmm0
+  movzx       edx,ax
+  movsx       eax,word [ebp+14h]
+  punpcklwd   xmm6,xmm2
+  movd        xmm1,edi
+  movd        xmm7,ecx
+  movsx       ecx,word [ebp+18h]
+  movd        xmm0,edx
+  punpcklwd   xmm7,xmm3
+  punpcklwd   xmm5,xmm1
+  movdqa      xmm1,[esp+60h]
+  punpcklwd   xmm7,xmm5
+  movdqa      xmm5,[esp+0A0h]
+  punpcklwd   xmm0,xmm4
+  punpcklwd   xmm0,xmm6
+  movdqa      xmm6, [esp+70h]
+  punpcklwd   xmm0,xmm7
+  movdqa      xmm7,[esp+80h]
+  movdqa      xmm2,xmm1
+  psubw       xmm2,xmm0
+  movdqa      [esp+0D0h],xmm2
+  movd        xmm2,eax
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm4,xmm3,0
+  movd        xmm2,ecx
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm2,xmm3,0
+  movdqa      xmm3, [esp+90h]
+  movdqa      [esp+50h],xmm2
+  movdqa      xmm2,xmm6
+  punpcklbw   xmm2,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+40h],xmm2
+  movdqa      [esp+0B0h],xmm6
+  movdqa      xmm6,[esp+90h]
+  movdqa      xmm2,xmm7
+  punpckhbw   xmm7,xmm1
+  punpckhbw   xmm6,xmm1
+  punpcklbw   xmm2,xmm1
+  punpcklbw   xmm3,xmm1
+  punpcklbw   xmm5,xmm1
+  movdqa      [esp+0F0h],xmm7
+  movdqa      [esp+0C0h],xmm6
+  movdqa      xmm6, [esp+0A0h]
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+0E0h],xmm6
+  mov         edx,4
+  movsx       eax,dx
+  movd        xmm6,eax
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      [esp+30h],xmm6
+  movdqa      xmm7, [esp+40h]
+  psubw       xmm7,xmm5
+  movdqa      xmm6,xmm0
+  pcmpgtw     xmm6,xmm1
+  movdqa      [esp+60h],xmm6
+  movdqa      xmm1, [esp+0D0h]
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm2
+  psllw       xmm6,2
+  paddw       xmm6,xmm7
+  paddw       xmm6,[esp+30h]
+  psraw       xmm6,3
+  pmaxsw      xmm1,xmm6
+  movdqa      xmm7,[esp+50h]
+  movdqa      [esp+20h],xmm0
+  movdqa      xmm6, [esp+20h]
+  pminsw      xmm6,xmm1
+  movdqa      [esp+20h],xmm6
+  movdqa      xmm6,xmm4
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1, [esp+40h]
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm7,xmm1
+  movdqa      xmm1, [esp+50h]
+  pand        xmm6,xmm7
+  movdqa      xmm7, [esp+50h]
+  psubw       xmm5,xmm3
+  pabsw       xmm5,xmm5
+  pcmpgtw     xmm1,xmm5
+  movdqa      xmm5, [esp+0B0h]
+  psubw       xmm5,[esp+0E0h]
+  pand        xmm6,xmm1
+  pand        xmm6, [esp+60h]
+  movdqa      xmm1, [esp+20h]
+  pand        xmm1,xmm6
+  movdqa      xmm6, [esp+0C0h]
+  movdqa      [esp+40h],xmm1
+  movdqa      xmm1, [esp+0F0h]
+  psubw       xmm6,xmm1
+  psllw       xmm6,2
+  paddw       xmm6,xmm5
+  paddw       xmm6, [esp+30h]
+  movdqa      xmm5, [esp+0D0h]
+  psraw       xmm6,3
+  pmaxsw      xmm5,xmm6
+  pminsw      xmm0,xmm5
+  movdqa      xmm5,[esp+0C0h]
+  movdqa      xmm6,xmm1
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm4,xmm6
+  movdqa      xmm6,[esp+0B0h]
+  psubw       xmm6,xmm1
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6, [esp+0E0h]
+  pand        xmm4,xmm7
+  movdqa      xmm7, [esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  pand        xmm4,xmm7
+  pand        xmm4,[esp+60h]
+  pand        xmm0,xmm4
+  movdqa      xmm4, [esp+40h]
+  paddw       xmm2,xmm4
+  paddw       xmm1,xmm0
+  psubw       xmm3,xmm4
+  psubw       xmm5,xmm0
+  packuswb    xmm2,xmm1
+  packuswb    xmm3,xmm5
+  movdqa      [esp+80h],xmm2
+  movdqa      [esp+90h],xmm3
+  mov         esi,dword [esp+1Ch]
+  movdqa      xmm0, [esi]
+  movdqa      xmm1, [esi+10h]
+  movdqa      xmm2, [esi+20h]
+  movdqa      xmm3, [esi+30h]
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         esi,dword [esp+14h]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+0Ch]
+  mov         edi,dword [esp+8]
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         esi,dword [esp+18h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         edi,dword [esp+10h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  pop         edi
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+
+
+;*******************************************************************************
+;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+;                                 int32_t iBeta, int8_t * pTC)
+;*******************************************************************************
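+;  Normal (bS<4) luma filter across a horizontal edge, 16 columns per call
+;  (processed as two 8-wide halves in 16-bit precision). Active columns get
+;      delta = Clip3(-tc, tc, (((q0-p0) << 2) + (p1-q1) + 4) >> 3)
+;      p0' = p0 + delta,  q0' = q0 - delta
+;  and, when additionally |p2-p0| < iBeta (resp. |q2-q0| < iBeta),
+;      p1' = p1 + Clip3(-tc0, tc0, (p2 + ((p0+q0+1)>>1) - 2*p1) >> 1)
+;  (likewise for q1), with tc = tc0 incremented once for each of the two
+;  side conditions met.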
+
+
+WELS_EXTERN  DeblockLumaLt4V_sse2
+
+ALIGN  16
+
+DeblockLumaLt4V_sse2:
+    push	ebp
+	mov	ebp, esp
+	and	esp, -16				; fffffff0H
+	sub	esp, 420				; 000001a4H
+	mov	eax, dword [ebp+8]
+	mov	ecx, dword [ebp+12]
+
+	pxor	xmm0, xmm0
+	push	ebx
+	mov	edx, dword [ebp+24]
+	movdqa	[esp+424-384], xmm0
+	push	esi
+
+	lea	esi, [ecx+ecx*2]
+	push	edi
+	mov	edi, eax
+	sub	edi, esi
+	movdqa	xmm0, [edi]
+
+	lea	esi, [ecx+ecx]
+	movdqa	[esp+432-208], xmm0
+	mov	edi, eax
+	sub	edi, esi
+	movdqa	xmm0, [edi]
+	movdqa	[esp+448-208], xmm0
+
+	mov	ebx, eax
+	sub	ebx, ecx
+	movdqa	xmm0, [ebx]
+	movdqa	[esp+464-208], xmm0
+
+	movdqa	xmm0, [eax]
+
+	add	ecx, eax
+	movdqa	[esp+480-208], xmm0
+	movdqa	xmm0, [ecx]
+	mov	dword [esp+432-404], ecx
+
+	movsx	ecx, word [ebp+16]
+	movdqa	[esp+496-208], xmm0
+	movdqa	xmm0, [esi+eax]
+
+	movsx	si, byte [edx]
+	movdqa	[esp+512-208], xmm0
+	movd	xmm0, ecx
+	movsx	ecx, word [ebp+20]
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	pshufd	xmm0, xmm1, 0
+	movdqa	[esp+432-112], xmm0
+	movd	xmm0, ecx
+	movsx	cx, byte [edx+1]
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	mov	dword [esp+432-408], ebx
+	movzx	ebx, cx
+	pshufd	xmm0, xmm1, 0
+	movd	xmm1, ebx
+	movzx	ebx, cx
+	movd	xmm2, ebx
+	movzx	ebx, cx
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, si
+	movd	xmm5, ecx
+	movzx	ecx, si
+	movd	xmm6, ecx
+	movzx	ecx, si
+	movd	xmm7, ecx
+	movzx	ecx, si
+	movdqa	[esp+432-336], xmm0
+	movd	xmm0, ecx
+
+	movsx	cx, byte [edx+3]
+	movsx	dx, byte [edx+2]
+	movd	xmm3, ebx
+	punpcklwd xmm0, xmm4
+	movzx	esi, cx
+	punpcklwd xmm6, xmm2
+	punpcklwd xmm5, xmm1
+	punpcklwd xmm0, xmm6
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	punpcklwd xmm0, xmm7
+	movdqa	[esp+432-400], xmm0
+	movd	xmm0, esi
+	movzx	esi, cx
+	movd	xmm2, esi
+	movzx	esi, cx
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, dx
+	movd	xmm3, esi
+	movd	xmm5, ecx
+	punpcklwd xmm5, xmm0
+
+	movdqa	xmm0, [esp+432-384]
+	movzx	ecx, dx
+	movd	xmm6, ecx
+	movzx	ecx, dx
+	movzx	edx, dx
+	punpcklwd xmm6, xmm2
+	movd	xmm7, ecx
+	movd	xmm1, edx
+
+	movdqa	xmm2, [esp+448-208]
+	punpcklbw xmm2, xmm0
+
+	mov	ecx, 4
+	movsx	edx, cx
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	movdqa	xmm5, [esp+496-208]
+	movdqa	xmm3, [esp+464-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-240], xmm5
+	movdqa	xmm5, [esp+512-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-352], xmm5
+	punpcklwd xmm1, xmm4
+	movdqa	xmm4, [esp+432-208]
+	punpcklwd xmm1, xmm6
+	movdqa	xmm6, [esp+480-208]
+	punpcklwd xmm1, xmm7
+	punpcklbw xmm6, xmm0
+	punpcklbw xmm3, xmm0
+	punpcklbw xmm4, xmm0
+	movdqa	xmm7, xmm3
+	psubw	xmm7, xmm4
+	pabsw	xmm7, xmm7
+	movdqa	[esp+432-272], xmm4
+	movdqa	xmm4, [esp+432-336]
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-352]
+	pabsw	xmm7, xmm7
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5
+	movdqa	xmm5, xmm3
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5
+	movdqa	xmm5, [esp+432-400]
+	psubw	xmm5, [esp+432-288]
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5
+	movdqa	xmm5, xmm6
+	psubw	xmm5, xmm3
+	movdqa	[esp+432-32], xmm6
+	psubw	xmm6, [esp+432-240]
+	movdqa	xmm7, xmm5
+	movdqa	[esp+432-384], xmm5
+	movdqa	xmm5, [esp+432-112]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm5, xmm7
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6
+
+	pand	xmm5, xmm7
+	movdqa	xmm6, xmm3
+	psubw	xmm6, xmm2
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-400]
+	pand	xmm5, xmm7
+	movdqa	xmm7, xmm6
+	pcmpeqw	xmm6, xmm0
+	pcmpgtw	xmm7, xmm0
+	por	xmm7, xmm6
+	pand	xmm5, xmm7
+	movdqa	[esp+432-320], xmm5
+	movd	xmm5, edx
+	movdqa	xmm6, xmm5
+	punpcklwd xmm6, xmm5
+	pshufd	xmm5, xmm6, 0
+	movdqa	[esp+432-336], xmm5
+	movdqa	xmm5, [esp+432-224]
+	movdqa	[esp+432-368], xmm5
+	movdqa	xmm6, xmm0
+	psubw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-384]
+	psllw	xmm5, 2
+	movdqa	xmm7, xmm2
+	psubw	xmm7, [esp+432-240]
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+432-336]
+	movdqa	xmm5, [esp+432-368]
+	psraw	xmm7, 3
+	pmaxsw	xmm6, xmm7
+	pminsw	xmm5, xmm6
+
+	pand	xmm5, [esp+432-320]
+	movdqa	xmm6, [esp+432-400]
+	movdqa	[esp+432-64], xmm5
+	movdqa	[esp+432-384], xmm6
+	movdqa	xmm5, xmm0
+	psubw	xmm5, xmm6
+	movdqa	[esp+432-368], xmm5
+	movdqa	xmm6, xmm5
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm5, [esp+432-304]
+	movdqa	xmm7, xmm2
+	paddw	xmm7, xmm2
+	psubw	xmm5, xmm7
+	psraw	xmm5, 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-384]
+	pminsw	xmm5, xmm6
+
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-288]
+	movdqa	xmm6, [esp+432-240]
+	movdqa	[esp+432-96], xmm5
+	movdqa	xmm5, [esp+432-352]
+	paddw	xmm5, [esp+432-304]
+	movdqa	xmm7, xmm6
+	paddw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-368]
+	psubw	xmm5, xmm7
+
+	movdqa	xmm7, [esp+496-208]
+	psraw	xmm5, 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-400]
+	pminsw	xmm5, xmm6
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-256]
+	movdqa	xmm6, [esp+448-208]
+	punpckhbw xmm7, xmm0
+	movdqa	[esp+432-352], xmm7
+
+	movdqa	xmm7, [esp+512-208]
+	punpckhbw xmm6, xmm0
+	movdqa	[esp+432-48], xmm5
+	movdqa	xmm5, [esp+432-208]
+	movdqa	[esp+432-368], xmm6
+	movdqa	xmm6, [esp+464-208]
+	punpckhbw xmm7, xmm0
+	punpckhbw xmm5, xmm0
+	movdqa	[esp+432-384], xmm7
+	punpckhbw xmm6, xmm0
+	movdqa	[esp+432-400], xmm6
+
+	movdqa	xmm7, [esp+432-400]
+	movdqa	xmm6, [esp+480-208]
+	psubw	xmm7, xmm5
+	movdqa	[esp+432-16], xmm5
+	pabsw	xmm7, xmm7
+	punpckhbw xmm6, xmm0
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5
+
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-384]
+	pabsw	xmm7, xmm7
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5
+
+	movdqa	xmm5, [esp+432-400]
+	movdqa	[esp+432-80], xmm6
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5
+
+	movdqa	xmm5, xmm1
+	psubw	xmm5, [esp+432-288]
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5
+	movdqa	xmm5, xmm6
+	psubw	xmm5, [esp+432-400]
+	psubw	xmm6, [esp+432-352]
+	movdqa	[esp+432-272], xmm5
+	movdqa	xmm7, xmm5
+	movdqa	xmm5, [esp+432-112]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm5, xmm7
+	movdqa	xmm7, xmm4
+	pabsw	xmm6, xmm6
+	pcmpgtw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-368]
+
+	pand	xmm5, xmm7
+	movdqa	xmm7, [esp+432-400]
+	psubw	xmm7, xmm6
+	psubw	xmm6, [esp+432-352]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7
+	pand	xmm5, xmm4
+
+	paddw	xmm2, [esp+432-96]
+	movdqa	xmm4, xmm1
+	pcmpgtw	xmm4, xmm0
+	movdqa	xmm7, xmm1
+	pcmpeqw	xmm7, xmm0
+	por	xmm4, xmm7
+	pand	xmm5, xmm4
+	movdqa	xmm4, [esp+432-224]
+	movdqa	[esp+432-320], xmm5
+	movdqa	xmm5, [esp+432-272]
+	movdqa	xmm7, xmm0
+	psubw	xmm7, xmm4
+	psubw	xmm0, xmm1
+	psllw	xmm5, 2
+	paddw	xmm6, xmm5
+	paddw	xmm6, [esp+432-336]
+	movdqa	xmm5, [esp+432-368]
+	movdqa	[esp+432-336], xmm0
+	psraw	xmm6, 3
+	pmaxsw	xmm7, xmm6
+	pminsw	xmm4, xmm7
+	pand	xmm4, [esp+432-320]
+	movdqa	xmm6, xmm0
+	movdqa	xmm0, [esp+432-16]
+	paddw	xmm0, [esp+432-304]
+	movdqa	[esp+432-272], xmm4
+	movdqa	xmm4, [esp+432-368]
+	paddw	xmm4, xmm4
+	psubw	xmm0, xmm4
+
+	movdqa	xmm4, [esp+432-64]
+	psraw	xmm0, 1
+	pmaxsw	xmm6, xmm0
+	movdqa	xmm0, [esp+432-400]
+	movdqa	xmm7, xmm1
+	pminsw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-320]
+	pand	xmm7, xmm6
+	pand	xmm7, [esp+432-288]
+	paddw	xmm5, xmm7
+	packuswb xmm2, xmm5
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm0, xmm5
+	paddw	xmm3, xmm4
+	packuswb xmm3, xmm0
+
+	movdqa	xmm0, [esp+432-32]
+	psubw	xmm0, xmm4
+	movdqa	xmm4, [esp+432-80]
+	psubw	xmm4, xmm5
+
+	movdqa	xmm5, [esp+432-240]
+	paddw	xmm5, [esp+432-48]
+	packuswb xmm0, xmm4
+	movdqa	xmm4, [esp+432-384]
+	paddw	xmm4, [esp+432-304]
+	movdqa	[esp+480-208], xmm0
+	movdqa	xmm0, [esp+432-352]
+	movdqa	xmm7, xmm0
+	paddw	xmm0, xmm0
+
+	mov	ecx, dword [esp+432-408]
+
+	mov	edx, dword [esp+432-404]
+	psubw	xmm4, xmm0
+	movdqa	xmm0, [esp+432-336]
+	movdqa	[edi], xmm2
+	psraw	xmm4, 1
+	pmaxsw	xmm0, xmm4
+	pminsw	xmm1, xmm0
+	movdqa	xmm0, [esp+480-208]
+
+	pop	edi
+	pand	xmm1, xmm6
+	pand	xmm1, [esp+428-256]
+	movdqa	[ecx], xmm3
+	paddw	xmm7, xmm1
+	pop	esi
+	packuswb xmm5, xmm7
+	movdqa	[eax], xmm0
+	movdqa	[edx], xmm5
+	pop	ebx
+	mov	esp, ebp
+	pop	ebp
+	ret
+
+
+;*******************************************************************************
+;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+;                                 int32_t iBeta)
+;*******************************************************************************
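+; For reference, a scalar sketch of the standard H.264 strong (bS==4) luma
+; filter that this routine vectorizes across 16 columns at once (these are
+; the usual spec equations, not extracted line-by-line from the code below):
+;   if (|p0-q0| < (iAlpha>>2)+2 && |p2-p0| < iBeta) {
+;       p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3;
+;       p1' = (p2 + p1 + p0 + q0 + 2) >> 2;
+;       p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3;
+;   } else {
+;       p0' = (2*p1 + p0 + q1 + 2) >> 2;
+;   }
+;   (the q side is filtered symmetrically)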
+
+WELS_EXTERN  DeblockLumaEq4V_sse2
+
+ALIGN  16
+
+DeblockLumaEq4V_sse2:
+
+	push	ebp
+	mov	ebp, esp
+	and	esp, -16				; fffffff0H
+	sub	esp, 628				; 00000274H
+	mov	eax, dword [ebp+8]
+	mov	ecx, dword [ebp+12]
+	push	ebx
+	push	esi
+
+	lea	edx, [ecx*4]
+	pxor	xmm0, xmm0
+	movdqa	xmm2, xmm0
+
+	movdqa	xmm0, [ecx+eax]
+	mov	esi, eax
+	sub	esi, edx
+	movdqa	xmm3, [esi]
+	movdqa	xmm5, [eax]
+	push	edi
+	lea	edi, [ecx+ecx]
+	lea	ebx, [ecx+ecx*2]
+	mov	dword [esp+640-600], edi
+	mov	esi, eax
+	sub	esi, edi
+	movdqa	xmm1, [esi]
+	movdqa	 [esp+720-272], xmm0
+	mov	edi, eax
+	sub	edi, ecx
+	movdqa	xmm4, [edi]
+	add	ecx, eax
+	mov	dword [esp+640-596], ecx
+
+	mov	ecx, dword [esp+640-600]
+	movdqa	xmm0, [ecx+eax]
+	movdqa	 [esp+736-272], xmm0
+
+	movdqa	xmm0, [eax+ebx]
+	mov	edx, eax
+	sub	edx, ebx
+
+	movsx	ebx, word [ebp+16]
+	movdqa	xmm6, [edx]
+	add	ecx, eax
+	movdqa	 [esp+752-272], xmm0
+	movd	xmm0, ebx
+
+	movsx	ebx, word [ebp+20]
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0
+	movdqa	 [esp+640-320], xmm0
+	movd	xmm0, ebx
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0
+
+	movdqa	xmm7, [esp+736-272]
+	punpcklbw xmm7, xmm2
+	movdqa	 [esp+640-416], xmm7
+	movdqa	 [esp+640-512], xmm0
+	movdqa	xmm0, xmm1
+	movdqa	 [esp+672-272], xmm1
+	movdqa	xmm1, xmm4
+	movdqa	 [esp+704-272], xmm5
+	punpcklbw xmm5, xmm2
+	punpcklbw xmm1, xmm2
+
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm1
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-560], xmm7
+	punpcklbw xmm0, xmm2
+	movdqa	 [esp+688-272], xmm4
+	movdqa	xmm4, [esp+720-272]
+	movdqa	 [esp+640-480], xmm0
+
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm0
+
+	movdqa	xmm0, [esp+640-512]
+	pabsw	xmm7, xmm7
+	punpcklbw xmm4, xmm2
+	pcmpgtw	xmm0, xmm7
+	movdqa	 [esp+640-384], xmm4
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm4
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+656-272], xmm6
+	punpcklbw xmm6, xmm2
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-48], xmm2
+	movdqa	 [esp+640-368], xmm6
+	movdqa	 [esp+640-144], xmm1
+	movdqa	 [esp+640-400], xmm5
+	pcmpgtw	xmm4, xmm7
+	pand	xmm0, xmm4
+	movdqa	xmm4, [esp+640-320]
+	pcmpgtw	xmm4, [esp+640-560]
+	pand	xmm0, xmm4
+
+	mov	ebx, 2
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, [esp+640-320]
+	psraw	xmm4, 2
+	pshufd	xmm7, xmm7, 0
+	paddw	xmm4, xmm7
+	movdqa	 [esp+640-576], xmm4
+	pcmpgtw	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4
+
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+640-624], xmm7
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm6
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-544], xmm4
+	movdqa	xmm4, [esp+640-512]
+	movdqa	xmm7, xmm5
+	psubw	xmm7, [esp+640-416]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4
+
+	movdqa	xmm4, [esp+640-544]
+	pandn	xmm4, xmm6
+	movdqa	 [esp+640-16], xmm4
+	mov	ebx, 4
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, xmm3
+	punpcklbw xmm4, xmm2
+	psllw	xmm4, 1
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6
+	paddw	xmm4, [esp+640-480]
+
+	movdqa	xmm6, [esp+640-560]
+	pshufd	xmm7, xmm7, 0
+	paddw	xmm4, xmm1
+	movdqa	 [esp+640-592], xmm7
+	paddw	xmm4, xmm5
+	paddw	xmm4, xmm7
+	movdqa	xmm7, [esp+640-416]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-80], xmm6
+	movdqa	xmm6, [esp+752-272]
+	punpcklbw xmm6, xmm2
+	psllw	xmm6, 1
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-384]
+
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm6, xmm5
+	paddw	xmm6, xmm1
+	paddw	xmm6, [esp+640-592]
+	psraw	xmm6, 3
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-112], xmm6
+	movdqa	xmm6, [esp+640-544]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-336], xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-528], xmm6
+	movdqa	xmm6, [esp+640-368]
+	paddw	xmm6, xmm7
+	movdqa	xmm7, xmm1
+	psraw	xmm4, 3
+	pand	xmm4, [esp+640-544]
+	paddw	xmm7, xmm5
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+
+	paddw	xmm5, xmm1
+	psraw	xmm6, 2
+	pand	xmm7, xmm6
+
+	movdqa	xmm6, [esp+640-384]
+	movdqa	 [esp+640-64], xmm7
+	movdqa	xmm7, [esp+640-560]
+	pandn	xmm7, xmm6
+	movdqa	 [esp+640-304], xmm7
+	movdqa	xmm7, [esp+640-560]
+	movdqa	 [esp+640-528], xmm7
+	movdqa	xmm7, [esp+640-416]
+	paddw	xmm7, xmm6
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2
+	pand	xmm5, xmm7
+	movdqa	 [esp+640-32], xmm5
+
+	movdqa	xmm5, [esp+640-544]
+	movdqa	 [esp+640-528], xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	xmm7, xmm5
+	paddw	xmm7, xmm5
+	movdqa	xmm5, xmm1
+	paddw	xmm5, xmm6
+	paddw	xmm6, [esp+640-592]
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2
+	pandn	xmm5, xmm7
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm7, xmm1
+	paddw	xmm7, [esp+640-400]
+	movdqa	xmm1, [esp+640-544]
+	movdqa	 [esp+640-352], xmm5
+	movdqa	xmm5, [esp+640-368]
+	psllw	xmm7, 1
+	paddw	xmm7, xmm6
+	paddw	xmm5, xmm7
+
+	movdqa	xmm7, [esp+640-400]
+	psraw	xmm5, 3
+	pand	xmm1, xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	 [esp+640-96], xmm1
+	movdqa	xmm1, [esp+640-560]
+	movdqa	 [esp+640-528], xmm1
+	movdqa	xmm1, [esp+640-384]
+	movdqa	xmm6, xmm1
+	paddw	xmm6, xmm1
+	paddw	xmm1, [esp+640-400]
+	paddw	xmm1, [esp+640-144]
+	paddw	xmm7, xmm5
+	paddw	xmm5, [esp+640-592]
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+	psraw	xmm6, 2
+	psllw	xmm1, 1
+	paddw	xmm1, xmm5
+
+	movdqa	xmm5, [esp+656-272]
+	pandn	xmm7, xmm6
+	movdqa	xmm6, [esp+640-416]
+	paddw	xmm6, xmm1
+	movdqa	xmm1, [esp+640-560]
+	psraw	xmm6, 3
+	pand	xmm1, xmm6
+
+	movdqa	xmm6, [esp+704-272]
+	movdqa	 [esp+640-128], xmm1
+	movdqa	xmm1, [esp+672-272]
+	punpckhbw xmm1, xmm2
+	movdqa	 [esp+640-448], xmm1
+	movdqa	xmm1, [esp+688-272]
+	punpckhbw xmm1, xmm2
+	punpckhbw xmm6, xmm2
+	movdqa	 [esp+640-288], xmm7
+	punpckhbw xmm5, xmm2
+	movdqa	 [esp+640-496], xmm1
+	movdqa	 [esp+640-432], xmm6
+
+	movdqa	xmm7, [esp+720-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-464], xmm7
+
+	movdqa	xmm7, [esp+736-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-528], xmm7
+
+	movdqa	xmm7, xmm6
+
+	psubw	xmm6, [esp+640-464]
+	psubw	xmm7, xmm1
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-560], xmm7
+	por	xmm4, [esp+640-16]
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm1
+	psubw	xmm7, [esp+640-448]
+
+	movdqa	xmm1, [esp+640-512]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm1, xmm7
+	movdqa	xmm7, [esp+640-512]
+	pcmpgtw	xmm7, xmm6
+	movdqa	xmm6, [esp+640-320]
+	pand	xmm1, xmm7
+	movdqa	xmm7, [esp+640-560]
+	pcmpgtw	xmm6, xmm7
+	pand	xmm1, xmm6
+
+	movdqa	xmm6, [esp+640-576]
+	pcmpgtw	xmm6, xmm7
+
+	movdqa	xmm7, [esp+640-496]
+	punpckhbw xmm3, xmm2
+	movdqa	 [esp+640-560], xmm6
+	movdqa	xmm6, [esp+640-512]
+	psubw	xmm7, xmm5
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm6, xmm7
+
+	pand	xmm6, [esp+640-560]
+	movdqa	xmm7, [esp+640-432]
+	psubw	xmm7, [esp+640-528]
+
+	psllw	xmm3, 1
+	movdqa	 [esp+640-544], xmm6
+	movdqa	xmm6, [esp+640-512]
+
+	movdqa	xmm2, [esp+640-544]
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, [esp+640-448]
+	paddw	xmm3, [esp+640-496]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm6, xmm7
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-560], xmm6
+
+	movdqa	xmm6, xmm0
+	pand	xmm6, xmm4
+	movdqa	xmm4, xmm0
+	pandn	xmm4, [esp+640-368]
+	por	xmm6, xmm4
+	movdqa	xmm4, [esp+640-432]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-592]
+	psraw	xmm3, 3
+	pand	xmm3, xmm2
+	pandn	xmm2, xmm5
+	por	xmm3, xmm2
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm3
+	movdqa	xmm3, [esp+640-64]
+	por	xmm3, [esp+640-336]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm5
+	por	xmm7, xmm2
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-480]
+	por	xmm2, xmm3
+	packuswb xmm6, xmm7
+	movdqa	 [esp+640-336], xmm2
+	movdqa	 [esp+656-272], xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	xmm2, xmm5
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm3, xmm1
+	movdqa	xmm7, [esp+640-496]
+	paddw	xmm7, xmm4
+	paddw	xmm2, xmm7
+	paddw	xmm2, [esp+640-624]
+	movdqa	xmm7, [esp+640-544]
+	psraw	xmm2, 2
+	pand	xmm6, xmm2
+	movdqa	xmm2, [esp+640-448]
+	pandn	xmm7, xmm2
+	por	xmm6, xmm7
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm1
+	pandn	xmm6, xmm2
+	paddw	xmm2, [esp+640-496]
+	paddw	xmm2, xmm4
+	por	xmm3, xmm6
+	movdqa	xmm6, [esp+640-336]
+	packuswb xmm6, xmm3
+	psllw	xmm2, 1
+	movdqa	 [esp+672-272], xmm6
+	movdqa	xmm6, [esp+640-96]
+	por	xmm6, [esp+640-352]
+
+	movdqa	xmm3, xmm0
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm0
+	pandn	xmm6, [esp+640-144]
+	por	xmm3, xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-352], xmm3
+	movdqa	xmm3, [esp+640-464]
+	paddw	xmm3, [esp+640-592]
+	paddw	xmm2, xmm3
+	movdqa	xmm3, [esp+640-448]
+	paddw	xmm5, xmm2
+	movdqa	xmm2, [esp+640-496]
+	psraw	xmm5, 3
+	pand	xmm6, xmm5
+	movdqa	xmm5, [esp+640-464]
+	paddw	xmm2, xmm5
+	paddw	xmm5, [esp+640-432]
+	movdqa	xmm4, xmm3
+	paddw	xmm4, xmm3
+	paddw	xmm4, xmm2
+	paddw	xmm4, [esp+640-624]
+	movdqa	xmm2, [esp+640-544]
+	paddw	xmm3, [esp+640-592]
+	psraw	xmm4, 2
+	pandn	xmm2, xmm4
+	por	xmm6, xmm2
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-496]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm6
+	por	xmm7, xmm2
+	movdqa	xmm2, [esp+640-352]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+688-272], xmm2
+	movdqa	xmm2, [esp+640-128]
+	por	xmm2, [esp+640-288]
+
+	movdqa	xmm4, xmm0
+	pand	xmm4, xmm2
+	paddw	xmm5, xmm6
+	movdqa	xmm2, xmm0
+	pandn	xmm2, [esp+640-400]
+	por	xmm4, xmm2
+	movdqa	xmm2, [esp+640-528]
+	psllw	xmm5, 1
+	paddw	xmm5, xmm3
+	movdqa	xmm3, [esp+640-560]
+	paddw	xmm2, xmm5
+	psraw	xmm2, 3
+	movdqa	 [esp+640-288], xmm4
+	movdqa	xmm4, [esp+640-560]
+	pand	xmm4, xmm2
+	movdqa	xmm2, [esp+640-464]
+	movdqa	xmm5, xmm2
+	paddw	xmm5, xmm2
+	movdqa	xmm2, [esp+640-432]
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm7, xmm1
+	paddw	xmm5, xmm2
+	paddw	xmm5, [esp+640-624]
+	movdqa	xmm6, [esp+640-560]
+	psraw	xmm5, 2
+	pandn	xmm3, xmm5
+	por	xmm4, xmm3
+	movdqa	xmm3, [esp+640-32]
+	por	xmm3, [esp+640-304]
+	pand	xmm7, xmm4
+	movdqa	xmm4, [esp+640-432]
+	movdqa	xmm5, [esp+640-464]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm4
+	paddw	xmm4, [esp+640-496]
+	por	xmm7, xmm2
+	movdqa	xmm2, [esp+640-288]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+704-272], xmm2
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-384]
+	por	xmm2, xmm3
+	movdqa	 [esp+640-304], xmm2
+	movdqa	xmm2, [esp+640-528]
+	movdqa	xmm3, xmm2
+	paddw	xmm3, [esp+640-464]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-624]
+	psraw	xmm3, 2
+	pand	xmm6, xmm3
+	movdqa	xmm3, [esp+640-560]
+	movdqa	xmm4, xmm3
+	pandn	xmm4, xmm5
+	por	xmm6, xmm4
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-304]
+	movdqa	xmm4, xmm1
+	pandn	xmm4, xmm5
+	por	xmm7, xmm4
+
+	movdqa	xmm4, xmm0
+	pandn	xmm0, [esp+640-416]
+	packuswb xmm6, xmm7
+	movdqa	xmm7, [esp+640-112]
+	por	xmm7, [esp+640-80]
+	pand	xmm4, xmm7
+	por	xmm4, xmm0
+	movdqa	xmm0, [esp+752-272]
+	punpckhbw xmm0, [esp+640-48]
+	psllw	xmm0, 1
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm5
+	paddw	xmm0, [esp+640-432]
+	paddw	xmm0, [esp+640-496]
+	paddw	xmm0, [esp+640-592]
+	psraw	xmm0, 3
+	pand	xmm0, xmm3
+	movdqa	xmm7, xmm1
+	pandn	xmm3, xmm2
+	por	xmm0, xmm3
+	pand	xmm7, xmm0
+
+	movdqa	xmm0, [esp+656-272]
+	movdqa	 [edx], xmm0
+
+	movdqa	xmm0, [esp+672-272]
+
+	mov	edx, dword [esp+640-596]
+	movdqa	 [esi], xmm0
+	movdqa	xmm0, [esp+688-272]
+	movdqa	 [edi], xmm0
+	movdqa	xmm0, [esp+704-272]
+
+	pop	edi
+	pandn	xmm1, xmm2
+	movdqa	 [eax], xmm0
+	por	xmm7, xmm1
+	pop	esi
+	packuswb xmm4, xmm7
+	movdqa	 [edx], xmm6
+	movdqa	 [ecx], xmm4
+	pop	ebx
+	mov	esp, ebp
+	pop	ebp
+	ret
+
+
+;********************************************************************************
+;
+;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
+;
+;********************************************************************************
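+; Scalar sketch of the intended data movement (assuming pPixY addresses the
+; first of the 8 columns to gather; the SIMD below does this with paired
+; movq/punpcklqdq loads and an 8x8 transpose macro):
+;   for (y = 0; y < 16; y++)
+;       for (x = 0; x < 8; x++)
+;           pDst[x*16 + y] = pPixY[y*iStride + x];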
+
+WELS_EXTERN  DeblockLumaTransposeH2V_sse2
+
+ALIGN  16
+
+DeblockLumaTransposeH2V_sse2:
+    push    ebp
+    push    ebx
+    mov     ebp,   esp
+    and     esp,0FFFFFFF0h
+    sub     esp,   10h
+
+    mov     eax,   [ebp + 0Ch]
+    mov     ecx,   [ebp + 10h]
+    lea     edx,   [eax + ecx * 8]
+    lea     ebx,   [ecx*3]
+
+    movq    xmm0,  [eax]
+    movq    xmm7,  [edx]
+    punpcklqdq   xmm0,  xmm7
+    movq    xmm1,  [eax + ecx]
+    movq    xmm7,  [edx + ecx]
+    punpcklqdq   xmm1,  xmm7
+    movq    xmm2,  [eax + ecx*2]
+    movq    xmm7,  [edx + ecx*2]
+    punpcklqdq   xmm2,  xmm7
+    movq    xmm3,  [eax + ebx]
+    movq    xmm7,  [edx + ebx]
+    punpcklqdq   xmm3,  xmm7
+
+    lea     eax,   [eax + ecx * 4]
+    lea     edx,   [edx + ecx * 4]
+    movq    xmm4,  [eax]
+    movq    xmm7,  [edx]
+    punpcklqdq   xmm4,  xmm7
+    movq    xmm5,  [eax + ecx]
+    movq    xmm7,  [edx + ecx]
+    punpcklqdq   xmm5,  xmm7
+    movq    xmm6,  [eax + ecx*2]
+    movq    xmm7,  [edx + ecx*2]
+    punpcklqdq   xmm6,  xmm7
+
+    movdqa  [esp],   xmm0
+    movq    xmm7,  [eax + ebx]
+    movq    xmm0,  [edx + ebx]
+    punpcklqdq   xmm7,  xmm0
+    movdqa  xmm0,   [esp]
+
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+
+    mov    eax,   [ebp + 14h]
+    movdqa  [eax],    xmm4
+    movdqa  [eax + 10h],  xmm2
+    movdqa  [eax + 20h],  xmm3
+    movdqa  [eax + 30h],  xmm7
+    movdqa  [eax + 40h],  xmm5
+    movdqa  [eax + 50h],  xmm1
+    movdqa  [eax + 60h],  xmm6
+    movdqa  [eax + 70h],  xmm0
+
+    mov     esp,   ebp
+    pop     ebx
+    pop     ebp
+    ret
+
+
+
+;*******************************************************************************************
+;
+;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
+;
+;*******************************************************************************************
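+; The inverse of the H2V gather above, as a scalar sketch (same assumption
+; that pPixY addresses the first of the 8 destination columns):
+;   for (y = 0; y < 16; y++)
+;       for (x = 0; x < 8; x++)
+;           pPixY[y*iStride + x] = pSrc[x*16 + y];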
+
+WELS_EXTERN   DeblockLumaTransposeV2H_sse2
+
+ALIGN  16
+
+DeblockLumaTransposeV2H_sse2:
+    push     ebp
+    mov      ebp,   esp
+
+    and     esp,  0FFFFFFF0h
+    sub     esp,   10h
+
+    mov      eax,   [ebp + 10h]
+    mov      ecx,   [ebp + 0Ch]
+    mov      edx,   [ebp + 08h]
+
+    movdqa   xmm0,  [eax]
+    movdqa   xmm1,  [eax + 10h]
+    movdqa   xmm2,  [eax + 20h]
+    movdqa   xmm3,	[eax + 30h]
+    movdqa   xmm4,	[eax + 40h]
+    movdqa   xmm5,	[eax + 50h]
+    movdqa   xmm6,	[eax + 60h]
+    movdqa   xmm7,	[eax + 70h]
+
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+
+    lea      eax,   [ecx * 3]
+
+    movq     [edx],  xmm4
+    movq     [edx + ecx],  xmm2
+    movq     [edx + ecx*2],  xmm3
+    movq     [edx + eax],  xmm7
+
+    lea      edx,   [edx + ecx*4]
+    movq     [edx],  xmm5
+    movq     [edx + ecx],  xmm1
+    movq     [edx + ecx*2],  xmm6
+    movq     [edx + eax],  xmm0
+
+    psrldq    xmm4,   8
+    psrldq    xmm2,   8
+    psrldq    xmm3,   8
+    psrldq    xmm7,   8
+    psrldq    xmm5,   8
+    psrldq    xmm1,   8
+    psrldq    xmm6,   8
+    psrldq    xmm0,   8
+
+    lea       edx,  [edx + ecx*4]
+    movq     [edx],  xmm4
+    movq     [edx + ecx],  xmm2
+    movq     [edx + ecx*2],  xmm3
+    movq     [edx + eax],  xmm7
+
+    lea      edx,   [edx + ecx*4]
+    movq     [edx],  xmm5
+    movq     [edx + ecx],  xmm1
+    movq     [edx + ecx*2],  xmm6
+    movq     [edx + eax],  xmm0
+
+
+    mov      esp,   ebp
+    pop      ebp
     ret
\ No newline at end of file
--- a/codec/decoder/core/asm/expand_picture.asm
+++ b/codec/decoder/core/asm/expand_picture.asm
@@ -155,11 +155,11 @@
 	lea %1, [%1+%2]
 %endmacro
 
-%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]		
+%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]
 	; ebx [width/16(8)]
 	; esi [pSrc+0], edi [pSrc-1], ecx [-stride], 32(16)		; top
 	; eax [pSrc+(h-1)*stride], ebp [pSrc+(h+31)*stride], 32(16)	; bottom
-		
+
 %if %1 == 32		; for luma
 	sar ebx, 04h 	; width / 16(8) pixels
 .top_bottom_loops:
@@ -173,7 +173,7 @@
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_end16x4_sse2 edi, ecx, xmm0, a
-	
+
 	; bottom
 	movdqa xmm1, [eax] 		; last line of picture pData
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
@@ -184,15 +184,15 @@
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-		
+
 	lea esi, [esi+16]		; top pSrc
 	lea edi, [edi+16]		; top dst
 	lea eax, [eax+16]		; bottom pSrc
 	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?	
-	
+	neg ecx 			; positive/negative stride needed for the next loop?
+
 	dec ebx
-	jnz near .top_bottom_loops		
+	jnz near .top_bottom_loops
 %elif %1 == 16	; for chroma ??
 	mov edx, ebx
 	sar ebx, 04h 	; (width / 16) pixels
@@ -202,21 +202,21 @@
 	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_end16x4_sse2 edi, ecx, xmm0, a	
-	
+	mov_line_end16x4_sse2 edi, ecx, xmm0, a
+
 	; bottom
 	movdqa xmm1, [eax] 		; last line of picture pData
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_end16x4_sse2 ebp, ecx, xmm1, a	
-		
+	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
+
 	lea esi, [esi+16]		; top pSrc
 	lea edi, [edi+16]		; top dst
 	lea eax, [eax+16]		; bottom pSrc
 	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?	
-	
+	neg ecx 			; positive/negative stride needed for the next loop?
+
 	dec ebx
 	jnz near .top_bottom_loops
 
@@ -243,13 +243,13 @@
 %endif
 %endmacro
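+; Scalar sketch of the left/right expansion performed below: the edge pixel
+; of each row is replicated across the whole border (names here are
+; illustrative, not the macro's actual parameters):
+;   for (y = 0; y < kiHeight; y++) {
+;       memset(pPix + y*kiStride - iPaddingSize, pPix[y*kiStride], iPaddingSize);
+;       memset(pPix + y*kiStride + kiWidth, pPix[y*kiStride + kiWidth - 1], iPaddingSize);
+;   }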
 
-%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a	
+%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
 	; ecx [height]
 	; esi [pSrc+0], 	   edi [pSrc-32], edx [stride], 32(16)	; left
 	; ebx [pSrc+(w-1)], ebp [pSrc+w], 32(16)			; right
;	xor eax, eax 	; for pixel pData (uint8_t)		; make sure eax = 0 (at least the high 24 bits of eax)
-	
-%if %1 == 32		; for luma	
+
+%if %1 == 32		; for luma
 .left_right_loops:
 	; left
 	mov al, byte [esi]		; pixel pData for left border
@@ -256,37 +256,37 @@
 	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdqa [edi], xmm0
 	movdqa [edi+16], xmm0
-	
+
 	; right
 	mov al, byte [ebx]
 	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdqa [ebp], xmm1
 	movdqa [ebp+16], xmm1
-	
+
 	lea esi, [esi+edx]		; left pSrc
 	lea edi, [edi+edx]		; left dst
 	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst	
-	
+	lea ebp, [ebp+edx]		; right dst
+
 	dec ecx
-	jnz near .left_right_loops		
-%elif %1 == 16	; for chroma ??	
+	jnz near .left_right_loops
+%elif %1 == 16	; for chroma ??
 .left_right_loops:
 	; left
 	mov al, byte [esi]		; pixel pData for left border
 	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [edi], xmm0	
-	
+	movdqa [edi], xmm0
+
 	; right
 	mov al, byte [ebx]
 	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdq%2 [ebp], xmm1								; might not be aligned 16 bytes in case chroma planes
-	
+
 	lea esi, [esi+edx]		; left pSrc
 	lea edi, [edi+edx]		; left dst
 	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst	
-	
+	lea ebp, [ebp+edx]		; right dst
+
 	dec ecx
 	jnz near .left_right_loops
 %endif
@@ -339,25 +339,25 @@
 	; TL
 	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?	
+	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 	mov_line_end16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 
 	; TR
 	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
 	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?	
+	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
 	mov_line_end16x4_sse2 ebp, ecx, xmm4, %2	; dst, stride, xmm?
 
 	; BL
 	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?	
+	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 	mov_line_end16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 
 	; BR
 	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?	
+	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 	mov_line_end16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 %endif
 %endmacro
@@ -375,7 +375,7 @@
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@@ -387,10 +387,10 @@
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; kiStride	
+	mov ecx, edx							; kiStride
 	neg ecx 								; -kiStride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*kiStride
 	lea eax, [esi+eax]						; last line of picture pData
@@ -398,16 +398,16 @@
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 32 * stride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; kiWidth-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; kiWidth
-	exp_top_bottom_sse2	32	
-	
+	exp_top_bottom_sse2	32
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@@ -419,7 +419,7 @@
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
@@ -426,7 +426,7 @@
 	butterfly_1to16_sse xmm4, xmm0, a		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	32, a
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@@ -436,7 +436,7 @@
 	; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, to be continued..
 	mov eax, -32							; luma=-32, chroma=-16
 	neg ecx										; -stride
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
 	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
@@ -444,19 +444,19 @@
 	mov ecx, [esp+28]					; kiStride
 	imul edx, ecx							; (height+32(16)) * stride
 	lea eax, [edi+edx]						; last line of bottom-left border
-	lea ebx, [ebp+edx]						; last line of bottom-right border	
+	lea ebx, [ebp+edx]						; last line of bottom-right border
 	neg ecx										; -kiStride
 	; for left & right border expanding
-	exp_cross_sse2		32, a	
-	
+	exp_cross_sse2		32, a
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret
 
 ALIGN 16
@@ -472,7 +472,7 @@
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@@ -484,10 +484,10 @@
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; kiStride	
+	mov ecx, edx							; kiStride
 	neg ecx 								; -kiStride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*kiStride
 	lea eax, [esi+eax]						; last line of picture pData
@@ -495,16 +495,16 @@
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*kiStride + 16 * kiStride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; kiWidth-1
-	lea ebx, [eax+ebx]						; pDst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; pDst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; kiWidth
-	exp_top_bottom_sse2	16	
-	
+	exp_top_bottom_sse2	16
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst: left border pSrc
@@ -516,7 +516,7 @@
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
@@ -523,7 +523,7 @@
 	butterfly_1to16_sse xmm4, xmm0, a		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	16, a
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@@ -533,9 +533,9 @@
 	; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, to be continued..
 	mov eax, -16							; chroma=-16
 	neg ecx										; -stride
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]				
+	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
 	mov ecx, [esp+28]						; kiStride
 	add edx, 16							; height+16, luma=32, chroma=16
@@ -545,15 +545,15 @@
 	neg ecx										; -kiStride
 	; for left & right border expanding
 	exp_cross_sse2		16, a
-	
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret
 
 ALIGN 16
@@ -569,7 +569,7 @@
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@@ -581,10 +581,10 @@
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; kiStride	
+	mov ecx, edx							; kiStride
 	neg ecx 								; -kiStride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*kiStride
 	lea eax, [esi+eax]						; last line of picture pData
@@ -592,16 +592,16 @@
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*kiStride + 16 * kiStride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; kiWidth-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; kiWidth
-	exp_top_bottom_sse2	16	
-	
+	exp_top_bottom_sse2	16
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@@ -613,7 +613,7 @@
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
@@ -620,7 +620,7 @@
 	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	16, u
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -630,9 +630,9 @@
 	; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, to be continued..
 	neg ecx									; -kiStride
 	mov eax, -16							; chroma=-16
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]						
+	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
 	mov ecx, [esp+28]						; kiStride
 	add edx, 16							; kiHeight+16, luma=32, chroma=16
@@ -642,14 +642,14 @@
 	neg ecx									; -kiStride
 	; for left & right border expanding
 	exp_cross_sse2		16, u
-	
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret
 
--- a/codec/decoder/core/asm/intra_pred.asm
+++ b/codec/decoder/core/asm/intra_pred.asm
@@ -38,7 +38,7 @@
 ;*      18/09/2009 Created
 ;*		19/11/2010 Added
 ;*					WelsI16x16LumaPredDcTop_sse2, WelsI16x16LumaPredDcNA_sse2,
-;*					WelsIChromaPredDcLeft_mmx, WelsIChromaPredDcTop_sse2 
+;*					WelsIChromaPredDcLeft_mmx, WelsIChromaPredDcTop_sse2
 ;*					and WelsIChromaPredDcNA_mmx
 ;*
 ;*
@@ -96,13 +96,13 @@
 	punpcklbw	%1,	%3
 	movdqa		%3,	%1
 	punpcklbw	%1,	%3
-	
+
 	;add			%4,	%5
 	movd		%2,	[%4+%5-1]
 	movdqa		%3,	%2
 	punpcklbw	%2,	%3
 	movdqa		%3,	%2
-	punpcklbw	%2,	%3	
+	punpcklbw	%2,	%3
 	punpckldq	%1,	%2
 %endmacro
 
@@ -116,24 +116,24 @@
 		movd	%2,	[%5+%6]
 		punpcklbw %3,	%2
 		punpcklwd %1,	%3
-		lea		%5,	[%5+2*%6]	
+		lea		%5,	[%5+2*%6]
 		movd	%4,	[%5]
 		movd	%2,	[%5+%6]
 		punpcklbw %4,	%2
-		lea		%5,	[%5+2*%6]	
+		lea		%5,	[%5+2*%6]
 		movd	%3,	[%5]
 		movd	%2,	[%5+%6]
 		lea		%5,	[%5+2*%6]
 		punpcklbw %3,	%2
 		punpcklwd %4,	%3
-		punpckhdq %1,	%4	
-%endmacro	
+		punpckhdq %1,	%4
+%endmacro
 
 %macro  SUMW_HORIZON 3
 	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
 	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04 
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26 
+	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
 	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
 	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
 	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
@@ -162,7 +162,7 @@
 		movd	%2,	[%5+%6]
 		punpcklbw %3,	%2
 		punpckhwd %1,	%3
-		lea		%5,	[%5+2*%6]			
+		lea		%5,	[%5+2*%6]
 %endmacro
 
 %macro LOAD_2_LEFT_AND_ADD 0
@@ -186,7 +186,7 @@
 ALIGN 16
 ;*******************************************************************************
 ;   void_t __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
-;   
+;
 ;	pPred must align to 16
 ;*******************************************************************************
 WelsI4x4LumaPredH_sse2:
@@ -196,7 +196,7 @@
 	movzx		edx,	byte [eax-1]
 	movd		xmm0,	edx
 	pmuludq		xmm0,	[mmx_01bytes]
-	
+
 	movzx		edx,	byte [eax+ecx-1]
 	movd		xmm1,	edx
 	pmuludq		xmm1,	[mmx_01bytes]
@@ -205,11 +205,11 @@
 	movzx		edx,	byte [eax+ecx-1]
 	movd		xmm2,	edx
 	pmuludq		xmm2,	[mmx_01bytes]
-	
+
 	movzx		edx,	byte [eax+2*ecx-1]
-	movd		xmm3,	edx	
+	movd		xmm3,	edx
 	pmuludq		xmm3,	[mmx_01bytes]
-	
+
 	sub         eax,    ecx
 	movd        [eax], xmm0
 	movd        [eax+ecx], xmm1
@@ -216,9 +216,9 @@
 	lea         eax, [eax+2*ecx]
 	movd        [eax], xmm2
 	movd        [eax+ecx], xmm3
-	
+
 	ret
-	
+
 ;*******************************************************************************
 ; void_t WelsI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
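+; Scalar sketch of the 16x16 plane mode computed below, consistent with the
+; H/a/b/c comments inside the code (Clip1 clamps to [0,255]):
+;   H = sum for i=0..7 of (i+1) * (top[8+i] - top[6-i])
+;   V = sum for i=0..7 of (i+1) * (left[(8+i)*kiStride] - left[(6-i)*kiStride])
+;   a = (left[15*kiStride] + top[15]) << 4
+;   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
+;   pPred[y*kiStride + x] = Clip1((a + b*(x-7) + c*(y-7) + 16) >> 5)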
@@ -229,9 +229,9 @@
 		mov		ecx,	[esp + pushsize + 8]
 		sub		esi,	1
 		sub		esi,	ecx
-		
+
 		;for H
-		pxor	xmm7,	xmm7	
+		pxor	xmm7,	xmm7
 		movq	xmm0,	[esi]
 		movdqa	xmm5,	[sse2_plane_dec]
 		punpcklbw xmm0,	xmm7
@@ -241,7 +241,7 @@
 		punpcklbw xmm1,	xmm7
 		pmullw	xmm1,	xmm6
 		psubw	xmm1,	xmm0
-		
+
 		SUMW_HORIZON	xmm1,xmm0,xmm2
 		movd    eax,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
 		movsx	eax,	ax
@@ -249,26 +249,26 @@
 		add		eax,	32
 		sar		eax,	6			; b = (5 * H + 32) >> 6;
 		SSE2_Copy8Times	xmm1, eax	; xmm1 = b,b,b,b,b,b,b,b
-		
-		movzx	edx,	BYTE [esi+16]	
+
+		movzx	edx,	BYTE [esi+16]
 		sub	esi, 3
 		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, esi, ecx
-			
+
 		add		esi,	3
 		movzx	eax,	BYTE [esi+8*ecx]
 		add		edx,	eax
 		shl		edx,	4			;	a = (left[15*kiStride] + top[15]) << 4;
-		
+
 		sub	esi, 3
 		add		esi,	ecx
 		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, esi, ecx
-		pxor	xmm4,	xmm4	
+		pxor	xmm4,	xmm4
 		punpckhbw xmm0,	xmm4
 		pmullw	xmm0,	xmm5
 		punpckhbw xmm7,	xmm4
 		pmullw	xmm7,	xmm6
 		psubw	xmm7,	xmm0
-		
+
 		SUMW_HORIZON   xmm7,xmm0,xmm2
 		movd    eax,   xmm7			; V
 		movsx	eax,	ax
@@ -276,17 +276,17 @@
 		imul	eax,	5
 		add		eax,	32
 		sar		eax,	6				; c = (5 * V + 32) >> 6;
-		SSE2_Copy8Times	xmm4, eax		; xmm4 = c,c,c,c,c,c,c,c		
-		
+		SSE2_Copy8Times	xmm4, eax		; xmm4 = c,c,c,c,c,c,c,c
+
 		mov		esi,	[esp + pushsize + 4]
 		add		edx,	16
 		imul	eax,	-7
-		add		edx,	eax				; s = a + 16 + (-7)*c		
-		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s		
-		
+		add		edx,	eax				; s = a + 16 + (-7)*c
+		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s
+
 		xor		eax,	eax
 		movdqa	xmm5,	[sse2_plane_inc_minus]
-		
+
 get_i16x16_luma_pred_plane_sse2_1:
 		movdqa	xmm2,	xmm1
 		pmullw	xmm2,	xmm5
@@ -295,7 +295,7 @@
 		movdqa	xmm3,	xmm1
 		pmullw	xmm3,	xmm6
 		paddw	xmm3,	xmm0
-		psraw	xmm3,	5	
+		psraw	xmm3,	5
 		packuswb xmm2,	xmm3
 		movdqa	[esi],	xmm2
 		paddw	xmm0,	xmm4
@@ -302,13 +302,13 @@
 		add		esi,	ecx
 		inc		eax
 		cmp		eax,	16
-		jnz get_i16x16_luma_pred_plane_sse2_1					
-		
+		jnz get_i16x16_luma_pred_plane_sse2_1
+
 		pop		esi
 		ret
-		
-		
-		
+
+
+
 ;*******************************************************************************
 ; void_t WelsI16x16LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
@@ -315,7 +315,7 @@
 
 %macro SSE2_PRED_H_16X16_TWO_LINE_DEC 0
     lea     eax,	[eax+ecx*2]
-    
+
     COPY_16_TIMES eax,	xmm0
     movdqa  [eax],	xmm0
     COPY_16_TIMESS eax,	xmm0,	ecx
@@ -326,13 +326,12 @@
 WelsI16x16LumaPredH_sse2:
     mov     eax, [esp+4]    ; pPred
     mov     ecx, [esp+8]    ; kiStride
-    
+
     COPY_16_TIMES eax,	xmm0
     movdqa  [eax],		xmm0
     COPY_16_TIMESS eax,	xmm0,	ecx
     movdqa  [eax+ecx],	xmm0
-    
-	SSE2_PRED_H_16X16_TWO_LINE_DEC 
+
 	SSE2_PRED_H_16X16_TWO_LINE_DEC
 	SSE2_PRED_H_16X16_TWO_LINE_DEC
 	SSE2_PRED_H_16X16_TWO_LINE_DEC
@@ -339,9 +338,10 @@
 	SSE2_PRED_H_16X16_TWO_LINE_DEC
 	SSE2_PRED_H_16X16_TWO_LINE_DEC
 	SSE2_PRED_H_16X16_TWO_LINE_DEC
-   
+	SSE2_PRED_H_16X16_TWO_LINE_DEC
+
     ret
-    
+
 ;*******************************************************************************
 ; void_t WelsI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
@@ -349,10 +349,10 @@
 WelsI16x16LumaPredV_sse2:
     mov     edx, [esp+4]    ; pPred
     mov     ecx, [esp+8]    ; kiStride
-    
+
     sub     edx, ecx
     movdqa  xmm0, [edx]
-    
+
     movdqa  [edx+ecx], xmm0
     lea     edx, [edx+2*ecx]
     movdqa  [edx],     xmm0
@@ -377,9 +377,9 @@
     movdqa  [edx+ecx], xmm0
     lea     edx, [edx+2*ecx]
     movdqa  [edx],     xmm0
-        
+
     ret
-    
+
 ;*******************************************************************************
 ; void_t WelsIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
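+; Same plane construction as the 16x16 luma version, at 8x8 scale (a sketch
+; consistent with the a/b/c comments inside the code; Clip1 clamps to [0,255]):
+;   H = sum for i=0..3 of (i+1) * (top[4+i] - top[2-i]),  V likewise from left
+;   a = (left[7*kiStride] + top[7]) << 4
+;   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5
+;   pPred[y*kiStride + x] = Clip1((a + b*(x-3) + c*(y-3) + 16) >> 5)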
@@ -391,8 +391,8 @@
 		mov		ecx,	[esp + pushsize + 8]	;kiStride
 		sub		esi,	1
 		sub		esi,	ecx
-		
-		pxor	mm7,	mm7	
+
+		pxor	mm7,	mm7
 		movq	mm0,	[esi]
 		movq	mm5,	[sse2_plane_dec_c]
 		punpcklbw mm0,	mm7
@@ -402,7 +402,7 @@
 		punpcklbw mm1,	mm7
 		pmullw	mm1,	mm6
 		psubw	mm1,	mm0
-		
+
 		movq2dq xmm1,   mm1
 		pxor    xmm2,   xmm2
 		SUMW_HORIZON	xmm1,xmm0,xmm2
@@ -412,7 +412,7 @@
 		add		eax,	16
 		sar		eax,	5			; b = (17 * H + 16) >> 5;
 		SSE2_Copy8Times	xmm1, eax	; mm1 = b,b,b,b,b,b,b,b
-		
+
 		movzx	edx,	BYTE [esi+8]
 		sub	esi, 3
 		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, esi, ecx
@@ -421,17 +421,17 @@
 		movzx	eax,	BYTE [esi+4*ecx]
 		add		edx,	eax
 		shl		edx,	4			; a = (left[7*kiStride] + top[7]) << 4;
-		
+
 		sub	esi, 3
 		add		esi,	ecx
 		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, esi, ecx
-		pxor	mm4,	mm4	
+		pxor	mm4,	mm4
 		punpckhbw mm0,	mm4
 		pmullw	mm0,	mm5
 		punpckhbw mm7,	mm4
 		pmullw	mm7,	mm6
 		psubw	mm7,	mm0
-		
+
 		movq2dq xmm7,   mm7
 		pxor    xmm2,   xmm2
 		SUMW_HORIZON	xmm7,xmm0,xmm2
@@ -441,17 +441,17 @@
 		imul	eax,	17
 		add		eax,	16
 		sar		eax,	5				; c = (17 * V + 16) >> 5;
-		SSE2_Copy8Times	xmm4, eax		; mm4 = c,c,c,c,c,c,c,c		
-		
+		SSE2_Copy8Times	xmm4, eax		; mm4 = c,c,c,c,c,c,c,c
+
 		mov		esi,	[esp + pushsize + 4]
 		add		edx,	16
 		imul	eax,	-3
-		add		edx,	eax				; s = a + 16 + (-3)*c		
-		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s		
-		
+		add		edx,	eax				; s = a + 16 + (-3)*c
+		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s
+
 		xor		eax,	eax
 		movdqa	xmm5,	[sse2_plane_mul_b_c]
-		
+
 get_i_chroma_pred_plane_sse2_1:
 		movdqa	xmm2,	xmm1
 		pmullw	xmm2,	xmm5
@@ -463,12 +463,12 @@
 		add		esi,	ecx
 		inc		eax
 		cmp		eax,	8
-		jnz get_i_chroma_pred_plane_sse2_1					
-		
+		jnz get_i_chroma_pred_plane_sse2_1
+
 		pop		esi
 		WELSEMMS
-		ret	
-		
+		ret
+
 ALIGN 16
 ;*******************************************************************************
 ;	0 |1 |2 |3 |4 |
@@ -480,13 +480,13 @@
 ;	pPred[7] = ([6]+[0]*2+[1]+2)/4
 ;
 ;   void_t __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
-;   
+;
 ;*******************************************************************************
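+; In t/l/lt neighbour naming (rather than the grid indices above), the
+; standard diagonal-down-right averages are, as a reference sketch
+; (t[-1] read as lt):
+;   x > y : pred[y][x] = (t[x-y-2] + 2*t[x-y-1] + t[x-y] + 2) >> 2
+;   x < y : pred[y][x] = (l[y-x-2] + 2*l[y-x-1] + l[y-x] + 2) >> 2
+;   x == y: pred[y][x] = (t[0] + 2*lt + l[0] + 2) >> 2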
-WelsI4x4LumaPredDDR_mmx:	
+WelsI4x4LumaPredDDR_mmx:
 	mov			edx,[esp+4]			;pPred
 	mov         eax,edx
 	mov			ecx,[esp+8]		;kiStride
-	
+
 	movq        mm1,[eax+ecx-8]		;get value of 11; loading 8 bytes earlier is meant to improve movq performance; mm1[8] = 11
 	movq        mm2,[eax-8]			;get value of 6 mm2[8] = 6
 	sub			eax, ecx			;mov eax to above line of current block(postion of 1)
@@ -513,19 +513,19 @@
 	pand        mm1,[mmx_01bytes]	;set the odd bit
 	psubusb     mm3,mm1				;decrease 1 from odd bytes
 	pavgb       mm2,mm3				;mm2=(([11]+[21]+1)/2+1+[16])/2
-	
+
 	lea         edx,[edx+ecx]
-	movd        [edx+2*ecx],mm2 
+	movd        [edx+2*ecx],mm2
 	sub         edx,ecx
-	psrlq       mm2,8 
-	movd        [edx+2*ecx],mm2 
-	psrlq       mm2,8 
-	movd        [edx+ecx],mm2 
-	psrlq       mm2,8 
+	psrlq       mm2,8
+	movd        [edx+2*ecx],mm2
+	psrlq       mm2,8
+	movd        [edx+ecx],mm2
+	psrlq       mm2,8
 	movd        [edx],mm2
 	WELSEMMS
 	ret
-	
+
 ALIGN 16
 ;*******************************************************************************
 ;	0 |1 |2 |3 |4 |
@@ -537,36 +537,36 @@
 ;	pPred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
 ;
 ;   void_t __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
-;   
+;
 ;*******************************************************************************
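+; Scalar sketch of what the code below computes: one DC value from the four
+; top and four left neighbours, splatted over the whole 4x4 block:
+;   dc = (t0+t1+t2+t3 + l0+l1+l2+l3 + 4) >> 3;
+;   (the imul by 0x01010101 below replicates dc into all four bytes of a
+;   32-bit store, one store per row)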
-WelsI4x4LumaPredDc_sse2:	
+WelsI4x4LumaPredDc_sse2:
 	mov         eax,[esp+4]			;pPred
 	mov			ecx,[esp+8]			;kiStride
 	push		ebx
-		
+
 	movzx		edx,	byte [eax-1h]
-	
+
 	sub			eax,	ecx
 	movd		xmm0,	[eax]
 	pxor		xmm1,	xmm1
 	psadbw		xmm0,	xmm1
-	
+
 	movd		ebx,	xmm0
 	add			ebx,	edx
-	
+
 	movzx		edx,	byte [eax+ecx*2-1h]
 	add			ebx,	edx
-	
+
 	lea			eax,	[eax+ecx*2-1]
 	movzx		edx,	byte [eax+ecx]
 	add			ebx,	edx
-	
+
 	movzx		edx,	byte [eax+ecx*2]
 	add			ebx,	edx
 	add			ebx,	4
 	sar			ebx,	3
 	imul		ebx,	0x01010101
-	
+
 	mov			edx,	[esp+8]			;pPred
 	mov         [edx],       ebx
 	mov         [edx+ecx],   ebx
@@ -575,8 +575,8 @@
 	mov         [edx+ecx],   ebx
 
 	pop ebx
-	ret	
-	
+	ret
+
 ALIGN 16
 ;*******************************************************************************
 ;	void_t __cdecl WelsIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -585,7 +585,7 @@
 %macro MMX_PRED_H_8X8_ONE_LINE 4
 	movq		%1,		[%3-8]
 	psrlq		%1,		38h
-	
+
 	pmullw		%1,		[mmx_01bytes]
 	pshufw		%1,		%1,	0
 	movq		[%4],	%1
@@ -594,7 +594,7 @@
 %macro MMX_PRED_H_8X8_ONE_LINEE 4
 	movq		%1,		[%3+ecx-8]
 	psrlq		%1,		38h
-	
+
 	pmullw		%1,		[mmx_01bytes]
 	pshufw		%1,		%1,	0
 	movq		[%4],	%1
@@ -605,37 +605,37 @@
 	mov			edx,	[esp+4]			;pPred
 	mov         eax,	edx
 	mov			ecx,	[esp+8]			;kiStride
-	
+
 	movq		mm0,	[eax-8]
 	psrlq		mm0,	38h
-	
+
 	pmullw		mm0,		[mmx_01bytes]
 	pshufw		mm0,	mm0,	0
 	movq		[edx],	mm0
-	
+
 	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-	
+
 	lea			eax, [eax+ecx*2]
 	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax, edx+2*ecx
-	
+
 	lea         edx, [edx+2*ecx]
 	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-	
+
 	lea			eax, [eax+ecx*2]
 	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax, edx+2*ecx
-	
+
 	lea         edx, [edx+2*ecx]
 	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-	
+
 	lea			eax, [eax+ecx*2]
 	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax, edx+2*ecx
 
     lea         edx, [edx+2*ecx]
 	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-		
+
 	WELSEMMS
-	ret	
-	
+	ret
+
 ALIGN 16
 ;*******************************************************************************
 ;	void_t __cdecl get_i4x4_luma_pred_v_asm(uint8_t *pPred, const int32_t kiStride)
@@ -645,7 +645,7 @@
 get_i4x4_luma_pred_v_asm:
 	mov			eax,	[esp+4]        ;pPred
 	mov			ecx,	[esp+8]        ;kiStride
-	
+
 	sub			eax,	ecx
 	mov         edx,    [eax]
 	mov		    [eax+ecx],	 edx
@@ -653,9 +653,9 @@
 	lea         eax, [eax+2*ecx]
 	mov			[eax+ecx],	 edx
 	mov			[eax+2*ecx], edx
-	
-	ret	
 
+	ret
+
 ALIGN 16
 ;*******************************************************************************
 ;	void_t __cdecl WelsIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -665,7 +665,7 @@
 WelsIChromaPredV_mmx:
 	mov			eax,		[esp+4]    ;pPred
 	mov			ecx,		[esp+8]    ;kiStride
-	
+
 	sub			eax,		ecx
 	movq		mm0,		[eax]
 
@@ -680,11 +680,11 @@
 	lea         eax, [eax+2*ecx]
 	movq		[eax+ecx],      mm0
 	movq		[eax+2*ecx],    mm0
-	
+
 	WELSEMMS
 	ret
-	
-	
+
+
 	ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -710,13 +710,13 @@
 
 ;   f = (2 + l1 + (l0<<1) + lt)>>2
 ;   h = (2 + l2 + (l1<<1) + l0)>>2
-;   j = (2 + l3 + (l2<<1) + l1)>>2   
+;   j = (2 + l3 + (l2<<1) + l1)>>2
 ;   [b a f e h g j i] + [d c b a] --> mov to memory
-;   
+;
 ;   void_t WelsI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
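+; The averages above fill the 4x4 block in the usual horizontal-down layout
+; (a sketch; each lower row shifts two left-derived values in):
+;   row0: a b c d
+;   row1: e f a b
+;   row2: g h e f
+;   row3: i j g h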
 WELS_EXTERN WelsI4x4LumaPredHD_mmx
-WelsI4x4LumaPredHD_mmx:	
+WelsI4x4LumaPredHD_mmx:
 	mov			edx, [esp+4]			; pPred
 	mov         eax, edx
 	mov			ecx, [esp+8]            ; kiStride
@@ -723,16 +723,16 @@
 	sub         eax, ecx
 	movd        mm0, [eax-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
 	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
-	
-	movd        mm1, [eax+2*ecx-4]        
-	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1	
+
+	movd        mm1, [eax+2*ecx-4]
+	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1
 	lea         eax, [eax+2*ecx]
-	movd        mm2, [eax+2*ecx-4]        
+	movd        mm2, [eax+2*ecx-4]
 	punpcklbw   mm2, [eax+ecx-4]        ; mm2[7] = l2, mm2[6] = l3
 	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
 	psrlq       mm2, 20h
 	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
-	
+
 	movq        mm1, mm0
 	psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
 	movq        mm2, mm0
@@ -740,17 +740,17 @@
 	movq        mm3, mm2
 	movq        mm4, mm1
 	pavgb       mm1, mm0
-	
+
 	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
 	pand        mm4, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm1, mm4				; decrease 1 from odd bytes
-	
+
 	pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
-	
+
 	movq        mm4, mm0
 	pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
 	punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
-	
+
 	psrlq       mm2, 20h
 	psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
 	movq        mm4, mm3
@@ -757,7 +757,7 @@
 	psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  j]
 	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
 	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
-	
+
 	movd        [edx], mm2
 	lea         edx, [edx+ecx]
 	movd        [edx+2*ecx], mm3
@@ -768,9 +768,9 @@
 	movd        [edx+ecx], mm3
 	WELSEMMS
 	ret
-	
-	
-	
+
+
+
 ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -793,17 +793,17 @@
 ;   b = (2 + l0 + (l1<<1) + l2)>>2
 ;   d = (2 + l1 + (l2<<1) + l3)>>2
 ;   f = (2 + l2 + (l3<<1) + l3)>>2
- 
+
 ;   [g g f e d c b a] + [g g g g] --> mov to memory
-;   
+;
 ;   void_t WelsI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
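+; Sketch of the usual horizontal-up layout produced below (g is simply l3,
+; replicated once the left column is exhausted):
+;   row0: a b c d
+;   row1: c d e f
+;   row2: e f g g
+;   row3: g g g g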
 WELS_EXTERN WelsI4x4LumaPredHU_mmx
-WelsI4x4LumaPredHU_mmx:	
+WelsI4x4LumaPredHU_mmx:
 	mov			edx, [esp+4]			; pPred
 	mov         eax, edx
 	mov			ecx, [esp+8]            ; kiStride
-	
+
 	movd        mm0, [eax-4]            ; mm0[3] = l0
 	punpcklbw   mm0, [eax+ecx-4]        ; mm0[7] = l1, mm0[6] = l0
 	lea         eax, [eax+2*ecx]
@@ -811,39 +811,39 @@
 	movd        mm4, [eax+ecx-4]        ; mm4[3] = l3
 	punpcklbw   mm2, mm4
 	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
-	
+
 	psrlq       mm4, 18h
 	psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
 	psrlq       mm0, 8h
 	pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
-	
+
 	movq        mm1, mm0
 	psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
 	movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
 	pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
-	
+
 	movq        mm2, mm0
 	psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
 	movq        mm5, mm2
 	pavgb       mm2, mm0
-	
+
 	pxor        mm5, mm0				; find odd value in the lowest bit of each byte
 	pand        mm5, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm2, mm5				; decrease 1 from odd bytes
-	
+
 	pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
-	
+
 	psrlq       mm2, 8h
 	pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
-	
+
 	punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
 	punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
 	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
-	
+
 	psrlq       mm4, 20h
 	lea         edx, [edx+ecx]
 	movd        [edx+2*ecx], mm4
-	
+
 	sub         edx, ecx
 	movd        [edx], mm1
 	psrlq       mm1, 10h
@@ -852,9 +852,9 @@
 	movd        [edx+2*ecx], mm1
 	WELSEMMS
 	ret
-	
-	
-	
+
+
+
 ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -880,12 +880,12 @@
 
 ;   h = (2 + t1 + (t2<<1) + t3)>>2
 ;   i = (2 + lt + (l0<<1) + l1)>>2
-;   j = (2 + l0 + (l1<<1) + l2)>>2   
-;   
+;   j = (2 + l0 + (l1<<1) + l2)>>2
+;
 ;   void_t WelsI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
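+; Sketch of the usual vertical-right layout produced below (rows 2 and 3
+; repeat rows 0 and 1 shifted right, with i and j filling the first column):
+;   row0: a b c d
+;   row1: e f g h
+;   row2: i a b c
+;   row3: j e f g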
 WELS_EXTERN WelsI4x4LumaPredVR_mmx
-WelsI4x4LumaPredVR_mmx:	
+WelsI4x4LumaPredVR_mmx:
 	mov			edx, [esp+4]			; pPred
 	mov         eax, edx
 	mov			ecx, [esp+8]            ; kiStride
@@ -892,51 +892,51 @@
 	sub         eax, ecx
 	movq        mm0, [eax-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
 	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
-	
-	movd        mm1, [eax+2*ecx-4]        
-	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1	
+
+	movd        mm1, [eax+2*ecx-4]
+	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1
 	lea         eax, [eax+2*ecx]
 	movq        mm2, [eax+ecx-8]        ; mm2[7] = l2
 	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
 	psrlq       mm2, 28h
 	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
-	
+
 	movq        mm1, mm0
 	psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
 	pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
-	
+
 	movq        mm2, mm0
 	psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
 	movq        mm3, mm2
 	pavgb       mm2, mm0
-	
+
 	pxor        mm3, mm0				; find odd value in the lowest bit of each byte
 	pand        mm3, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm2, mm3				; decrease 1 from odd bytes
-	
+
 	movq        mm3, mm0
 	psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
 	pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
 	movq        mm2, mm3
-	
+
 	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
 	movd        [edx], mm1
-	
+
 	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
 	movd        [edx+ecx], mm2
-	
+
 	movq        mm4, mm3
 	psllq       mm4, 20h
 	psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
-	
+
 	movq        mm5, mm3
 	psllq       mm5, 28h
 	psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
-	
+
 	psllq       mm1, 8h
 	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
 	movd        [edx+2*ecx], mm4
-	
+
 	psllq       mm2, 8h
 	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
 	lea         edx, [edx+2*ecx]
@@ -943,7 +943,7 @@
 	movd        [edx+ecx], mm5
 	WELSEMMS
 	ret
-	
+
 ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -966,13 +966,13 @@
 ;   e = (2 + t4 + t6 + (t5<<1))>>2
 ;   f = (2 + t5 + t7 + (t6<<1))>>2
 ;   g = (2 + t6 + t7 + (t7<<1))>>2
- 
+
 ;   [g f e d c b a] --> mov to memory
-;   
+;
 ;   void_t WelsI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
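+; Sketch of the usual diagonal-down-left layout produced below (each row is
+; the previous one shifted left by one, pulling in the next average):
+;   row0: a b c d
+;   row1: b c d e
+;   row2: c d e f
+;   row3: d e f g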
 WELS_EXTERN WelsI4x4LumaPredDDL_mmx
-WelsI4x4LumaPredDDL_mmx:	
+WelsI4x4LumaPredDDL_mmx:
 	mov			edx, [esp+4]			; pPred
 	mov         eax, edx
 	mov			ecx, [esp+8]            ; kiStride
@@ -980,11 +980,11 @@
 	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
 	movq        mm1, mm0
 	movq        mm2, mm0
-	
+
 	movq        mm3, mm0
 	psrlq       mm3, 38h
 	psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
-	
+
 	psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
 	psrlq       mm2, 8h
 	pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
@@ -994,9 +994,9 @@
 	pxor        mm3, mm2				; find odd value in the lowest bit of each byte
 	pand        mm3, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm1, mm3				; decrease 1 from odd bytes
-	
+
 	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
-	
+
 	psrlq       mm0, 8h
 	movd        [edx], mm0
 	psrlq       mm0, 8h
@@ -1008,8 +1008,8 @@
 	movd        [edx+ecx], mm0
 	WELSEMMS
 	ret
-	
-	
+
+
 ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -1035,40 +1035,40 @@
 ;   g = (2 + t2 + (t3<<1) + t4)>>2
 ;   h = (2 + t3 + (t4<<1) + t5)>>2
 ;   j = (2 + t4 + (t5<<1) + t6)>>2
- 
+
 ;   [i d c b a] + [j h g f e] --> mov to memory
-;   
+;
 ;   void_t WelsI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
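+; Sketch of the usual vertical-left layout produced below (rows 2 and 3 are
+; rows 0 and 1 shifted left, with i and j entering at the right):
+;   row0: a b c d
+;   row1: e f g h
+;   row2: b c d i
+;   row3: f g h j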
 WELS_EXTERN WelsI4x4LumaPredVL_mmx
-WelsI4x4LumaPredVL_mmx:	
+WelsI4x4LumaPredVL_mmx:
 	mov			edx, [esp+4]			; pPred
 	mov         eax, edx
 	mov			ecx, [esp+8]            ; kiStride
-	
+
 	sub         eax, ecx
 	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
 	movq        mm1, mm0
 	movq        mm2, mm0
-	
+
 	psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
 	psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
 
 	movq        mm3, mm1
 	pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
-	
+
 	movq        mm4, mm2
-	pavgb       mm2, mm0	
+	pavgb       mm2, mm0
 	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
 	pand        mm4, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm2, mm4				; subtract 1 from the odd bytes
-	
+
 	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
-	
+
 	movd        [edx], mm3
 	psrlq       mm3, 8h
 	movd        [edx+2*ecx], mm3
-	
+
 	movd        [edx+ecx], mm2
 	psrlq       mm2, 8h
 	lea         edx, [edx+2*ecx]
@@ -1075,7 +1075,7 @@
 	movd        [edx+ecx], mm2
 	WELSEMMS
 	ret
-	
+
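
The vertical-left case mixes two-tap and three-tap rows: rows 0 and 2 take the a..d and i averages, rows 1 and 3 take the e..j three-tap values. A hedged C model (illustrative names):

    #include <stdint.h>

    static void i4x4_pred_vl(uint8_t *dst, int stride,
                             const uint8_t *t /* t0..t6 used */) {
        for (int x = 0; x < 4; x++) {
            dst[0 * stride + x] = (uint8_t)((t[x] + t[x + 1] + 1) >> 1);
            dst[1 * stride + x] = (uint8_t)((t[x] + 2 * t[x + 1] + t[x + 2] + 2) >> 2);
            dst[2 * stride + x] = (uint8_t)((t[x + 1] + t[x + 2] + 1) >> 1);
            dst[3 * stride + x] = (uint8_t)((t[x + 1] + 2 * t[x + 2] + t[x + 3] + 2) >> 2);
        }
    }
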
 ALIGN 16
 ;*******************************************************************************
 ;
@@ -1082,11 +1082,11 @@
 ;   void_t WelsIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsIChromaPredDc_sse2
-WelsIChromaPredDc_sse2:	
+WelsIChromaPredDc_sse2:
 	push        ebx
 	mov         eax, [esp+8]			; pPred
 	mov			ecx, [esp+12]           ; kiStride
-	
+
 	sub         eax, ecx
 	movq        mm0, [eax]
 
@@ -1100,7 +1100,7 @@
 	movzx		edx, byte [eax-0x01]     ; l4
 	add			ebx, edx
 	movd        mm1, ebx                 ; mm1 = l1+l2+l3+l4
-	
+
 	movzx		ebx, byte [eax+ecx-0x01] ; l5
 	lea         eax, [eax+2*ecx]
 	movzx		edx, byte [eax-0x01]     ; l6
@@ -1111,7 +1111,7 @@
 	movzx		edx, byte [eax-0x01]     ; l8
 	add			ebx, edx
 	movd        mm2, ebx                 ; mm2 = l5+l6+l7+l8
-	
+
 	movq        mm3, mm0
 	psrlq       mm0, 0x20
 	psllq       mm3, 0x20
@@ -1118,46 +1118,46 @@
 	psrlq       mm3, 0x20
 	pxor		mm4, mm4
 	psadbw		mm0, mm4
-	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2	
-	
+	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+
 	paddq       mm3, mm1
 	movq        mm1, mm2
 	paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-	
+
 	movq        mm4, [mmx_0x02]
-	
+
 	paddq       mm0, mm4
 	psrlq       mm0, 0x02
-	
+
 	paddq       mm2, mm4
 	psrlq       mm2, 0x02
-	
+
 	paddq       mm3, mm4
 	paddq       mm3, mm4
 	psrlq       mm3, 0x03
-	
+
 	paddq       mm1, mm4
 	paddq       mm1, mm4
 	psrlq       mm1, 0x03
-	
+
 	pmuludq     mm0, [mmx_01bytes]
 	pmuludq     mm3, [mmx_01bytes]
 	psllq       mm0, 0x20
 	pxor        mm0, mm3                 ; mm0 = m_up
-	
+
 	pmuludq     mm2, [mmx_01bytes]
 	pmuludq     mm1, [mmx_01bytes]
 	psllq       mm1, 0x20
 	pxor        mm1, mm2                 ; mm2 = m_down
-	
+
 	mov         edx, [esp+8]			 ; pPred
-	
+
 	movq        [edx],       mm0
 	movq        [edx+ecx],   mm0
 	movq        [edx+2*ecx], mm0
 	lea         edx, [edx+2*ecx]
 	movq        [edx+ecx],   mm0
-	
+
 	movq        [edx+2*ecx], mm1
 	lea         edx, [edx+2*ecx]
 	movq        [edx+ecx],   mm1
@@ -1164,13 +1164,13 @@
 	movq        [edx+2*ecx], mm1
 	lea         edx, [edx+2*ecx]
 	movq        [edx+ecx],   mm1
-	
+
 	pop         ebx
 	WELSEMMS
 	ret
-	
-	
-	
+
+
+
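
The chroma DC routine computes four sums (sum1..sum4 in the comments): the left half of the top row plus l1..l4, the right half of the top row, l5..l8, and the bottom-right combination, then rounds each into a per-quadrant DC before pmuludq broadcasts the bytes. A hedged C model (illustrative names):

    #include <stdint.h>

    static void chroma_pred_dc(uint8_t *dst, int stride,
                               const uint8_t top[8], const uint8_t left[8]) {
        int tl = 0, tr = 0, lt = 0, lb = 0;
        for (int i = 0; i < 4; i++) {
            tl += top[i];  tr += top[i + 4];
            lt += left[i]; lb += left[i + 4];
        }
        uint8_t dc00 = (uint8_t)((tl + lt + 4) >> 3);   /* top-left     */
        uint8_t dc01 = (uint8_t)((tr + 2) >> 2);        /* top-right    */
        uint8_t dc10 = (uint8_t)((lb + 2) >> 2);        /* bottom-left  */
        uint8_t dc11 = (uint8_t)((tr + lb + 4) >> 3);   /* bottom-right */
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                dst[y * stride + x] = y < 4 ? (x < 4 ? dc00 : dc01)
                                            : (x < 4 ? dc10 : dc11);
    }
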
 ALIGN 16
 ;*******************************************************************************
 ;
@@ -1177,11 +1177,11 @@
 ;   void_t WelsI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsI16x16LumaPredDc_sse2
-WelsI16x16LumaPredDc_sse2:	
+WelsI16x16LumaPredDc_sse2:
 	push        ebx
 	mov         eax, [esp+8]			; pPred
 	mov			ecx, [esp+12]           ; kiStride
-	
+
 	sub         eax, ecx
 	movdqa      xmm0, [eax]             ; read one row
 	pxor		xmm1, xmm1
@@ -1191,7 +1191,7 @@
 	pslldq      xmm0, 0x08
 	psrldq      xmm0, 0x08
 	paddw       xmm0, xmm1
-	
+
 	movzx		ebx, byte [eax+ecx-0x01]
 	movzx		edx, byte [eax+2*ecx-0x01]
 	add			ebx, edx
@@ -1209,44 +1209,44 @@
 	psrld       xmm0, 0x05
 	pmuludq     xmm0, [mmx_01bytes]
 	pshufd      xmm0, xmm0, 0
-	
+
 	mov         edx, [esp+8]			; pPred
-	
+
 	movdqa      [edx],       xmm0
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 
 	pop         ebx
 
 	ret
-	
+
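
For the 16x16 DC mode, psadbw folds the top row into a sum, the scalar loop accumulates the 16 left neighbours, and pmuludq/pshufd broadcast the resulting byte across the row. Arithmetically (hedged C model, illustrative names):

    #include <stdint.h>

    static void i16x16_pred_dc(uint8_t *dst, int stride,
                               const uint8_t top[16], const uint8_t left[16]) {
        int sum = 16;                        /* rounding term */
        for (int i = 0; i < 16; i++)
            sum += top[i] + left[i];
        uint8_t dc = (uint8_t)(sum >> 5);
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++)
                dst[y * stride + x] = dc;
    }
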
 ;*******************************************************************************
 ; for intra prediction as follows, 11/19/2010
 ;*******************************************************************************
@@ -1258,12 +1258,12 @@
 WELS_EXTERN WelsI16x16LumaPredDcTop_sse2
 WelsI16x16LumaPredDcTop_sse2:
 	push ebx
-	
+
 	%define PUSH_SIZE 4
-	
+
 	mov eax, [esp+PUSH_SIZE+4]	; pPred
 	mov ebx, [esp+PUSH_SIZE+8]	; kiStride
-	
+
 	mov ecx, ebx
 	neg ecx
 	movdqa xmm0, [eax+ecx]		; pPred-kiStride, top line
@@ -1278,10 +1278,10 @@
 	pshufd xmm1, xmm0, 0b1h		; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
 	paddw xmm0, xmm1			; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
 	pshuflw xmm1, xmm0, 0b1h	; 10110001
-	paddw xmm0, xmm1			; sum in word unit (x8)	
+	paddw xmm0, xmm1			; sum in word unit (x8)
 	movd edx, xmm0
 	and edx, 0ffffh
-	
+
 	add edx, 08h
 	sar edx, 04h
 	mov dh, dl
@@ -1288,35 +1288,35 @@
 	mov ecx, edx
 	shl ecx, 010h
 	or edx, ecx
-	movd xmm1, edx	
+	movd xmm1, edx
 	pshufd xmm0, xmm1, 00h
 	movdqa xmm1, xmm0
-	
+
 	lea ecx, [2*ebx+ebx]		; 3*kiStride
-	
+
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
 	movdqa [eax+ecx], xmm1
-	
+
 	lea eax, [eax+4*ebx]
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
 	movdqa [eax+ecx], xmm1
-	
+
 	lea eax, [eax+4*ebx]
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
 	movdqa [eax+ecx], xmm1
-	
+
 	lea eax, [eax+4*ebx]
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
 	movdqa [eax+ecx], xmm1
-	
+
 	%undef PUSH_SIZE
 	pop ebx
 	ret
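
When only the top neighbours exist, the DC reduces to (sum of the 16 top samples + 8) >> 4; the mov dh, dl / shl / or steps then widen that byte to a dword so pshufd can splat it across the whole row. A hedged C restatement of the DC value:

    #include <stdint.h>

    static uint8_t i16x16_dc_top(const uint8_t top[16]) {
        int sum = 8;                         /* rounding */
        for (int i = 0; i < 16; i++)
            sum += top[i];
        return (uint8_t)(sum >> 4);
    }
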
@@ -1328,41 +1328,41 @@
 WELS_EXTERN WelsI16x16LumaPredDcNA_sse2
 WelsI16x16LumaPredDcNA_sse2:
 	push ebx
-	
+
 	%define PUSH_SIZE	4
-	
+
 	mov eax, [esp+PUSH_SIZE+4]	; pPred
-	mov ebx, [esp+PUSH_SIZE+8]	; kiStride	
-	
+	mov ebx, [esp+PUSH_SIZE+8]	; kiStride
+
 	lea ecx, [2*ebx+ebx]		; 3*kiStride
-	
+
 	movdqa xmm0, [sse2_dc_0x80]
-	movdqa xmm1, xmm0	
+	movdqa xmm1, xmm0
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
-	movdqa [eax+ecx], xmm1	
+	movdqa [eax+ecx], xmm1
 	lea eax, [eax+4*ebx]
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
-	movdqa [eax+ecx], xmm1	
+	movdqa [eax+ecx], xmm1
 	lea eax, [eax+4*ebx]
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
-	movdqa [eax+ecx], xmm1	
+	movdqa [eax+ecx], xmm1
 	lea eax, [eax+4*ebx]
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
 	movdqa [eax+ecx], xmm1
-	
+
 	%undef PUSH_SIZE
-	
+
 	pop ebx
 	ret
-	
+
 ALIGN 16
 ;*******************************************************************************
 ;	void_t WelsIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -1370,12 +1370,12 @@
 WELS_EXTERN WelsIChromaPredDcLeft_mmx
 WelsIChromaPredDcLeft_mmx:
 	push ebx
-	push esi	
+	push esi
 	%define PUSH_SIZE 8
 	mov esi, [esp+PUSH_SIZE+4]	; pPred
 	mov ecx, [esp+PUSH_SIZE+8]	; kiStride
 	mov eax, esi
-	; for left	
+	; for left
 	dec eax
 	xor ebx, ebx
 	xor edx, edx
@@ -1384,7 +1384,7 @@
 	add ebx, edx
 	lea eax, [eax+2*ecx]
 	mov dl, [eax]
-	add ebx, edx	
+	add ebx, edx
 	mov dl, [eax+ecx]
 	add ebx, edx
 	add ebx, 02h
@@ -1451,7 +1451,7 @@
 	movdqa xmm6, [sse2_wd_0x02]
 	paddw xmm0, xmm6
 	psraw xmm0, 02h
-	packuswb xmm0, xmm7	
+	packuswb xmm0, xmm7
 	lea ebx, [2*ecx+ecx]
 	movq [eax], xmm0
 	movq [eax+ecx], xmm0
@@ -1463,10 +1463,10 @@
 	movq [eax+2*ecx], xmm0
 	movq [eax+ebx], xmm0
 	%undef PUSH_SIZE
-	pop ebx	
+	pop ebx
 	ret
 
-	
+
 ALIGN 16
 ;*******************************************************************************
 ;	void_t WelsIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -1495,4 +1495,4 @@
 	ret
 
 
-	
+
--- a/codec/decoder/core/asm/mb_copy.asm
+++ b/codec/decoder/core/asm/mb_copy.asm
@@ -37,7 +37,7 @@
 ;*  History
 ;*      15/09/2009 Created
 ;*		12/28/2009 Modified with larger throughput
-;*		12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2, 
+;*		12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
 ;*				   WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
 ;*
 ;*
@@ -84,7 +84,7 @@
 ;                           int iHeight );
 ;*******************************************************************************
 PixelAvgWidthEq4_mmx:
-   
+
     push        esi
     push        edi
     push        ebp
@@ -102,7 +102,7 @@
 	movd        mm0, [ebp]
     pavgb       mm0, [esi]
     movd        [edi], mm0
-   
+
     dec         ebx
     lea         edi, [edi+eax]
     lea         esi, [esi+ecx]
@@ -115,7 +115,7 @@
     pop         edi
     pop         esi
     ret
-                          
+
 ALIGN 16
 ;*******************************************************************************
 ; void_t PixelAvgWidthEq8_mmx( uint8_t *pDst,  int iDstStride,
@@ -124,7 +124,7 @@
 ;                           int iHeight );
 ;*******************************************************************************
 PixelAvgWidthEq8_mmx:
-    
+
     push        esi
     push        edi
     push        ebp
@@ -145,14 +145,14 @@
     movq        mm0, [esi+ecx]
     pavgb       mm0, [ebp+edx]
     movq		[edi+eax], mm0
-    
+
     lea			esi,  [esi+2*ecx]
     lea			ebp, [ebp+2*edx]
     lea			edi,  [edi+2*eax]
-    
+
     sub           ebx, 2
     jnz         .height_loop
-	
+
 	WELSEMMS
     pop         ebx
     pop         ebp
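
All of the PixelAvgWidthEq* variants are the same operation at different widths: each output byte is the rounded mean of the two prediction blocks, which is exactly what pavgb computes. A hedged C model (illustrative names):

    #include <stdint.h>

    static void pixel_avg(uint8_t *dst, int dst_stride,
                          const uint8_t *a, int a_stride,
                          const uint8_t *b, int b_stride,
                          int width, int height) {
        for (int y = 0; y < height; y++)
            for (int x = 0; x < width; x++)
                dst[y * dst_stride + x] = (uint8_t)
                    ((a[y * a_stride + x] + b[y * b_stride + x] + 1) >> 1);
    }
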
@@ -174,8 +174,8 @@
     push        edi
     push        ebp
     push        ebx
-    
 
+
     mov         edi, [esp+20]       ; pDst
     mov         eax, [esp+24]       ; iDstStride
     mov         esi, [esp+28]       ; pSrcA
@@ -188,28 +188,28 @@
 	movdqu      xmm0, [esi]
 	pavgb         xmm0, [ebp]
     movdqu      [edi], xmm0
-    
+
 	movdqu      xmm0, [esi+ecx]
 	pavgb         xmm0, [ebp+edx]
     movdqu      [edi+eax], xmm0
-	
+
 	movdqu      xmm0, [esi+2*ecx]
 	pavgb         xmm0, [ebp+2*edx]
     movdqu      [edi+2*eax], xmm0
-    
+
     lea              esi,  [esi+2*ecx]
     lea			   ebp, [ebp+2*edx]
     lea			   edi,  [edi+2*eax]
-     
+
 	movdqu      xmm0, [esi+ecx]
 	pavgb         xmm0, [ebp+edx]
     movdqu      [edi+eax], xmm0
-    
+
     lea              esi,  [esi+2*ecx]
     lea			   ebp, [ebp+2*edx]
     lea			   edi,  [edi+2*eax]
-	    
-    
+
+
     sub         ebx, 4
     jne         .height_loop
 
@@ -232,7 +232,7 @@
     push    edi
     push    ebx
 
-    
+
     mov esi,  [esp+16]
     mov eax, [esp+20]
     mov edi,  [esp+24]
@@ -242,12 +242,12 @@
 .height_loop:
 	mov ebx, [esi]
 	mov [edi], ebx
-	
+
 	add esi, eax
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	WELSEMMS   
+	WELSEMMS
 	pop	   ebx
     pop     edi
     pop     esi
@@ -275,12 +275,11 @@
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	
-	WELSEMMS   
+
+	WELSEMMS
     pop     edi
     pop     esi
     ret
-	
 
 
 
@@ -288,6 +287,7 @@
 
 
 
+
 ALIGN 16
 ;*******************************************************************************
 ;   void_t McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
@@ -308,7 +308,7 @@
     push    edi
 
     mov     esi, [esp+12]       ; pSrc
-    mov     eax, [esp+16]       ; iSrcStride    
+    mov     eax, [esp+16]       ; iSrcStride
     mov     edi, [esp+20]       ; pDst
     mov     edx, [esp+24]       ; iDstStride
     mov     ecx, [esp+28]       ; iHeight
@@ -324,7 +324,7 @@
     lea     esi, [esi+eax*2]
     lea     edi, [edi+edx*2]
     jnz     .height_loop
-  
+
     pop     edi
     pop     esi
     ret
--- a/codec/decoder/core/asm/mc_chroma.asm
+++ b/codec/decoder/core/asm/mc_chroma.asm
@@ -1,317 +1,317 @@
-;*!
-;* \copy
-;*     Copyright (c)  2004-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mc_chroma.asm
-;*
-;*  Abstract
-;*      mmx motion compensation for chroma
-;*
-;*  History
-;*      10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:
-	dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:
-	dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src, 
-;							int32_t iSrcStride, 
-;							uint8_t *pDst, 
-;							int32_t iDstStride, 
-;							uint8_t *pABCD, 
-;							int32_t iHeigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:
-	push esi
-	push edi
-	push ebx
-	
-	mov eax, [esp +12 + 20]
-	movd mm3, [eax]
-	WELS_Zero mm7
-	punpcklbw mm3, mm3
-	movq      mm4, mm3
-	punpcklwd mm3, mm3       
-	punpckhwd mm4, mm4		 
-	
-	movq	  mm5, mm3
-	punpcklbw mm3, mm7
-	punpckhbw mm5, mm7
-	
-	movq	  mm6, mm4
-	punpcklbw mm4, mm7
-	punpckhbw mm6, mm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
-	lea ebx, [esi + eax]
-	movd mm0, [esi]
-	movd mm1, [esi+1]
-	punpcklbw mm0, mm7
-	punpcklbw mm1, mm7
-.xloop:
-	
-	pmullw mm0, mm3
-	pmullw mm1, mm5
-	paddw  mm0, mm1
-	
-	movd  mm1, [ebx]
-	punpcklbw mm1, mm7
-	movq mm2, mm1
-	pmullw mm1, mm4
-	paddw mm0, mm1
-	
-	movd mm1, [ebx+1]
-	punpcklbw mm1, mm7
-	movq mm7, mm1
-	pmullw mm1,mm6
-	paddw mm0, mm1
-	movq mm1,mm7
-
-	paddw mm0, [h264_d0x20_mmx]
-	psrlw mm0, 6
-	
-	WELS_Zero mm7
-	packuswb mm0, mm7
-	movd [edi], mm0	
-
-	movq mm0, mm2
-	
-	lea edi, [edi +edx  ]
-	lea ebx, [ebx + eax]
-
-	dec ecx
-	jnz near .xloop
-	WELSEMMS
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc, 
-;						int32_t iSrcStride, 
-;						uint8_t *pDst, 
-;						int32_t iDstStride, 
-;						uint8_t *pABCD, 
-;						int32_t iheigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:
-	push esi
-	push edi
-	push ebx
-	
-	mov eax, [esp +12 + 20]
-	movd xmm3, [eax]
-	WELS_Zero xmm7
-	punpcklbw  xmm3, xmm3
-	punpcklwd  xmm3, xmm3
-	
-	movdqa	   xmm4, xmm3
-	punpckldq  xmm3, xmm3
-	punpckhdq  xmm4, xmm4
-	movdqa     xmm5, xmm3
-	movdqa	   xmm6, xmm4
-	
-	punpcklbw  xmm3, xmm7
-	punpckhbw  xmm5, xmm7
-	punpcklbw  xmm4, xmm7
-	punpckhbw  xmm6, xmm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
-	lea ebx, [esi + eax]
-	movq xmm0, [esi]
-	movq xmm1, [esi+1]
-	punpcklbw xmm0, xmm7
-	punpcklbw xmm1, xmm7
-.xloop:
-	
-	pmullw xmm0, xmm3
-	pmullw xmm1, xmm5
-	paddw  xmm0, xmm1
-	
-	movq  xmm1, [ebx]
-	punpcklbw xmm1, xmm7
-	movdqa xmm2, xmm1
-	pmullw xmm1, xmm4
-	paddw xmm0, xmm1
-	
-	movq xmm1, [ebx+1]
-	punpcklbw xmm1, xmm7
-	movdqa xmm7, xmm1
-	pmullw xmm1, xmm6
-	paddw xmm0, xmm1
-	movdqa xmm1,xmm7
-
-	paddw xmm0, [h264_d0x20_sse2]
-	psrlw xmm0, 6
-	
-	WELS_Zero xmm7
-	packuswb xmm0, xmm7
-	movq [edi], xmm0	
-
-	movdqa xmm0, xmm2
-	
-	lea edi, [edi +edx  ]
-	lea ebx, [ebx + eax]
-
-	dec ecx
-	jnz near .xloop
-	
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-
-
-ALIGN 16
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-;						 int32_t iSrcStride, 
-;                        uint8_t *pDst,  
-;                        int32_t iDstStride,
-;                        uint8_t *pABCD,
-;					     int32_t iHeigh);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:
-	push ebx
-	push esi
-	push edi
-		
-	mov eax, [esp + 12 + 20]
-
-    pxor      xmm7, xmm7
-    movd   xmm5, [eax]   
-    punpcklwd xmm5, xmm5  
-    punpckldq xmm5, xmm5 
-    movdqa    xmm6, xmm5
-    punpcklqdq xmm5, xmm5
-    punpckhqdq xmm6, xmm6    
-    
-	mov eax, [esp + 12 + 4]   
-	mov edx, [esp + 12 + 8]   
-	mov esi, [esp + 12 + 12]  
-	mov edi, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-    
-    sub esi, edi
-    sub esi, edi
-	movdqa xmm7, [h264_d0x20_sse2]
-
-	movdqu xmm0, [eax]
-	movdqa xmm1, xmm0
-	psrldq xmm1, 1
-	punpcklbw xmm0, xmm1
-	
-.hloop_chroma:	
-	lea	esi, [esi+2*edi]
-	
-	movdqu xmm2, [eax+edx]
-	movdqa xmm3, xmm2
-	psrldq xmm3, 1
-	punpcklbw xmm2, xmm3
-	movdqa      xmm4, xmm2
-	
-    pmaddubsw  xmm0, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm0, xmm2
-    paddw      xmm0, xmm7
-	psrlw      xmm0, 6
-    packuswb   xmm0, xmm0
-    movq       [esi],xmm0	
-    
-    lea eax, [eax+2*edx]
-    movdqu xmm2, [eax]
-    movdqa xmm3, xmm2
-    psrldq xmm3, 1
-    punpcklbw xmm2, xmm3
-    movdqa      xmm0, xmm2
-    
-    pmaddubsw  xmm4, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm4, xmm2
-    paddw      xmm4, xmm7
-	psrlw      xmm4, 6
-    packuswb   xmm4, xmm4
-    movq       [esi+edi],xmm4	
-	
-	sub ecx, 2
-	jnz .hloop_chroma
-	pop edi
-	pop esi
-	pop ebx
-
-	ret
-
-
+;*!
+;* \copy
+;*     Copyright (c)  2004-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mc_chroma.asm
+;*
+;*  Abstract
+;*      mmx motion compensation for chroma
+;*
+;*  History
+;*      10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+ALIGN 16
+h264_d0x20_sse2:
+	dw 32,32,32,32,32,32,32,32
+ALIGN 16
+h264_d0x20_mmx:
+	dw 32,32,32,32
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( uint8_t *src,
+;							int32_t iSrcStride,
+;							uint8_t *pDst,
+;							int32_t iDstStride,
+;							uint8_t *pABCD,
+;							int32_t iHeigh );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq4_mmx
+McChromaWidthEq4_mmx:
+	push esi
+	push edi
+	push ebx
+
+	mov eax, [esp +12 + 20]
+	movd mm3, [eax]
+	WELS_Zero mm7
+	punpcklbw mm3, mm3
+	movq      mm4, mm3
+	punpcklwd mm3, mm3
+	punpckhwd mm4, mm4
+
+	movq	  mm5, mm3
+	punpcklbw mm3, mm7
+	punpckhbw mm5, mm7
+
+	movq	  mm6, mm4
+	punpcklbw mm4, mm7
+	punpckhbw mm6, mm7
+
+	mov esi, [esp +12+ 4]
+	mov eax, [esp + 12 + 8]
+	mov edi, [esp + 12 + 12]
+	mov edx, [esp + 12 + 16]
+    mov ecx, [esp + 12 + 24]
+
+	lea ebx, [esi + eax]
+	movd mm0, [esi]
+	movd mm1, [esi+1]
+	punpcklbw mm0, mm7
+	punpcklbw mm1, mm7
+.xloop:
+
+	pmullw mm0, mm3
+	pmullw mm1, mm5
+	paddw  mm0, mm1
+
+	movd  mm1, [ebx]
+	punpcklbw mm1, mm7
+	movq mm2, mm1
+	pmullw mm1, mm4
+	paddw mm0, mm1
+
+	movd mm1, [ebx+1]
+	punpcklbw mm1, mm7
+	movq mm7, mm1
+	pmullw mm1,mm6
+	paddw mm0, mm1
+	movq mm1,mm7
+
+	paddw mm0, [h264_d0x20_mmx]
+	psrlw mm0, 6
+
+	WELS_Zero mm7
+	packuswb mm0, mm7
+	movd [edi], mm0
+
+	movq mm0, mm2
+
+	lea edi, [edi +edx  ]
+	lea ebx, [ebx + eax]
+
+	dec ecx
+	jnz near .xloop
+	WELSEMMS
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( uint8_t *pSrc,
+;						int32_t iSrcStride,
+;						uint8_t *pDst,
+;						int32_t iDstStride,
+;						uint8_t *pABCD,
+;						int32_t iheigh );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq8_sse2
+McChromaWidthEq8_sse2:
+	push esi
+	push edi
+	push ebx
+
+	mov eax, [esp +12 + 20]
+	movd xmm3, [eax]
+	WELS_Zero xmm7
+	punpcklbw  xmm3, xmm3
+	punpcklwd  xmm3, xmm3
+
+	movdqa	   xmm4, xmm3
+	punpckldq  xmm3, xmm3
+	punpckhdq  xmm4, xmm4
+	movdqa     xmm5, xmm3
+	movdqa	   xmm6, xmm4
+
+	punpcklbw  xmm3, xmm7
+	punpckhbw  xmm5, xmm7
+	punpcklbw  xmm4, xmm7
+	punpckhbw  xmm6, xmm7
+
+	mov esi, [esp +12+ 4]
+	mov eax, [esp + 12 + 8]
+	mov edi, [esp + 12 + 12]
+	mov edx, [esp + 12 + 16]
+    mov ecx, [esp + 12 + 24]
+
+	lea ebx, [esi + eax]
+	movq xmm0, [esi]
+	movq xmm1, [esi+1]
+	punpcklbw xmm0, xmm7
+	punpcklbw xmm1, xmm7
+.xloop:
+
+	pmullw xmm0, xmm3
+	pmullw xmm1, xmm5
+	paddw  xmm0, xmm1
+
+	movq  xmm1, [ebx]
+	punpcklbw xmm1, xmm7
+	movdqa xmm2, xmm1
+	pmullw xmm1, xmm4
+	paddw xmm0, xmm1
+
+	movq xmm1, [ebx+1]
+	punpcklbw xmm1, xmm7
+	movdqa xmm7, xmm1
+	pmullw xmm1, xmm6
+	paddw xmm0, xmm1
+	movdqa xmm1,xmm7
+
+	paddw xmm0, [h264_d0x20_sse2]
+	psrlw xmm0, 6
+
+	WELS_Zero xmm7
+	packuswb xmm0, xmm7
+	movq [edi], xmm0
+
+	movdqa xmm0, xmm2
+
+	lea edi, [edi +edx  ]
+	lea ebx, [ebx + eax]
+
+	dec ecx
+	jnz near .xloop
+
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+
+
+ALIGN 16
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
+;						 int32_t iSrcStride,
+;                        uint8_t *pDst,
+;                        int32_t iDstStride,
+;                        uint8_t *pABCD,
+;					     int32_t iHeigh);
+;***********************************************************************
+WELS_EXTERN McChromaWidthEq8_ssse3
+McChromaWidthEq8_ssse3:
+	push ebx
+	push esi
+	push edi
+
+	mov eax, [esp + 12 + 20]
+
+    pxor      xmm7, xmm7
+    movd   xmm5, [eax]
+    punpcklwd xmm5, xmm5
+    punpckldq xmm5, xmm5
+    movdqa    xmm6, xmm5
+    punpcklqdq xmm5, xmm5
+    punpckhqdq xmm6, xmm6
+
+	mov eax, [esp + 12 + 4]
+	mov edx, [esp + 12 + 8]
+	mov esi, [esp + 12 + 12]
+	mov edi, [esp + 12 + 16]
+    mov ecx, [esp + 12 + 24]
+
+    sub esi, edi
+    sub esi, edi
+	movdqa xmm7, [h264_d0x20_sse2]
+
+	movdqu xmm0, [eax]
+	movdqa xmm1, xmm0
+	psrldq xmm1, 1
+	punpcklbw xmm0, xmm1
+
+.hloop_chroma:
+	lea	esi, [esi+2*edi]
+
+	movdqu xmm2, [eax+edx]
+	movdqa xmm3, xmm2
+	psrldq xmm3, 1
+	punpcklbw xmm2, xmm3
+	movdqa      xmm4, xmm2
+
+    pmaddubsw  xmm0, xmm5
+    pmaddubsw  xmm2, xmm6
+    paddw      xmm0, xmm2
+    paddw      xmm0, xmm7
+	psrlw      xmm0, 6
+    packuswb   xmm0, xmm0
+    movq       [esi],xmm0
+
+    lea eax, [eax+2*edx]
+    movdqu xmm2, [eax]
+    movdqa xmm3, xmm2
+    psrldq xmm3, 1
+    punpcklbw xmm2, xmm3
+    movdqa      xmm0, xmm2
+
+    pmaddubsw  xmm4, xmm5
+    pmaddubsw  xmm2, xmm6
+    paddw      xmm4, xmm2
+    paddw      xmm4, xmm7
+	psrlw      xmm4, 6
+    packuswb   xmm4, xmm4
+    movq       [esi+edi],xmm4
+
+	sub ecx, 2
+	jnz .hloop_chroma
+	pop edi
+	pop esi
+	pop ebx
+
+	ret
+
+
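
All three chroma MC routines implement the same 2x2 bilinear filter: the four byte weights come from pABCD and sum to 64, so the result is normalised by (value + 32) >> 6. A hedged C model of the per-sample arithmetic (illustrative names):

    #include <stdint.h>

    static void mc_chroma(uint8_t *dst, int dst_stride,
                          const uint8_t *src, int src_stride,
                          const uint8_t w[4] /* A B C D, A+B+C+D == 64 */,
                          int width, int height) {
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++) {
                int v = w[0] * src[x]              + w[1] * src[x + 1]
                      + w[2] * src[src_stride + x] + w[3] * src[src_stride + x + 1];
                dst[x] = (uint8_t)((v + 32) >> 6);
            }
            src += src_stride;
            dst += dst_stride;
        }
    }
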
--- a/codec/decoder/core/asm/mc_luma.asm
+++ b/codec/decoder/core/asm/mc_luma.asm
@@ -69,16 +69,16 @@
 
 ALIGN 16
 ;*******************************************************************************
-; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc, 
-;                       int iSrcStride, 
-;						uint8_t *pDst, 
-;						int iDstStride, 
+; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc,
+;                       int iSrcStride,
+;						uint8_t *pDst,
+;						int iDstStride,
 ;						int iHeight)
 ;*******************************************************************************
 McHorVer20WidthEq4_mmx:
 	push esi
 	push edi
-	
+
 	mov  esi, [esp+12]
 	mov eax, [esp+16]
 	mov edi, [esp+20]
@@ -100,7 +100,7 @@
 	punpcklbw mm4, mm7
 	movd mm5, [esi+3]
 	punpcklbw mm5, mm7
-	
+
 	paddw mm2, mm3
 	paddw mm4, mm5
 	psllw mm4, 2
@@ -113,12 +113,12 @@
 	psraw mm0, 5
 	packuswb mm0, mm7
 	movd [edi], mm0
-	
+
 	add esi, eax
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	
+
 	WELSEMMS
 	pop edi
 	pop esi
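
The shift-and-subtract sequence above evaluates the standard H.264 six-tap (1, -5, 20, 20, -5, 1): 4*(x2+x3) - (x1+x4) is formed once, added, shifted left by 2 and added again, which contributes 20*(x2+x3) - 5*(x1+x4). A hedged C model of one horizontal half-pel block (illustrative names):

    #include <stdint.h>

    static uint8_t clip255(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

    static void mc_hor_halfpel(uint8_t *dst, int dst_stride,
                               const uint8_t *src, int src_stride,
                               int width, int height) {
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++) {
                const uint8_t *p = &src[x - 2];
                int v = (p[0] + p[5]) - 5 * (p[1] + p[4]) + 20 * (p[2] + p[3]);
                dst[x] = clip255((v + 16) >> 5);
            }
            src += src_stride;
            dst += dst_stride;
        }
    }
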
@@ -181,8 +181,8 @@
 
 ALIGN 16
 ;***********************************************************************
-; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc, 
-;                       int16_t iSrcStride, 
+; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc,
+;                       int16_t iSrcStride,
 ;						uint8_t *pDst,
 ;						int32_t iDstStride
 ;						int32_t iHeight
@@ -197,11 +197,11 @@
 	mov edi, [esp+24]		;pDst
 	mov edx, [esp+28]	;iDstStride
 	mov ebx, [esp+32]	;iHeight
-	pxor xmm7, xmm7	
-	
+	pxor xmm7, xmm7
+
 	sub esi, eax				;;;;;;;; the 6-tap filter needs 5 extra rows
 	sub esi, eax
-		
+
 .yloop_width_8:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
@@ -215,7 +215,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -225,7 +225,7 @@
 	psllw xmm4, 2
 	paddw xmm0, xmm4
 	movdqa [edi], xmm0
-		
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@@ -238,8 +238,8 @@
 ALIGN 16
 ;***********************************************************************
 ;void_t McHorVer22VerLast_sse2(
-;											uint8_t *pSrc, 
-;											int32_t pSrcStride, 
+;											uint8_t *pSrc,
+;											int32_t pSrcStride,
 ;											uint8_t * pDst,
 ;											int32_t iDstStride,
 ;											int32_t iWidth,
@@ -250,17 +250,17 @@
 	paddw  %1, %6
 	movdqa %7, %2
 	movdqa %8, %3
-	
-	
+
+
 	paddw %7, %5
 	paddw %8, %4
-	
-	psubw  %1, %7   
-	psraw   %1, 2	  
-	paddw  %1, %8   
-	psubw  %1, %7 
-	psraw   %1, 2	
-	paddw  %8, %1   
+
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %1, %8
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %8, %1
 	paddw  %8, [h264_mc_hc_32]
 	psraw   %8, 6
 	packuswb %8, %8
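
FILTER_VER refactors the same six-tap with intermediate arithmetic shifts so the running values stay within 16 bits; combined with the un-normalised horizontal pass above it realises the centre half-pel position. A hedged C model of one output sample using the reference formulation (assumes the usual value range, on which the refactored and reference forms agree):

    #include <stdint.h>

    /* mid: 16-bit horizontally 6-tap-filtered rows, no shift applied yet */
    static uint8_t mc_center_sample(const int16_t *mid, int mid_stride) {
        const int16_t *m = mid - 2 * mid_stride;
        int v = (m[0 * mid_stride] + m[5 * mid_stride])
              - 5 * (m[1 * mid_stride] + m[4 * mid_stride])
              + 20 * (m[2 * mid_stride] + m[3 * mid_stride]);
        v = (v + 512) >> 10;                 /* combined rounding of both passes */
        return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
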
@@ -272,15 +272,15 @@
 	push edi
 	push ebx
 	push ebp
-	
+
 	mov esi, [esp+20]
 	mov eax, [esp+24]
 	mov edi, [esp+28]
 	mov edx, [esp+32]
 	mov ebx, [esp+36]
-	mov ecx, [esp+40]	
-	shr ebx, 3	
-	
+	mov ecx, [esp+40]
+	shr ebx, 3
+
 .width_loop:
 	movdqa xmm0, [esi]
 	movdqa xmm1, [esi+eax]
@@ -290,12 +290,12 @@
 	lea esi, [esi+2*eax]
 	movdqa xmm4, [esi]
 	movdqa xmm5, [esi+eax]
-	
+
 	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	lea esi, [esi+2*eax]
 	movdqa xmm6, [esi]
-	
+
 	movdqa xmm0, xmm1
 	movdqa xmm1, xmm2
 	movdqa xmm2, xmm3
@@ -302,61 +302,61 @@
 	movdqa xmm3, xmm4
 	movdqa xmm4, xmm5
 	movdqa xmm5, xmm6
-	
+
 	add edi, edx
-	sub esi, eax		
-	
+	sub esi, eax
+
 .start:
 	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm6, [esi]
 	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm7, [esi+eax]
 	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm0, [esi]
 	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm1, [esi+eax]
 	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm2, [esi]
 	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm3, [esi+eax]
 	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm4, [esi]
 	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm5, [esi+eax]
 	jmp near .start
-	
+
 .x_loop_dec:
 	dec ebx
 	jz near .exit
@@ -366,9 +366,9 @@
 	add esi, 16
 	add edi, 8
 	jmp .width_loop
-	
-	
-	
+
+
+
 .exit:
 	pop ebp
 	pop ebx
@@ -379,10 +379,10 @@
 
 ALIGN 16
 ;*******************************************************************************
-; void_t McHorVer20WidthEq8_sse2(  uint8_t *pSrc, 
-;                       int iSrcStride, 
-;												uint8_t *pDst, 
-;												int iDstStride, 
+; void_t McHorVer20WidthEq8_sse2(  uint8_t *pSrc,
+;                       int iSrcStride,
+;												uint8_t *pDst,
+;												int iDstStride,
 ;												int iHeight,
 ;                      );
 ;*******************************************************************************
@@ -389,18 +389,18 @@
 McHorVer20WidthEq8_sse2:
 	push	esi
 	push	edi
-	
+
 	mov esi, [esp + 12]         ;pSrc
 	mov eax, [esp + 16]         ;iSrcStride
 	mov edi, [esp + 20]         ;pDst
 	mov ecx, [esp + 28]         ;iHeight
 	mov edx, [esp + 24]			;iDstStride
-	
+
 	lea esi, [esi-2]            ;pSrc -= 2;
-	
+
 	pxor xmm7, xmm7
 	movdqa xmm6, [h264_w0x10_1]
-.y_loop:	
+.y_loop:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@@ -413,7 +413,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -424,7 +424,7 @@
 	paddw xmm0, xmm4
 	paddw xmm0, xmm6
 	psraw xmm0, 5
-	
+
 	packuswb xmm0, xmm7
 	movq [edi], xmm0
 
@@ -432,17 +432,17 @@
 	lea esi, [esi+eax]
 	dec ecx
 	jnz near .y_loop
-	
+
 	pop edi
 	pop esi
 	ret
-	
+
 ALIGN 16
 ;*******************************************************************************
-; void_t McHorVer20WidthEq16_sse2(  uint8_t *pSrc, 
-;                       int iSrcStride, 
-;												uint8_t *pDst, 
-;												int iDstStride, 
+; void_t McHorVer20WidthEq16_sse2(  uint8_t *pSrc,
+;                       int iSrcStride,
+;												uint8_t *pDst,
+;												int iDstStride,
 ;												int iHeight,
 ;                      );
 ;*******************************************************************************
@@ -449,20 +449,20 @@
 McHorVer20WidthEq16_sse2:
 	push	esi
 	push	edi
-	
 
+
 	mov esi, [esp + 12]         ;pSrc
 	mov eax, [esp + 16]         ;iSrcStride
 	mov edi, [esp + 20]         ;pDst
 	mov ecx, [esp + 28]         ;iHeight
 	mov edx, [esp + 24]			;iDstStride
-	
+
 	lea esi, [esi-2]            ;pSrc -= 2;
-	
+
 	pxor xmm7, xmm7
 	movdqa xmm6, [h264_w0x10_1]
 .y_loop:
-	
+
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@@ -475,7 +475,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -501,7 +501,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3+8]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -514,9 +514,9 @@
 	psraw xmm0, 5
 	packuswb xmm0, xmm7
 	movq [edi+8], xmm0
-	
-	lea edi, [edi+edx]	
-	lea esi, [esi+eax]	
+
+	lea edi, [edi+edx]
+	lea esi, [esi+eax]
 	dec ecx
 	jnz near .y_loop
 	pop edi
@@ -525,10 +525,10 @@
 
 
 ;*******************************************************************************
-; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc, 
-;                       int iSrcStride, 
-;                       uint8_t *pDst, 
-;                       int iDstStride, 
+; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc,
+;                       int iSrcStride,
+;                       uint8_t *pDst,
+;                       int iDstStride,
 ;                       int iHeight )
 ;*******************************************************************************
 ALIGN 16
@@ -535,7 +535,7 @@
 McHorVer02WidthEq8_sse2:
 	push esi
 	push edi
-	
+
 	mov esi, [esp + 12]           ;pSrc
 	mov edx, [esp + 16]	          ;iSrcStride
 	mov edi, [esp + 20]           ;pDst
@@ -546,7 +546,7 @@
 	sub esi, edx
 
 	WELS_Zero xmm7
-			
+
 	SSE_LOAD_8P xmm0, xmm7, [esi]
 	SSE_LOAD_8P xmm1, xmm7, [esi+edx]
 	lea esi, [esi+2*edx]
@@ -555,8 +555,8 @@
 	lea esi, [esi+2*edx]
 	SSE_LOAD_8P xmm4, xmm7, [esi]
 	SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-	
-.start:	
+
+.start:
 	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .xx_exit
@@ -566,7 +566,7 @@
 	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
 	dec ecx
 	jz near .xx_exit
-	
+
 	lea edi, [edi+2*eax]
 	SSE_LOAD_8P xmm7, xmm0, [esi+edx]
 	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
--- a/codec/decoder/core/asm/memzero.asm
+++ b/codec/decoder/core/asm/memzero.asm
@@ -32,8 +32,8 @@
 ;*  memzero.asm
 ;*
 ;*  Abstract
-;*      
 ;*
+;*
 ;*  History
 ;*      9/16/2009 Created
 ;*
@@ -47,8 +47,8 @@
 ; Code
 ;***********************************************************************
 
-SECTION .text			
-		
+SECTION .text
+
 ALIGN 16
 ;***********************************************************************
 ;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
@@ -57,7 +57,7 @@
 WelsPrefetchZero_mmx:
 	mov  eax,[esp+4]
 	prefetchnta [eax]
-	ret 			
+	ret
 
 
 ALIGN 16
@@ -69,7 +69,7 @@
 		mov		eax,	[esp + 4]          ; dst
 		mov		ecx,	[esp + 8]
 		neg		ecx
-			
+
 		pxor	xmm0,		xmm0
 .memzeroa64_sse2_loops:
 		movdqa	[eax],		xmm0
@@ -77,12 +77,12 @@
 		movdqa	[eax+32],	xmm0
 		movdqa	[eax+48],	xmm0
 		add		eax, 0x40
-		
+
 		add ecx, 0x40
 		jnz near .memzeroa64_sse2_loops
-			
-		ret	
 
+		ret
+
 ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -92,7 +92,7 @@
 		mov		eax,	[esp + 4]          ; dst
 		mov		ecx,	[esp + 8]
 		neg		ecx
-			
+
 		pxor	mm0,		mm0
 .memzero64_mmx_loops:
 		movq	[eax],		mm0
@@ -102,16 +102,16 @@
 		movq	[eax+32],	mm0
 		movq	[eax+40],	mm0
 		movq	[eax+48],	mm0
-		movq	[eax+56],	mm0		
+		movq	[eax+56],	mm0
 		add		eax,		0x40
-		
+
 		add ecx, 0x40
 		jnz near .memzero64_mmx_loops
-			
-		WELSEMMS	
-		ret	
-	
-ALIGN 16		
+
+		WELSEMMS
+		ret
+
+ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
 ;***********************************************************************
@@ -119,17 +119,17 @@
 WelsSetMemZeroSize8_mmx:
 		mov		eax,	[esp + 4]		; dst
 		mov		ecx,	[esp + 8]		; size
-		neg		ecx			
+		neg		ecx
 		pxor	mm0,		mm0
-		
+
 .memzero8_mmx_loops:
 		movq	[eax],		mm0
 		add		eax,		0x08
-	
+
 		add		ecx,		0x08
 		jnz near .memzero8_mmx_loops
-		
-		WELSEMMS	
-		ret	
 
-							
+		WELSEMMS
+		ret
+
+
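
The zeroing helpers all use the same idiom: negate the size so a single add drives the loop flag, and clear 64 (or 8) bytes per iteration; the movdqa variant additionally assumes a 16-byte-aligned destination. A hedged C model (illustrative name; size is assumed to be a multiple of the chunk):

    #include <stdint.h>
    #include <string.h>

    static void set_mem_zero64(void *dst, int32_t size) {
        uint8_t *p = (uint8_t *)dst;
        for (int32_t i = 0; i < size; i += 64)   /* one 64-byte chunk per pass */
            memset(p + i, 0, 64);
    }
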
--- a/codec/decoder/plus/res/welsdec.rc
+++ b/codec/decoder/plus/res/welsdec.rc
@@ -27,18 +27,18 @@
 // TEXTINCLUDE
 //
 
-1 TEXTINCLUDE 
+1 TEXTINCLUDE
 BEGIN
     "resource.h\0"
 END
 
-2 TEXTINCLUDE 
+2 TEXTINCLUDE
 BEGIN
     "#include ""windows.h""\r\n"
     "\0"
 END
 
-3 TEXTINCLUDE 
+3 TEXTINCLUDE
 BEGIN
     "\r\n"
     "\0"
--- a/codec/encoder/core/asm/asm_inc.asm
+++ b/codec/encoder/core/asm/asm_inc.asm
@@ -43,7 +43,7 @@
 ; Options, for DEBUG
 ;***********************************************************************
 
-%if 1 
+%if 1
 	%define MOVDQ movdqa
 %else
 	%define MOVDQ movdqu
@@ -58,7 +58,7 @@
 BITS 32
 
 ;***********************************************************************
-; Macros 
+; Macros
 ;***********************************************************************
 
 %macro WELS_EXTERN 1
@@ -74,7 +74,7 @@
 	pxor        %2, %2
     psubw       %2, %1
     pmaxsw      %1, %2
-%endmacro 	
+%endmacro
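
WELS_ABS computes |x| per 16-bit lane as max(x, -x). A hedged scalar model (note that, as with pmaxsw, -32768 maps to itself):

    #include <stdint.h>

    static int16_t wels_abs16(int16_t x) {
        int16_t neg = (int16_t)(0 - x);       /* psubw from zero */
        return x > neg ? x : neg;             /* pmaxsw */
    }
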
 
 %macro MMX_XSwap  4
     movq		%4, %2
@@ -105,7 +105,7 @@
     SSE2_XSawp qdq, %5, %2, %3
 %endmacro
 
-;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4 
+;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4
 %macro SSE2_TransTwo4x4W 5
     SSE2_XSawp wd,  %1, %2, %5
     SSE2_XSawp wd,  %3, %4, %2
@@ -125,26 +125,26 @@
 	movdqa	%6, %9
 	movdqa	%9, %4
 	SSE2_XSawp bw,  %7, %6, %4
-	
-	SSE2_XSawp wd,  %1, %3, %6	
+
+	SSE2_XSawp wd,  %1, %3, %6
 	SSE2_XSawp wd,  %8, %2, %3
 	SSE2_XSawp wd,  %5, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %3	
+	movdqa	%9, %3
 	SSE2_XSawp wd,  %7, %4, %3
-	
-	SSE2_XSawp dq,  %1, %5, %4	
+
+	SSE2_XSawp dq,  %1, %5, %4
 	SSE2_XSawp dq,  %6, %2, %5
 	SSE2_XSawp dq,  %8, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %5		
+	movdqa	%9, %5
 	SSE2_XSawp dq,  %7, %3, %5
-	
+
 	SSE2_XSawp qdq,  %1, %8, %3
 	SSE2_XSawp qdq,  %4, %2, %8
 	SSE2_XSawp qdq,  %6, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %1		
+	movdqa	%9, %1
 	SSE2_XSawp qdq,  %7, %5, %1
 	movdqa	%5, %9
 %endmacro
@@ -170,9 +170,9 @@
 %macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
 	mov %3h, %3l
 	movd %1, e%3x		; i.e, 1% = eax (=b0)
-	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0	
-	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0	
-%endmacro  
+	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
+	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
 
 ;copy a dw into a xmm for 8 times
 %macro  SSE2_Copy8Times 2
--- a/codec/encoder/core/asm/coeff.asm
+++ b/codec/encoder/core/asm/coeff.asm
@@ -318,9 +318,9 @@
 SECTION .text
 
 
-	
+
 ;***********************************************************************
-;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx); 
+;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
 ;***********************************************************************
 WELS_EXTERN CavlcParamCal_sse2
 CavlcParamCal_sse2:
@@ -327,16 +327,16 @@
 	push ebx
 	push edi
 	push esi
-	
+
 	mov			eax,	[esp+16]	;coffLevel
 	mov			edi,	[esp+24]	;Level
 	mov			ebx,	[esp+32]	;endIdx
 	cmp			ebx,	3
-	jne			.Level16	
+	jne			.Level16
 	pxor		xmm1,	xmm1
 	movq		xmm0,	[eax]	; removed QWORD
-	jmp			.Cal_begin		
-.Level16:	
+	jmp			.Cal_begin
+.Level16:
 	movdqa		xmm0,	[eax]
 	movdqa		xmm1,	[eax+16]
 .Cal_begin:
@@ -354,7 +354,7 @@
 	pcmpeqw		xmm7,	xmm7	;generate -1
     mov			ebx,	0xff
     ;pinsrw		xmm6,	ebx,	3
-   
+
     mov       bl,   dh
 
 	lea       ebx,  [byte_1pos_table+8*ebx]
@@ -362,7 +362,7 @@
 	pextrw    ecx,  xmm0, 3
 	shr       ecx,  8
     mov       dh,   cl
- 
+
 .loopHighFind0:
     cmp       ecx,   0
     je        .loopHighFind0End
@@ -372,7 +372,7 @@
     add       esi, 8
     mov       esi, [eax+2*esi]
     mov       [edi], si
-    add       edi,   2 
+    add       edi,   2
     ;add       ebx,   1
     inc		  ebx
     dec       ecx
@@ -403,8 +403,8 @@
 	;and       edx, 0xff
 	movzx	  edx,	byte [ebx]
 	mov       edx, [eax+2*edx]
-	mov       [edi], dx 
-	add       edi,   2 
+	mov       [edi], dx
+	add       edi,   2
 	;add       ebx,   1
 	inc		  ebx
     dec       esi
@@ -436,8 +436,8 @@
     psllq    xmm0, xmm3
     psrlq    xmm0, xmm3
     movdqa   xmm4, xmm1
-    psllq    xmm1, xmm2 
-    psrlq    xmm4, xmm3 
+    psllq    xmm1, xmm2
+    psrlq    xmm4, xmm3
     punpcklqdq xmm1, xmm4
     por      xmm0,  xmm1
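
CavlcParamCal_sse2 vectorises the run/level scan: pcmpeqw builds a nonzero mask and the byte_1pos_table lookups convert each mask byte into coefficient positions. A hedged plain-C statement of the extraction being performed (the SSE2 routine's exact output layout and return convention are simplified here):

    #include <stdint.h>

    static int32_t cavlc_run_level(const int16_t *coeff, uint8_t *run,
                                   int16_t *level, int32_t end_idx) {
        int32_t n = 0, zeros = 0;
        for (int32_t i = end_idx; i >= 0; i--) {
            if (coeff[i] == 0) {
                if (n > 0) zeros++;     /* zeros above the last coeff are ignored */
            } else {
                if (n > 0) run[n - 1] = (uint8_t)zeros;
                level[n++] = coeff[i];  /* levels in reverse scan order */
                zeros = 0;
            }
        }
        if (n > 0) run[n - 1] = (uint8_t)zeros;
        return n;                       /* total nonzero coefficients */
    }
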
 
--- a/codec/encoder/core/asm/cpuid.asm
+++ b/codec/encoder/core/asm/cpuid.asm
@@ -84,12 +84,12 @@
 ;   void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
 ;****************************************************************************************************
 WelsCPUId:
-	push	ebx	
+	push	ebx
 	push	edi
-	
+
 	mov     eax, [esp+12]	; operating index
     cpuid					; cpuid
-	
+
 	; store the returned feature words through the out pointers
 	mov     edi, [esp+16]
     mov     [edi], eax
@@ -100,10 +100,10 @@
     mov     edi, [esp+28]
     mov     [edi], edx
 
-	pop		edi	
+	pop		edi
     pop     ebx
 	ret
-	
+
 WELS_EXTERN WelsCPUSupportAVX
 ; to be called after cpuid with eax=1; takes the returned eax and ecx flags
 ALIGN 16
@@ -139,7 +139,7 @@
 WelsCPUSupportFMA:
 	mov eax, [esp+4]
 	mov ecx, [esp+8]
-	
+
 	; refer to detection of FMA addressed in INTEL AVX manual document
 	and ecx, 018001000H
 	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
@@ -153,7 +153,7 @@
 	mov eax, 1
 	ret
 fma_not_supported:
-	mov eax, 0	
+	mov eax, 0
 	ret
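
The constant 018001000H is the conjunction of three CPUID(1).ECX bits: FMA (bit 12), OSXSAVE (bit 27) and AVX (bit 28); only if all three are set does the code go on to xgetbv to confirm the OS preserves YMM state. A hedged C restatement of the mask test:

    #include <stdint.h>

    static int cpu_maybe_supports_fma(uint32_t cpuid1_ecx) {
        const uint32_t mask = (1u << 12) | (1u << 27) | (1u << 28); /* 0x18001000 */
        return (cpuid1_ecx & mask) == mask;   /* xgetbv check omitted here */
    }
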
 
 WELS_EXTERN WelsEmms
--- a/codec/encoder/core/asm/dct.asm
+++ b/codec/encoder/core/asm/dct.asm
@@ -48,26 +48,26 @@
 
 ;***********************************************************************
 ; Constant
-;***********************************************************************		
-			
+;***********************************************************************
+
 align 16
-SSE2_DeQuant8 dw  10, 13, 10, 13, 13, 16, 13, 16, 
+SSE2_DeQuant8 dw  10, 13, 10, 13, 13, 16, 13, 16,
 			dw	10, 13, 10, 13, 13, 16, 13, 16,
-            dw  11, 14, 11, 14, 14, 18, 14, 18, 
+            dw  11, 14, 11, 14, 14, 18, 14, 18,
 			dw  11, 14, 11, 14, 14, 18, 14, 18,
-			dw  13, 16, 13, 16, 16, 20, 16, 20, 
 			dw  13, 16, 13, 16, 16, 20, 16, 20,
-            dw  14, 18, 14, 18, 18, 23, 18, 23, 
+			dw  13, 16, 13, 16, 16, 20, 16, 20,
+            dw  14, 18, 14, 18, 18, 23, 18, 23,
 			dw  14, 18, 14, 18, 18, 23, 18, 23,
-			dw  16, 20, 16, 20, 20, 25, 20, 25, 
 			dw  16, 20, 16, 20, 20, 25, 20, 25,
-            dw  18, 23, 18, 23, 23, 29, 23, 29, 
+			dw  16, 20, 16, 20, 20, 25, 20, 25,
+            dw  18, 23, 18, 23, 23, 29, 23, 29,
 			dw  18, 23, 18, 23, 23, 29, 23, 29
-			
 
+
 ;***********************************************************************
 ; MMX functions
-;***********************************************************************			
+;***********************************************************************
 
 %macro MMX_LoadDiff4P 5
 	movd        %1, [%3]
@@ -112,7 +112,7 @@
     MMX_SumSub		%4, %1, %6
     MMX_SumSub		%3, %2, %6
     MMX_SumSub		%3, %4, %6
-    MMX_SumSubMul2  %1, %2, %5  
+    MMX_SumSubMul2  %1, %2, %5
 %endmacro
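
The SumSub/SumSubMul2 butterfly that MMX_DCT chains together is the H.264 4x4 forward core transform, run on rows and then on columns after the transpose. A hedged scalar model of the 1-D stage (illustrative name):

    #include <stdint.h>

    static void dct4x4_1d(int16_t d[4]) {
        int16_t s0 = d[0] + d[3], s3 = d[0] - d[3];
        int16_t s1 = d[1] + d[2], s2 = d[1] - d[2];
        d[0] = s0 + s1;
        d[2] = s0 - s1;
        d[1] = (int16_t)(2 * s3 + s2);
        d[3] = (int16_t)(s3 - 2 * s2);
    }
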
 
 %macro MMX_IDCT 6
@@ -145,13 +145,13 @@
     mov     edx, [esp+24]   ; i_pix2
 
     WELS_Zero    mm7
-    
+
     MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, eax, ebx, ecx, edx, mm0, mm7
 
-    MMX_DCT			mm1, mm2, mm3 ,mm4, mm5, mm6           
+    MMX_DCT			mm1, mm2, mm3 ,mm4, mm5, mm6
     MMX_Trans4x4W	mm3, mm1, mm4, mm5, mm2
-    
-    MMX_DCT			mm3, mm5, mm2 ,mm4, mm1, mm6                    
+
+    MMX_DCT			mm3, mm5, mm2 ,mm4, mm1, mm6
     MMX_Trans4x4W	mm2, mm3, mm4, mm1, mm5
 
     mov     eax, [esp+ 8]   ; pDct
@@ -178,15 +178,15 @@
 %define     i_pred      esp+pushsize+16
 %define     pDct        esp+pushsize+20
 
-	mov     eax, [pDct   ] 
+	mov     eax, [pDct   ]
     movq    mm0, [eax+ 0]
     movq    mm1, [eax+ 8]
     movq    mm2, [eax+16]
     movq    mm3, [eax+24]
-    mov     edx, [p_dst ]   
-    mov     ecx, [i_dst ]   
+    mov     edx, [p_dst ]
+    mov     ecx, [i_dst ]
     mov     eax, [p_pred]
-    mov     ebx, [i_pred]     
+    mov     ebx, [i_pred]
 
 	MMX_Trans4x4W		mm0, mm1, mm2, mm3, mm4
 	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
@@ -195,7 +195,7 @@
 
     WELS_Zero			mm7
     WELS_DW32			mm6
-    
+
     MMX_StoreDiff4P		mm3, mm0, mm6, mm7, [edx], [eax]
     MMX_StoreDiff4P		mm4, mm0, mm6, mm7, [edx+ecx], [eax+ebx]
     lea     edx, [edx+2*ecx]
@@ -202,7 +202,7 @@
     lea     eax, [eax+2*ebx]
     MMX_StoreDiff4P		mm1, mm0, mm6, mm7, [edx], [eax]
     MMX_StoreDiff4P		mm2, mm0, mm6, mm7, [edx+ecx], [eax+ebx]
-    
+
 	WELSEMMS
 %undef	pushsize
 %undef  p_dst
@@ -220,17 +220,17 @@
 %macro SSE2_Store4x8p 6
 	SSE2_XSawp qdq, %2, %3, %6
 	SSE2_XSawp qdq, %4, %5, %3
-	MOVDQ    [%1+0x00], %2 
-	MOVDQ    [%1+0x10], %4 
-	MOVDQ    [%1+0x20], %6 
-	MOVDQ    [%1+0x30], %3 
+	MOVDQ    [%1+0x00], %2
+	MOVDQ    [%1+0x10], %4
+	MOVDQ    [%1+0x20], %6
+	MOVDQ    [%1+0x30], %3
 %endmacro
 
 %macro SSE2_Load4x8p 6
 	MOVDQ    %2,	[%1+0x00]
-	MOVDQ    %4,	[%1+0x10]  
-	MOVDQ    %6,	[%1+0x20]  
-	MOVDQ    %3,	[%1+0x30]  
+	MOVDQ    %4,	[%1+0x10]
+	MOVDQ    %6,	[%1+0x20]
+	MOVDQ    %3,	[%1+0x30]
 	SSE2_XSawp qdq, %4, %3, %5
 	SSE2_XSawp qdq, %2, %6, %3
 %endmacro
@@ -271,40 +271,40 @@
 %endmacro
 
 %macro SSE2_Load8DC	6
-	movdqa		%1,		%6		; %1 = dc0 dc1	
+	movdqa		%1,		%6		; %1 = dc0 dc1
 	paddw       %1,		%5
-    psraw       %1,		$6		; (dc + 32) >> 6	
-    
+    psraw       %1,		$6		; (dc + 32) >> 6
+
     movdqa		%2,		%1
     psrldq		%2,		4
  	punpcklwd	%2,		%2
-	punpckldq	%2,		%2		; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3	   
+	punpckldq	%2,		%2		; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
 
     movdqa		%3,		%1
     psrldq		%3,		8
  	punpcklwd	%3,		%3
 	punpckldq	%3,		%3		; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
-	
+
 	movdqa		%4,		%1
     psrldq		%4,		12
  	punpcklwd	%4,		%4
 	punpckldq	%4,		%4		; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
-	    	
+
 	punpcklwd	%1,		%1
-	punpckldq	%1,		%1		; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1	
+	punpckldq	%1,		%1		; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
 %endmacro
 
 %macro SSE2_DCT 6
-    SSE2_SumSub		%6, %3,	%5						
-	SSE2_SumSub		%1, %2, %5																		
-	SSE2_SumSub		%3, %2, %5					
-	SSE2_SumSubMul2		%6, %1, %4               	
+    SSE2_SumSub		%6, %3,	%5
+	SSE2_SumSub		%1, %2, %5
+	SSE2_SumSub		%3, %2, %5
+	SSE2_SumSubMul2		%6, %1, %4
 %endmacro
 
 %macro SSE2_IDCT 7
-    SSE2_SumSub       %7, %2, %6					
-    SSE2_SumSubDiv2     %1, %3, %5, %4              
-    SSE2_SumSub	     %2, %1, %5 
+    SSE2_SumSub       %7, %2, %6
+    SSE2_SumSubDiv2     %1, %3, %5, %4
+    SSE2_SumSub	     %2, %1, %5
     SSE2_SumSub		 %7, %4, %5
 %endmacro
 
@@ -316,12 +316,12 @@
 WelsDctFourT4_sse2:
     push    ebx
     push	esi
-    mov		esi, [esp+12] 
+    mov		esi, [esp+12]
     mov     eax, [esp+16]   ; pix1
     mov     ebx, [esp+20]   ; i_pix1
     mov     ecx, [esp+24]   ; pix2
-    mov     edx, [esp+28]   ; i_pix2    
-    
+    mov     edx, [esp+28]   ; i_pix2
+
     pxor    xmm7, xmm7
 
 	;Load 4x8
@@ -331,33 +331,33 @@
 	lea		ecx, [ecx + 2 * edx]
 	SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [eax], [ecx]
     SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]
-	
+
 	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
 	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1
-	SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2             		
+	SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
 	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
-	
-	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5  
-	
+
+	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
+
 	lea		eax, [eax + 2 * ebx]
 	lea		ecx, [ecx + 2 * edx]
-    
+
 	;Load 4x8
 	SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [eax      ], [ecx    ]
     SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [eax+ebx  ], [ecx+edx]
 	lea		eax, [eax + 2 * ebx]
-	lea		ecx, [ecx + 2 * edx]	
+	lea		ecx, [ecx + 2 * edx]
     SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [eax], [ecx]
     SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]
-	
+
 	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
-	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1		
-    SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2              		
+	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1
+    SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
 	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
-	
+
 	lea		esi, [esi+64]
-	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5 
-	
+	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
+
     pop esi
     pop ebx
     ret
@@ -377,21 +377,21 @@
 %define	pushsize	8
     push		ebx
     push		esi
-    
-    mov			eax,		[rec]   
-    mov			ebx,		[stride]   
-    mov			ecx,		[pred]  
-    mov			edx,		[pred_stride]   
-    mov			esi,		[rs]  
 
+    mov			eax,		[rec]
+    mov			ebx,		[stride]
+    mov			ecx,		[pred]
+    mov			edx,		[pred_stride]
+    mov			esi,		[rs]
+
 	;Load 4x8
-	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5 	
-	
+	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5
+
 	SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
   	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
     SSE2_TransTwo4x4W	xmm1, xmm4, xmm0, xmm2, xmm3
     SSE2_IDCT			xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
-    
+
 	WELS_Zero			xmm7
     WELS_DW32			xmm6
 
@@ -398,41 +398,41 @@
 	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [eax		],	[ecx]
 	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [eax + ebx	],	[ecx + edx]
 	lea		eax, [eax + 2 * ebx]
-	lea		ecx, [ecx + 2 * edx]	
+	lea		ecx, [ecx + 2 * edx]
 	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [eax],			[ecx]
 	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [eax + ebx	],	[ecx + edx]
-   
+
     add		esi, 64
 	lea		eax, [eax + 2 * ebx]
 	lea		ecx, [ecx + 2 * edx]
-   	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5 	
-	
+   	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5
+
 	SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
-	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0           
+	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
     SSE2_TransTwo4x4W   xmm1, xmm4, xmm0, xmm2, xmm3
 	SSE2_IDCT			xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
 
 	WELS_Zero			xmm7
     WELS_DW32			xmm6
-    
+
 	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [eax		],	[ecx]
 	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [eax + ebx	],	[ecx + edx]
 	lea		eax, [eax + 2 * ebx]
-	lea		ecx, [ecx + 2 * edx]	
+	lea		ecx, [ecx + 2 * edx]
 	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [eax],			[ecx]
-	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [eax + ebx],	[ecx + edx] 
+	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [eax + ebx],	[ecx + edx]
 
     pop		esi
     pop		ebx
     ret
-    
+
   %macro SSE2_StoreDiff4x8p 8
    	SSE2_StoreDiff8p    %1, %3, %4, [%5],			[%6]
-	SSE2_StoreDiff8p    %1, %3, %4, [%5 + %7],		[%6 + %8]	
+	SSE2_StoreDiff8p    %1, %3, %4, [%5 + %7],		[%6 + %8]
 	SSE2_StoreDiff8p    %2, %3, %4, [%5 + 8],		[%6 + 8]
-	SSE2_StoreDiff8p    %2, %3, %4, [%5 + %7 + 8],	[%6 + %8 + 8]	
+	SSE2_StoreDiff8p    %2, %3, %4, [%5 + %7 + 8],	[%6 + %8 + 8]
  %endmacro
- 
+
  ;***********************************************************************
 ; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
 ;***********************************************************************
@@ -443,47 +443,47 @@
 WelsIDctRecI16x16Dc_sse2:
     push		esi
     push		edi
-    
+
 	mov			ecx,		[luma_dc]
-    mov			eax,		[rec]	
-    mov			edx,		[stride]	
-    mov			esi,		[pred]	
-    mov			edi,		[pred_stride]	    	
+    mov			eax,		[rec]
+    mov			edx,		[stride]
+    mov			esi,		[pred]
+    mov			edi,		[pred_stride]
 	pxor		xmm7,		xmm7
     WELS_DW32	xmm6
-    
+
 	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [ecx]
 	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
-	
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]	
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi	
-	  
+	lea			esi,		[esi + 2 * edi]
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]	
+	lea			esi,		[esi + 2 * edi]
 	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-	
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]		
+	lea			esi,		[esi + 2 * edi]
 	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-	
-	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [ecx + 16]	
+
+	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [ecx + 16]
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]		
+	lea			esi,		[esi + 2 * edi]
 	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
-	
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]	
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi	
-	  
+	lea			esi,		[esi + 2 * edi]
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]	 
+	lea			esi,		[esi + 2 * edi]
 	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-	
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]		
+	lea			esi,		[esi + 2 * edi]
 	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-		
+
     pop		edi
     pop		esi
     ret
@@ -517,7 +517,7 @@
 	punpckldq	%3,			%4
 	punpcklqdq	%1,			%3
  %endmacro
- 
+
 ;***********************************************************************
 ;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
 ;***********************************************************************
@@ -525,23 +525,23 @@
 WelsHadamardT4Dc_sse2:
 		mov			eax,		[esp + 4]	; luma_dc
 		mov			ecx,		[esp + 8]	; pDct
-		
+
 		SSE2_Load4Col	    xmm1, xmm5, xmm6, xmm0, ecx
 		SSE2_Load4Col	    xmm2, xmm5, xmm6, xmm0, ecx + 0x40
 		SSE2_Load4Col	    xmm3, xmm5, xmm6, xmm0, ecx + 0x100
 		SSE2_Load4Col	    xmm4, xmm5, xmm6, xmm0, ecx + 0x140
-		
+
 		SSE2_SumSubD		xmm1, xmm2, xmm7
 		SSE2_SumSubD		xmm3, xmm4, xmm7
 		SSE2_SumSubD		xmm2, xmm4, xmm7
-		SSE2_SumSubD		xmm1, xmm3, xmm7	
+		SSE2_SumSubD		xmm1, xmm3, xmm7
 
 		SSE2_Trans4x4D		xmm4, xmm2, xmm1, xmm3, xmm5	; pOut: xmm4,xmm3,xmm5,xmm1
-	
+
 		SSE2_SumSubD		xmm4, xmm3, xmm7
 		SSE2_SumSubD		xmm5, xmm1, xmm7
 
-		WELS_DD1 xmm6      
+		WELS_DD1 xmm6
 		SSE2_SumSubDiv2D	xmm3, xmm1, xmm6, xmm0			; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
 		SSE2_SumSubDiv2D	xmm4, xmm5, xmm6, xmm1			; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
         SSE2_Trans4x4D		xmm3, xmm0, xmm1, xmm4, xmm2	; pOut: xmm3,xmm4,xmm2,xmm1
@@ -550,7 +550,7 @@
 		packssdw	xmm2,	xmm1
 		movdqa	[eax+ 0],   xmm3
 		movdqa	[eax+16],   xmm2
-		
-		ret	
+
+		ret
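
WelsHadamardT4Dc_sse2 forward-transforms the 16 luma DC coefficients with a 4x4 Hadamard; the SSE2_SumSubDiv2D comments above show the (a ± b + 1) / 2 rounding applied on the second pass. A scalar sketch, under the assumption that dc[][] has already been gathered from pDct (the 0x40/0x100/0x140 byte offsets above encode the encoder's coefficient layout, which the sketch does not reproduce):

    #include <stdint.h>

    static void HadamardT4DcRef(int16_t *luma_dc, const int16_t dc[4][4]) {
        int32_t t[4][4];
        for (int i = 0; i < 4; i++) {   /* first pass: rows */
            int32_t s0 = dc[i][0] + dc[i][3], d0 = dc[i][0] - dc[i][3];
            int32_t s1 = dc[i][1] + dc[i][2], d1 = dc[i][1] - dc[i][2];
            t[i][0] = s0 + s1;  t[i][2] = s0 - s1;
            t[i][1] = d0 + d1;  t[i][3] = d0 - d1;
        }
        for (int j = 0; j < 4; j++) {   /* second pass: columns, halved with rounding */
            int32_t s0 = t[0][j] + t[3][j], d0 = t[0][j] - t[3][j];
            int32_t s1 = t[1][j] + t[2][j], d1 = t[1][j] - t[2][j];
            luma_dc[0 * 4 + j] = (int16_t)((s0 + s1 + 1) >> 1);
            luma_dc[2 * 4 + j] = (int16_t)((s0 - s1 + 1) >> 1);
            luma_dc[1 * 4 + j] = (int16_t)((d0 + d1 + 1) >> 1);
            luma_dc[3 * 4 + j] = (int16_t)((d0 - d1 + 1) >> 1);
        }
    }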
 
 
--- a/codec/encoder/core/asm/deblock.asm
+++ b/codec/encoder/core/asm/deblock.asm
@@ -1,2113 +1,2113 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  deblock.asm
-;*
-;*  Abstract
-;*      edge loop
-;*
-;*  History
-;*      08/07/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-SECTION .text
-
-;********************************************************************************
-;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;                             int32_t iAlpha, int32_t iBeta)
-;********************************************************************************
-WELS_EXTERN   DeblockChromaEq4V_sse2
-
-ALIGN  16
-DeblockChromaEq4V_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,68h 
-  mov         edx,[ebp+10h]      ;  iStride
-  mov         eax,[ebp+8]        ;  pPixCb
-  mov         ecx,[ebp+0Ch]      ;  pPixCr
-  movq        xmm4,[ecx] 
-  movq        xmm5,[edx+ecx] 
-  push        esi  
-  push        edi  
-  lea         esi,[edx+edx] 
-  mov         edi,eax 
-  sub         edi,esi 
-  movq        xmm1,[edi] 
-  mov         edi,ecx 
-  sub         edi,esi 
-  movq        xmm2,[edi] 
-  punpcklqdq  xmm1,xmm2 
-  mov         esi,eax 
-  sub         esi,edx 
-  movq        xmm2,[esi] 
-  mov         edi,ecx 
-  sub         edi,edx 
-  movq        xmm3,[edi] 
-  punpcklqdq  xmm2,xmm3 
-  movq        xmm3,[eax] 
-  punpcklqdq  xmm3,xmm4 
-  movq        xmm4,[edx+eax] 
-  mov       edx, [ebp + 14h] 
-  punpcklqdq  xmm4,xmm5 
-  movd        xmm5,edx 
-  mov       edx, [ebp + 18h] 
-  pxor        xmm0,xmm0 
-  movdqa      xmm6,xmm5 
-  punpcklwd   xmm6,xmm5 
-  pshufd      xmm5,xmm6,0 
-  movd        xmm6,edx 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      xmm7,xmm1 
-  punpckhbw   xmm1,xmm0 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+40h],xmm1 
-  movdqa      [esp+60h],xmm7 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+10h],xmm7 
-  movdqa      xmm7,xmm3 
-  punpcklbw   xmm7,xmm0 
-  punpckhbw   xmm3,xmm0 
-  movdqa      [esp+50h],xmm7 
-  movdqa      xmm7,xmm4 
-  punpckhbw   xmm4,xmm0 
-  punpckhbw   xmm2,xmm0 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+30h],xmm3 
-  movdqa      xmm3,[esp+10h] 
-  movdqa      xmm1,xmm3 
-  psubw       xmm1,[esp+50h] 
-  pabsw       xmm1,xmm1 
-  movdqa      [esp+20h],xmm4 
-  movdqa      xmm0,xmm5 
-  pcmpgtw     xmm0,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  pand        xmm0,xmm4 
-  movdqa      xmm1,xmm7 
-  psubw       xmm1,[esp+50h] 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,[esp+30h] 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm5,xmm1 
-  movdqa      xmm1,[esp+40h] 
-  pand        xmm0,xmm4 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  movdqa      xmm1,[esp+20h] 
-  psubw       xmm1,[esp+30h] 
-  pand        xmm5,xmm4 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm6,xmm1 
-  pand        xmm5,xmm6 
-  mov         edx,2 
-  movsx       edx,dx 
-  movd        xmm1,edx 
-  movdqa      xmm4,xmm1 
-  punpcklwd   xmm4,xmm1 
-  pshufd      xmm1,xmm4,0 
-  movdqa      xmm4,[esp+60h] 
-  movdqa      xmm6,xmm4 
-  paddw       xmm6,xmm4 
-  paddw       xmm6,xmm3 
-  paddw       xmm6,xmm7 
-  movdqa      [esp+10h],xmm1 
-  paddw       xmm6,[esp+10h] 
-  psraw       xmm6,2 
-  movdqa      xmm4,xmm0 
-  pandn       xmm4,xmm3 
-  movdqa      xmm3,[esp+40h] 
-  movdqa      xmm1,xmm0 
-  pand        xmm1,xmm6 
-  por         xmm1,xmm4 
-  movdqa      xmm6,xmm3 
-  paddw       xmm6,xmm3 
-  movdqa      xmm3,[esp+10h] 
-  paddw       xmm6,xmm2 
-  paddw       xmm6,[esp+20h] 
-  paddw       xmm6,xmm3 
-  psraw       xmm6,2 
-  movdqa      xmm4,xmm5 
-  pand        xmm4,xmm6 
-  movdqa      xmm6,xmm5 
-  pandn       xmm6,xmm2 
-  por         xmm4,xmm6 
-  packuswb    xmm1,xmm4 
-  movdqa      xmm4,[esp+50h] 
-  movdqa      xmm6,xmm7 
-  paddw       xmm6,xmm7 
-  paddw       xmm6,xmm4 
-  paddw       xmm6,[esp+60h] 
-  paddw       xmm6,xmm3 
-  psraw       xmm6,2 
-  movdqa      xmm2,xmm0 
-  pand        xmm2,xmm6 
-  pandn       xmm0,xmm4 
-  por         xmm2,xmm0 
-  movdqa      xmm0,[esp+20h] 
-  movdqa      xmm6,xmm0 
-  paddw       xmm6,xmm0 
-  movdqa      xmm0,[esp+30h] 
-  paddw       xmm6,xmm0 
-  paddw       xmm6,[esp+40h] 
-  movdqa      xmm4,xmm5 
-  paddw       xmm6,xmm3 
-  movq        [esi],xmm1 
-  psraw       xmm6,2 
-  pand        xmm4,xmm6 
-  pandn       xmm5,xmm0 
-  por         xmm4,xmm5 
-  packuswb    xmm2,xmm4 
-  movq        [eax],xmm2 
-  psrldq      xmm1,8 
-  movq        [edi],xmm1 
-  pop         edi  
-  psrldq      xmm2,8 
-  movq        [ecx],xmm2 
-  pop         esi  
-  mov         esp,ebp 
-  pop         ebp  
-  ret              
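
The routine above is the bS==4 ("strong") chroma filter, run on Cb and Cr side by side in one register; the pcmpgtw chains build the per-sample alpha/beta masks and the pand/pandn/por sequences blend filtered and original samples. Per sample, the arithmetic reduces to the standard H.264 formula (a sketch; names are illustrative):

    #include <stdint.h>
    #include <stdlib.h>   /* abs */

    /* p1, p0 on one side of the edge, q0, q1 on the other (iStride apart) */
    static void ChromaEq4Px(uint8_t *p1, uint8_t *p0, uint8_t *q0, uint8_t *q1,
                            int alpha, int beta) {
        if (abs(*p0 - *q0) < alpha && abs(*p1 - *p0) < beta &&
            abs(*q1 - *q0) < beta) {
            uint8_t np0 = (uint8_t)((2 * *p1 + *p0 + *q1 + 2) >> 2);
            uint8_t nq0 = (uint8_t)((2 * *q1 + *q0 + *p1 + 2) >> 2);
            *p0 = np0;
            *q0 = nq0;
        }
    }

Both weighted averages already fit in 8 bits, which is why the asm can packuswb without an extra clip.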
-
-;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
-;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN  DeblockChromaLt4V_sse2
-
-DeblockChromaLt4V_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,0E4h 
-  push        ebx  
-  push        esi  
-  mov         esi, [ebp+1Ch]      ;  pTC
-  movsx       ebx, byte [esi+2] 
-  push        edi  
-  movsx       di,byte [esi+3] 
-  mov         word [esp+0Ch],bx 
-  movsx       bx,byte  [esi+1] 
-  movsx       esi,byte  [esi] 
-  mov         word  [esp+0Eh],si 
-  movzx       esi,di 
-  movd        xmm1,esi 
-  movzx       esi,di 
-  movd        xmm2,esi 
-  mov         si,word  [esp+0Ch] 
-  mov         edx, [ebp + 10h] 
-  mov         eax, [ebp + 08h] 
-  movzx       edi,si 
-  movzx       esi,si 
-  mov         ecx, [ebp + 0Ch] 
-  movd        xmm4,esi 
-  movzx       esi,bx 
-  movd        xmm5,esi 
-  movd        xmm3,edi 
-  movzx       esi,bx 
-  movd        xmm6,esi 
-  mov         si,word [esp+0Eh] 
-  movzx       edi,si 
-  movzx       esi,si 
-  punpcklwd   xmm6,xmm2 
-  pxor        xmm0,xmm0 
-  movdqa      [esp+40h],xmm0 
-  movd        xmm7,edi 
-  movd        xmm0,esi 
-  lea         esi,[edx+edx] 
-  mov         edi,eax 
-  sub         edi,esi 
-  punpcklwd   xmm5,xmm1 
-  movdqa      xmm1,[esp+40h] 
-  punpcklwd   xmm0,xmm4 
-  movq        xmm4,[edx+ecx] 
-  punpcklwd   xmm7,xmm3 
-  movq        xmm3,[eax] 
-  punpcklwd   xmm0,xmm6 
-  movq        xmm6,[edi] 
-  punpcklwd   xmm7,xmm5 
-  punpcklwd   xmm0,xmm7 
-  mov         edi,ecx 
-  sub         edi,esi 
-  movdqa      xmm2,xmm1 
-  psubw       xmm2,xmm0 
-  movdqa      [esp+60h],xmm2 
-  movq        xmm2, [edi] 
-  punpcklqdq  xmm6,xmm2 
-  mov         esi,eax 
-  sub         esi,edx 
-  movq        xmm7,[esi] 
-  mov         edi,ecx 
-  sub         edi,edx 
-  movq        xmm2,[edi] 
-  punpcklqdq  xmm7,xmm2 
-  movq        xmm2,[ecx] 
-  punpcklqdq  xmm3,xmm2 
-  movq        xmm2,[edx+eax] 
-  movsx       edx,word [ebp + 14h] 
-  punpcklqdq  xmm2,xmm4 
-  movdqa      [esp+0E0h],xmm2 
-  movd        xmm2,edx 
-  movsx       edx,word [ebp + 18h] 
-  movdqa      xmm4,xmm2 
-  punpcklwd   xmm4,xmm2 
-  movd        xmm2,edx 
-  movdqa      xmm5,xmm2 
-  punpcklwd   xmm5,xmm2 
-  pshufd      xmm2,xmm5,0 
-  movdqa      [esp+50h],xmm2 
-  movdqa      xmm2,xmm6 
-  punpcklbw   xmm2,xmm1 
-  movdqa      [esp+0D0h],xmm3 
-  pshufd      xmm4,xmm4,0 
-  movdqa      [esp+30h],xmm2 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+80h],xmm6 
-  movdqa      xmm6,[esp+0D0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+70h],xmm6 
-  movdqa      xmm6, [esp+0E0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa     [esp+90h],xmm6 
-  movdqa      xmm5, [esp+0E0h] 
-  movdqa      xmm2,xmm7 
-  punpckhbw   xmm7,xmm1 
-  punpcklbw   xmm5,xmm1 
-  movdqa       [esp+0A0h],xmm7 
-  punpcklbw   xmm3,xmm1 
-  mov         edx,4 
-  punpcklbw   xmm2,xmm1 
-  movsx       edx,dx 
-  movd        xmm6,edx 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      xmm7,[esp+30h] 
-  movdqa      [esp+20h],xmm6 
-  psubw       xmm7,xmm5 
-  movdqa      xmm6,xmm0 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  movdqa      [esp+40h],xmm6 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm2 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm7 
-  paddw       xmm6, [esp+20h] 
-  movdqa      xmm7, [esp+50h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm1,xmm6 
-  movdqa      [esp+10h],xmm0 
-  movdqa      xmm6, [esp+10h] 
-  pminsw      xmm6,xmm1 
-  movdqa      [esp+10h],xmm6 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm6,xmm4 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1, [esp+30h] 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm7,xmm1 
-  movdqa      xmm1,[esp+50h] 
-  pand        xmm6,xmm7 
-  movdqa      xmm7,[esp+50h] 
-  psubw       xmm5,xmm3 
-  pabsw       xmm5,xmm5 
-  pcmpgtw     xmm1,xmm5 
-  movdqa      xmm5,[esp+80h] 
-  psubw       xmm5,[esp+90h] 
-  pand        xmm6,xmm1 
-  pand        xmm6,[esp+40h] 
-  movdqa      xmm1,[esp+10h] 
-  pand        xmm1,xmm6 
-  movdqa      xmm6,[esp+70h] 
-  movdqa      [esp+30h],xmm1 
-  movdqa      xmm1,[esp+0A0h] 
-  psubw       xmm6,xmm1 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm5 
-  paddw       xmm6,[esp+20h] 
-  movdqa      xmm5,[esp+60h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm5,xmm6 
-  pminsw      xmm0,xmm5 
-  movdqa      xmm5,[esp+70h] 
-  movdqa      xmm6,xmm1 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm4,xmm6 
-  movdqa      xmm6,[esp+80h] 
-  psubw       xmm6,xmm1 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+90h] 
-  pand        xmm4,xmm7 
-  movdqa      xmm7,[esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  pand        xmm4,xmm7 
-  pand        xmm4,[esp+40h] 
-  pand        xmm0,xmm4 
-  movdqa      xmm4,[esp+30h] 
-  paddw       xmm2,xmm4 
-  paddw       xmm1,xmm0 
-  packuswb    xmm2,xmm1 
-  movq        [esi],xmm2 
-  psubw       xmm3,xmm4 
-  psubw       xmm5,xmm0 
-  packuswb    xmm3,xmm5 
-  movq        [eax],xmm3 
-  psrldq      xmm2,8 
-  movq        [edi],xmm2 
-  pop         edi  
-  pop         esi  
-  psrldq      xmm3,8 
-  movq        [ecx],xmm3 
-  pop         ebx  
-  mov         esp,ebp 
-  pop         ebp  
-  ret    
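
DeblockChromaLt4V_sse2 is the bS<4 chroma path: the psllw 2 / paddw 4 / psraw 3 sequence above is the familiar delta term, clamped to [-tc, tc] by pmaxsw/pminsw. A scalar sketch (whether the chroma +1 adjustment of tc0 is already folded into pTC is an assumption left to the caller here):

    #include <stdint.h>
    #include <stdlib.h>

    static int Clip3(int lo, int hi, int v) { return v < lo ? lo : (v > hi ? hi : v); }
    static uint8_t ClipU8(int v) { return (uint8_t)Clip3(0, 255, v); }

    static void ChromaLt4Px(uint8_t *p1, uint8_t *p0, uint8_t *q0, uint8_t *q1,
                            int alpha, int beta, int tc) {
        if (tc < 0) return;   /* negative tc marks "do not filter" */
        if (abs(*p0 - *q0) < alpha && abs(*p1 - *p0) < beta &&
            abs(*q1 - *q0) < beta) {
            int d = Clip3(-tc, tc, ((*q0 - *p0) * 4 + (*p1 - *q1) + 4) >> 3);
            *p0 = ClipU8(*p0 + d);
            *q0 = ClipU8(*q0 - d);
        }
    }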
-  
-;***************************************************************************
-;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
-;          int32_t iAlpha, int32_t iBeta)
-;***************************************************************************
-
-WELS_EXTERN     DeblockChromaEq4H_sse2
-
-ALIGN  16
-  
-DeblockChromaEq4H_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,0C8h  
-  mov         ecx,dword [ebp+8] 
-  mov         edx,dword [ebp+0Ch] 
-  mov         eax,dword [ebp+10h] 
-  sub         ecx,2 
-  sub         edx,2 
-  push        esi  
-  lea         esi,[eax+eax*2] 
-  mov         dword [esp+18h],ecx 
-  mov         dword [esp+4],edx 
-  lea         ecx,[ecx+eax*4] 
-  lea         edx,[edx+eax*4] 
-  lea         eax,[esp+7Ch] 
-  push        edi  
-  mov         dword [esp+14h],esi 
-  mov         dword [esp+18h],ecx 
-  mov         dword [esp+0Ch],edx 
-  mov         dword [esp+10h],eax 
-  mov         esi,dword [esp+1Ch] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+14h] 
-  movd        xmm0,dword [esi] 
-  movd        xmm1,dword [esi+ecx] 
-  movd        xmm2,dword [esi+ecx*2] 
-  movd        xmm3,dword [esi+edx] 
-  mov         esi,dword  [esp+8] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [esi+ecx] 
-  movd        xmm6,dword [esi+ecx*2] 
-  movd        xmm7,dword [esi+edx] 
-  punpckldq   xmm0,xmm4 
-  punpckldq   xmm1,xmm5 
-  punpckldq   xmm2,xmm6 
-  punpckldq   xmm3,xmm7 
-  mov         esi,dword [esp+18h] 
-  mov         edi,dword [esp+0Ch] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [edi] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm0,xmm4 
-  movd        xmm4,dword [esi+ecx] 
-  movd        xmm5,dword [edi+ecx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm1,xmm4 
-  movd        xmm4,dword [esi+ecx*2] 
-  movd        xmm5,dword [edi+ecx*2] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm2,xmm4 
-  movd        xmm4,dword [esi+edx] 
-  movd        xmm5,dword [edi+edx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm3,xmm4 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         edi,dword [esp+10h] 
-  movdqa      [edi],xmm0 
-  movdqa      [edi+10h],xmm5 
-  movdqa      [edi+20h],xmm1 
-  movdqa      [edi+30h],xmm6 
-  movsx       ecx,word [ebp+14h] 
-  movsx       edx,word [ebp+18h] 
-  movdqa      xmm6,[esp+80h] 
-  movdqa      xmm4,[esp+90h] 
-  movdqa      xmm5,[esp+0A0h] 
-  movdqa      xmm7,[esp+0B0h] 
-  pxor        xmm0,xmm0 
-  movd        xmm1,ecx 
-  movdqa      xmm2,xmm1 
-  punpcklwd   xmm2,xmm1 
-  pshufd      xmm1,xmm2,0 
-  movd        xmm2,edx 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm2,xmm3,0 
-  movdqa      xmm3,xmm6 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+60h],xmm6 
-  movdqa      xmm6,[esp+90h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+30h],xmm6 
-  movdqa      xmm6,[esp+0A0h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+40h],xmm6 
-  movdqa      xmm6,[esp+0B0h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+70h],xmm6 
-  punpcklbw   xmm7,xmm0 
-  punpcklbw   xmm4,xmm0 
-  punpcklbw   xmm5,xmm0 
-  punpcklbw   xmm3,xmm0 
-  movdqa      [esp+50h],xmm7 
-  movdqa      xmm6,xmm4 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  movdqa      xmm0,xmm1 
-  pcmpgtw     xmm0,xmm6 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm4 
-  pabsw       xmm6,xmm6 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pand        xmm0,xmm7 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+30h] 
-  psubw       xmm6,[esp+40h] 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm1,xmm6 
-  movdqa      xmm6,[esp+60h] 
-  psubw       xmm6,[esp+30h] 
-  pabsw       xmm6,xmm6 
-  pand        xmm0,xmm7 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+70h] 
-  psubw       xmm6,[esp+40h] 
-  pabsw       xmm6,xmm6 
-  pand        xmm1,xmm7 
-  pcmpgtw     xmm2,xmm6 
-  pand        xmm1,xmm2 
-  mov         eax,2 
-  movsx       ecx,ax 
-  movd        xmm2,ecx 
-  movdqa      xmm6,xmm2 
-  punpcklwd   xmm6,xmm2 
-  pshufd      xmm2,xmm6,0 
-  movdqa      [esp+20h],xmm2 
-  movdqa      xmm2,xmm3 
-  paddw       xmm2,xmm3 
-  paddw       xmm2,xmm4 
-  paddw       xmm2,[esp+50h] 
-  paddw       xmm2,[esp+20h] 
-  psraw       xmm2,2 
-  movdqa      xmm6,xmm0 
-  pand        xmm6,xmm2 
-  movdqa      xmm2,xmm0 
-  pandn       xmm2,xmm4 
-  por         xmm6,xmm2 
-  movdqa      xmm2,[esp+60h] 
-  movdqa      xmm7,xmm2 
-  paddw       xmm7,xmm2 
-  paddw       xmm7,[esp+30h] 
-  paddw       xmm7,[esp+70h] 
-  paddw       xmm7,[esp+20h] 
-  movdqa      xmm4,xmm1 
-  movdqa      xmm2,xmm1 
-  pandn       xmm2,[esp+30h] 
-  psraw       xmm7,2 
-  pand        xmm4,xmm7 
-  por         xmm4,xmm2 
-  movdqa      xmm2,[esp+50h] 
-  packuswb    xmm6,xmm4 
-  movdqa      [esp+90h],xmm6 
-  movdqa      xmm6,xmm2 
-  paddw       xmm6,xmm2 
-  movdqa      xmm2,[esp+20h] 
-  paddw       xmm6,xmm5 
-  paddw       xmm6,xmm3 
-  movdqa      xmm4,xmm0 
-  pandn       xmm0,xmm5 
-  paddw       xmm6,xmm2 
-  psraw       xmm6,2 
-  pand        xmm4,xmm6 
-  por         xmm4,xmm0 
-  movdqa      xmm0,[esp+70h] 
-  movdqa      xmm5,xmm0 
-  paddw       xmm5,xmm0 
-  movdqa      xmm0,[esp+40h] 
-  paddw       xmm5,xmm0 
-  paddw       xmm5,[esp+60h] 
-  movdqa      xmm3,xmm1 
-  paddw       xmm5,xmm2 
-  psraw       xmm5,2 
-  pand        xmm3,xmm5 
-  pandn       xmm1,xmm0 
-  por         xmm3,xmm1 
-  packuswb    xmm4,xmm3 
-  movdqa      [esp+0A0h],xmm4 
-  mov         esi,dword [esp+10h] 
-  movdqa      xmm0,[esi] 
-  movdqa      xmm1,[esi+10h] 
-  movdqa      xmm2,[esi+20h] 
-  movdqa      xmm3,[esi+30h] 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         esi,dword [esp+1Ch] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+14h] 
-  mov         edi,dword [esp+8] 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         esi,dword [esp+18h] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         edi,dword [esp+0Ch] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  pop         edi  
-  pop         esi  
-  mov         esp,ebp 
-  pop         ebp  
-  ret              
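
The H variant filters a vertical edge, so the four samples p1 p0 | q0 q1 of each row are consecutive bytes; the long movd/punpck prologue and epilogue above exist only to transpose rows into columns, run the same arithmetic as the V case, and transpose back. A scalar sketch of the equivalent row-wise loop (illustrative names; pix points at q0 of the first row):

    #include <stdint.h>
    #include <stdlib.h>

    static void ChromaEq4HRef(uint8_t *pix, int32_t stride, int rows,
                              int alpha, int beta) {
        for (int y = 0; y < rows; y++) {
            uint8_t *r = pix + y * stride;   /* r[-2..1] straddle the edge */
            int p1 = r[-2], p0 = r[-1], q0 = r[0], q1 = r[1];
            if (abs(p0 - q0) < alpha && abs(p1 - p0) < beta &&
                abs(q1 - q0) < beta) {
                r[-1] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2);
                r[0]  = (uint8_t)((2 * q1 + q0 + p1 + 2) >> 2);
            }
        }
    }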
-  
-;*******************************************************************************
-;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
-;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-  
-WELS_EXTERN  DeblockChromaLt4H_sse2
-  
-ALIGN  16
-
-DeblockChromaLt4H_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,108h   
-  mov         ecx,dword [ebp+8] 
-  mov         edx,dword [ebp+0Ch] 
-  mov         eax,dword [ebp+10h] 
-  sub         ecx,2 
-  sub         edx,2 
-  push        esi  
-  lea         esi,[eax+eax*2] 
-  mov         dword [esp+10h],ecx 
-  mov         dword [esp+4],edx 
-  lea         ecx,[ecx+eax*4] 
-  lea         edx,[edx+eax*4] 
-  lea         eax,[esp+6Ch] 
-  push        edi  
-  mov         dword [esp+0Ch],esi 
-  mov         dword [esp+18h],ecx 
-  mov         dword [esp+10h],edx 
-  mov         dword [esp+1Ch],eax 
-  mov         esi,dword [esp+14h] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+0Ch] 
-  movd        xmm0,dword [esi] 
-  movd        xmm1,dword [esi+ecx] 
-  movd        xmm2,dword [esi+ecx*2] 
-  movd        xmm3,dword [esi+edx] 
-  mov         esi,dword [esp+8] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [esi+ecx] 
-  movd        xmm6,dword [esi+ecx*2] 
-  movd        xmm7,dword [esi+edx] 
-  punpckldq   xmm0,xmm4 
-  punpckldq   xmm1,xmm5 
-  punpckldq   xmm2,xmm6 
-  punpckldq   xmm3,xmm7 
-  mov         esi,dword [esp+18h] 
-  mov         edi,dword [esp+10h] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [edi] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm0,xmm4 
-  movd        xmm4,dword [esi+ecx] 
-  movd        xmm5,dword [edi+ecx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm1,xmm4 
-  movd        xmm4,dword [esi+ecx*2] 
-  movd        xmm5,dword [edi+ecx*2] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm2,xmm4 
-  movd        xmm4,dword [esi+edx] 
-  movd        xmm5,dword [edi+edx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm3,xmm4 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         edi,dword [esp+1Ch] 
-  movdqa      [edi],xmm0 
-  movdqa      [edi+10h],xmm5 
-  movdqa      [edi+20h],xmm1 
-  movdqa      [edi+30h],xmm6 
-  mov         eax,dword [ebp+1Ch] 
-  movsx       cx,byte [eax+3] 
-  movsx       dx,byte [eax+2] 
-  movsx       si,byte [eax+1] 
-  movsx       ax,byte [eax] 
-  movzx       edi,cx 
-  movzx       ecx,cx 
-  movd        xmm2,ecx 
-  movzx       ecx,dx 
-  movzx       edx,dx 
-  movd        xmm3,ecx 
-  movd        xmm4,edx 
-  movzx       ecx,si 
-  movzx       edx,si 
-  movd        xmm5,ecx 
-  pxor        xmm0,xmm0 
-  movd        xmm6,edx 
-  movzx       ecx,ax 
-  movdqa      [esp+60h],xmm0 
-  movzx       edx,ax 
-  movsx       eax,word [ebp+14h] 
-  punpcklwd   xmm6,xmm2 
-  movd        xmm1,edi 
-  movd        xmm7,ecx 
-  movsx       ecx,word [ebp+18h] 
-  movd        xmm0,edx 
-  punpcklwd   xmm7,xmm3 
-  punpcklwd   xmm5,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  punpcklwd   xmm7,xmm5 
-  movdqa      xmm5,[esp+0A0h] 
-  punpcklwd   xmm0,xmm4 
-  punpcklwd   xmm0,xmm6 
-  movdqa      xmm6, [esp+70h] 
-  punpcklwd   xmm0,xmm7 
-  movdqa      xmm7,[esp+80h] 
-  movdqa      xmm2,xmm1 
-  psubw       xmm2,xmm0 
-  movdqa      [esp+0D0h],xmm2 
-  movd        xmm2,eax 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm4,xmm3,0 
-  movd        xmm2,ecx 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm2,xmm3,0 
-  movdqa      xmm3, [esp+90h] 
-  movdqa      [esp+50h],xmm2 
-  movdqa      xmm2,xmm6 
-  punpcklbw   xmm2,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+40h],xmm2 
-  movdqa      [esp+0B0h],xmm6 
-  movdqa      xmm6,[esp+90h] 
-  movdqa      xmm2,xmm7 
-  punpckhbw   xmm7,xmm1 
-  punpckhbw   xmm6,xmm1 
-  punpcklbw   xmm2,xmm1 
-  punpcklbw   xmm3,xmm1 
-  punpcklbw   xmm5,xmm1 
-  movdqa      [esp+0F0h],xmm7 
-  movdqa      [esp+0C0h],xmm6 
-  movdqa      xmm6, [esp+0A0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+0E0h],xmm6 
-  mov         edx,4 
-  movsx       eax,dx 
-  movd        xmm6,eax 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      [esp+30h],xmm6 
-  movdqa      xmm7, [esp+40h] 
-  psubw       xmm7,xmm5 
-  movdqa      xmm6,xmm0 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      [esp+60h],xmm6 
-  movdqa      xmm1, [esp+0D0h] 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm2 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm7 
-  paddw       xmm6,[esp+30h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm1,xmm6 
-  movdqa      xmm7,[esp+50h] 
-  movdqa      [esp+20h],xmm0 
-  movdqa      xmm6, [esp+20h] 
-  pminsw      xmm6,xmm1 
-  movdqa      [esp+20h],xmm6 
-  movdqa      xmm6,xmm4 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1, [esp+40h] 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm7,xmm1 
-  movdqa      xmm1, [esp+50h] 
-  pand        xmm6,xmm7 
-  movdqa      xmm7, [esp+50h] 
-  psubw       xmm5,xmm3 
-  pabsw       xmm5,xmm5 
-  pcmpgtw     xmm1,xmm5 
-  movdqa      xmm5, [esp+0B0h] 
-  psubw       xmm5,[esp+0E0h] 
-  pand        xmm6,xmm1 
-  pand        xmm6, [esp+60h] 
-  movdqa      xmm1, [esp+20h] 
-  pand        xmm1,xmm6 
-  movdqa      xmm6, [esp+0C0h] 
-  movdqa      [esp+40h],xmm1 
-  movdqa      xmm1, [esp+0F0h] 
-  psubw       xmm6,xmm1 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm5 
-  paddw       xmm6, [esp+30h] 
-  movdqa      xmm5, [esp+0D0h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm5,xmm6 
-  pminsw      xmm0,xmm5 
-  movdqa      xmm5,[esp+0C0h] 
-  movdqa      xmm6,xmm1 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm4,xmm6 
-  movdqa      xmm6,[esp+0B0h] 
-  psubw       xmm6,xmm1 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6, [esp+0E0h] 
-  pand        xmm4,xmm7 
-  movdqa      xmm7, [esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  pand        xmm4,xmm7 
-  pand        xmm4,[esp+60h] 
-  pand        xmm0,xmm4 
-  movdqa      xmm4, [esp+40h] 
-  paddw       xmm2,xmm4 
-  paddw       xmm1,xmm0 
-  psubw       xmm3,xmm4 
-  psubw       xmm5,xmm0 
-  packuswb    xmm2,xmm1 
-  packuswb    xmm3,xmm5 
-  movdqa      [esp+80h],xmm2 
-  movdqa      [esp+90h],xmm3 
-  mov         esi,dword [esp+1Ch] 
-  movdqa      xmm0, [esi] 
-  movdqa      xmm1, [esi+10h] 
-  movdqa      xmm2, [esi+20h] 
-  movdqa      xmm3, [esi+30h] 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         esi,dword [esp+14h] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+0Ch] 
-  mov         edi,dword [esp+8] 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         esi,dword [esp+18h] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         edi,dword [esp+10h] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6  
-  pop         edi  
-  pop         esi   
-  mov         esp,ebp 
-  pop         ebp  
-  ret     
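
One detail shared by both Lt4 chroma routines: the movsx/movzx/punpcklwd weave at the top expands the four int8_t entries of pTC so that each tc value covers two adjacent chroma samples, and the same expanded vector serves Cb and Cr. A sketch of that expansion (the exact lane order below is a guess; only the two-samples-per-tc grouping is read off the asm):

    #include <stdint.h>

    static void ExpandChromaTc(const int8_t tc[4], int16_t out[8]) {
        for (int i = 0; i < 8; i++)
            out[i] = tc[i >> 1];   /* tc0 tc0 tc1 tc1 tc2 tc2 tc3 tc3 */
    }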
-  
-  
-  
-;*******************************************************************************
-;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
-;                                 int32_t iBeta, int8_t * pTC)
-;*******************************************************************************
-  
-
-WELS_EXTERN  DeblockLumaLt4V_sse2
-  
-ALIGN  16
-
-DeblockLumaLt4V_sse2:
-    push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H
-	sub	esp, 420				; 000001a4H
-	mov	eax, dword [ebp+8]
-	mov	ecx, dword [ebp+12]
-
-	pxor	xmm0, xmm0
-	push	ebx
-	mov	edx, dword [ebp+24]
-	movdqa	[esp+424-384], xmm0
-	push	esi
-
-	lea	esi, [ecx+ecx*2]
-	push	edi
-	mov	edi, eax
-	sub	edi, esi
-	movdqa	xmm0, [edi]
-
-	lea	esi, [ecx+ecx]
-	movdqa	[esp+432-208], xmm0
-	mov	edi, eax
-	sub	edi, esi
-	movdqa	xmm0, [edi]
-	movdqa	[esp+448-208], xmm0
-
-	mov	ebx, eax
-	sub	ebx, ecx
-	movdqa	xmm0, [ebx]
-	movdqa	[esp+464-208], xmm0
-
-	movdqa	xmm0, [eax]
-
-	add	ecx, eax
-	movdqa	[esp+480-208], xmm0
-	movdqa	xmm0, [ecx]
-	mov	dword [esp+432-404], ecx
-
-	movsx	ecx, word [ebp+16]
-	movdqa	[esp+496-208], xmm0
-	movdqa	xmm0, [esi+eax]
-
-	movsx	si, byte [edx]
-	movdqa	[esp+512-208], xmm0
-	movd	xmm0, ecx
-	movsx	ecx, word [ebp+20]
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	pshufd	xmm0, xmm1, 0
-	movdqa	[esp+432-112], xmm0
-	movd	xmm0, ecx
-	movsx	cx, byte [edx+1]
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	mov	dword [esp+432-408], ebx
-	movzx	ebx, cx
-	pshufd	xmm0, xmm1, 0
-	movd	xmm1, ebx
-	movzx	ebx, cx
-	movd	xmm2, ebx
-	movzx	ebx, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, si
-	movd	xmm5, ecx
-	movzx	ecx, si
-	movd	xmm6, ecx
-	movzx	ecx, si
-	movd	xmm7, ecx
-	movzx	ecx, si
-	movdqa	[esp+432-336], xmm0
-	movd	xmm0, ecx
-
-	movsx	cx, byte [edx+3]
-	movsx	dx, byte [edx+2]
-	movd	xmm3, ebx
-	punpcklwd xmm0, xmm4
-	movzx	esi, cx
-	punpcklwd xmm6, xmm2
-	punpcklwd xmm5, xmm1
-	punpcklwd xmm0, xmm6
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	punpcklwd xmm0, xmm7
-	movdqa	[esp+432-400], xmm0
-	movd	xmm0, esi
-	movzx	esi, cx
-	movd	xmm2, esi
-	movzx	esi, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, dx
-	movd	xmm3, esi
-	movd	xmm5, ecx
-	punpcklwd xmm5, xmm0
-
-	movdqa	xmm0, [esp+432-384]
-	movzx	ecx, dx
-	movd	xmm6, ecx
-	movzx	ecx, dx
-	movzx	edx, dx
-	punpcklwd xmm6, xmm2
-	movd	xmm7, ecx
-	movd	xmm1, edx
-
-	movdqa	xmm2, [esp+448-208]
-	punpcklbw xmm2, xmm0
-
-	mov	ecx, 4
-	movsx	edx, cx
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	movdqa	xmm5, [esp+496-208]
-	movdqa	xmm3, [esp+464-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-240], xmm5
-	movdqa	xmm5, [esp+512-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-352], xmm5
-	punpcklwd xmm1, xmm4
-	movdqa	xmm4, [esp+432-208]
-	punpcklwd xmm1, xmm6
-	movdqa	xmm6, [esp+480-208]
-	punpcklwd xmm1, xmm7
-	punpcklbw xmm6, xmm0
-	punpcklbw xmm3, xmm0
-	punpcklbw xmm4, xmm0
-	movdqa	xmm7, xmm3
-	psubw	xmm7, xmm4
-	pabsw	xmm7, xmm7
-	movdqa	[esp+432-272], xmm4
-	movdqa	xmm4, [esp+432-336]
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-352]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5
-	movdqa	xmm5, xmm3
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5
-	movdqa	xmm5, [esp+432-400]
-	psubw	xmm5, [esp+432-288]
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5
-	movdqa	xmm5, xmm6
-	psubw	xmm5, xmm3
-	movdqa	[esp+432-32], xmm6
-	psubw	xmm6, [esp+432-240]
-	movdqa	xmm7, xmm5
-	movdqa	[esp+432-384], xmm5
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6
-
-	pand	xmm5, xmm7
-	movdqa	xmm6, xmm3
-	psubw	xmm6, xmm2
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-400]
-	pand	xmm5, xmm7
-	movdqa	xmm7, xmm6
-	pcmpeqw	xmm6, xmm0
-	pcmpgtw	xmm7, xmm0
-	por	xmm7, xmm6
-	pand	xmm5, xmm7
-	movdqa	[esp+432-320], xmm5
-	movd	xmm5, edx
-	movdqa	xmm6, xmm5
-	punpcklwd xmm6, xmm5
-	pshufd	xmm5, xmm6, 0
-	movdqa	[esp+432-336], xmm5
-	movdqa	xmm5, [esp+432-224]
-	movdqa	[esp+432-368], xmm5
-	movdqa	xmm6, xmm0
-	psubw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-384]
-	psllw	xmm5, 2
-	movdqa	xmm7, xmm2
-	psubw	xmm7, [esp+432-240]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	psraw	xmm7, 3
-	pmaxsw	xmm6, xmm7
-	pminsw	xmm5, xmm6
-
-	pand	xmm5, [esp+432-320]
-	movdqa	xmm6, [esp+432-400]
-	movdqa	[esp+432-64], xmm5
-	movdqa	[esp+432-384], xmm6
-	movdqa	xmm5, xmm0
-	psubw	xmm5, xmm6
-	movdqa	[esp+432-368], xmm5
-	movdqa	xmm6, xmm5
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm2
-	paddw	xmm7, xmm2
-	psubw	xmm5, xmm7
-	psraw	xmm5, 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-384]
-	pminsw	xmm5, xmm6
-
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-288]
-	movdqa	xmm6, [esp+432-240]
-	movdqa	[esp+432-96], xmm5
-	movdqa	xmm5, [esp+432-352]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm6
-	paddw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
-	psubw	xmm5, xmm7
-
-	movdqa	xmm7, [esp+496-208]
-	psraw	xmm5, 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-400]
-	pminsw	xmm5, xmm6
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-256]
-	movdqa	xmm6, [esp+448-208]
-	punpckhbw xmm7, xmm0
-	movdqa	[esp+432-352], xmm7
-
-	movdqa	xmm7, [esp+512-208]
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-48], xmm5
-	movdqa	xmm5, [esp+432-208]
-	movdqa	[esp+432-368], xmm6
-	movdqa	xmm6, [esp+464-208]
-	punpckhbw xmm7, xmm0
-	punpckhbw xmm5, xmm0
-	movdqa	[esp+432-384], xmm7
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-400], xmm6
-
-	movdqa	xmm7, [esp+432-400]
-	movdqa	xmm6, [esp+480-208]
-	psubw	xmm7, xmm5
-	movdqa	[esp+432-16], xmm5
-	pabsw	xmm7, xmm7
-	punpckhbw xmm6, xmm0
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5
-
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-384]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5
-
-	movdqa	xmm5, [esp+432-400]
-	movdqa	[esp+432-80], xmm6
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5
-
-	movdqa	xmm5, xmm1
-	psubw	xmm5, [esp+432-288]
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5
-	movdqa	xmm5, xmm6
-	psubw	xmm5, [esp+432-400]
-	psubw	xmm6, [esp+432-352]
-	movdqa	[esp+432-272], xmm5
-	movdqa	xmm7, xmm5
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7
-	movdqa	xmm7, xmm4
-	pabsw	xmm6, xmm6
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
-
-	pand	xmm5, xmm7
-	movdqa	xmm7, [esp+432-400]
-	psubw	xmm7, xmm6
-	psubw	xmm6, [esp+432-352]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-	pand	xmm5, xmm4
-
-	paddw	xmm2, [esp+432-96]
-	movdqa	xmm4, xmm1
-	pcmpgtw	xmm4, xmm0
-	movdqa	xmm7, xmm1
-	pcmpeqw	xmm7, xmm0
-	por	xmm4, xmm7
-	pand	xmm5, xmm4
-	movdqa	xmm4, [esp+432-224]
-	movdqa	[esp+432-320], xmm5
-	movdqa	xmm5, [esp+432-272]
-	movdqa	xmm7, xmm0
-	psubw	xmm7, xmm4
-	psubw	xmm0, xmm1
-	psllw	xmm5, 2
-	paddw	xmm6, xmm5
-	paddw	xmm6, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	movdqa	[esp+432-336], xmm0
-	psraw	xmm6, 3
-	pmaxsw	xmm7, xmm6
-	pminsw	xmm4, xmm7
-	pand	xmm4, [esp+432-320]
-	movdqa	xmm6, xmm0
-	movdqa	xmm0, [esp+432-16]
-	paddw	xmm0, [esp+432-304]
-	movdqa	[esp+432-272], xmm4
-	movdqa	xmm4, [esp+432-368]
-	paddw	xmm4, xmm4
-	psubw	xmm0, xmm4
-
-	movdqa	xmm4, [esp+432-64]
-	psraw	xmm0, 1
-	pmaxsw	xmm6, xmm0
-	movdqa	xmm0, [esp+432-400]
-	movdqa	xmm7, xmm1
-	pminsw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-320]
-	pand	xmm7, xmm6
-	pand	xmm7, [esp+432-288]
-	paddw	xmm5, xmm7
-	packuswb xmm2, xmm5
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm0, xmm5
-	paddw	xmm3, xmm4
-	packuswb xmm3, xmm0
-
-	movdqa	xmm0, [esp+432-32]
-	psubw	xmm0, xmm4
-	movdqa	xmm4, [esp+432-80]
-	psubw	xmm4, xmm5
-
-	movdqa	xmm5, [esp+432-240]
-	paddw	xmm5, [esp+432-48]
-	packuswb xmm0, xmm4
-	movdqa	xmm4, [esp+432-384]
-	paddw	xmm4, [esp+432-304]
-	movdqa	[esp+480-208], xmm0
-	movdqa	xmm0, [esp+432-352]
-	movdqa	xmm7, xmm0
-	paddw	xmm0, xmm0
-
-	mov	ecx, dword [esp+432-408]
-
-	mov	edx, dword [esp+432-404]
-	psubw	xmm4, xmm0
-	movdqa	xmm0, [esp+432-336]
-	movdqa	[edi], xmm2
-	psraw	xmm4, 1
-	pmaxsw	xmm0, xmm4
-	pminsw	xmm1, xmm0
-	movdqa	xmm0, [esp+480-208]
-
-	pop	edi
-	pand	xmm1, xmm6
-	pand	xmm1, [esp+428-256]
-	movdqa	[ecx], xmm3
-	paddw	xmm7, xmm1
-	pop	esi
-	packuswb xmm5, xmm7
-	movdqa	[eax], xmm0
-	movdqa	[edx], xmm5
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
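
DeblockLumaLt4V_sse2 implements the bS<4 luma filter. Beyond the chroma-style delta on p0/q0, it conditionally adjusts p1 and q1 (the pavgw above is the (p0 + q0 + 1) >> 1 term), and the psubw of 0/-1 compare masks is how tc = tc0 + (ap < beta) + (aq < beta) is accumulated. A scalar sketch (illustrative, not the shipped reference):

    #include <stdint.h>
    #include <stdlib.h>

    static int Clip3(int lo, int hi, int v) { return v < lo ? lo : (v > hi ? hi : v); }
    static uint8_t ClipU8(int v) { return (uint8_t)Clip3(0, 255, v); }

    static void LumaLt4Px(uint8_t *p2, uint8_t *p1, uint8_t *p0,
                          uint8_t *q0, uint8_t *q1, uint8_t *q2,
                          int alpha, int beta, int tc0) {
        if (tc0 < 0) return;
        if (abs(*p0 - *q0) >= alpha || abs(*p1 - *p0) >= beta ||
            abs(*q1 - *q0) >= beta)
            return;
        int ap = abs(*p2 - *p0), aq = abs(*q2 - *q0);
        int tc = tc0 + (ap < beta) + (aq < beta);
        int avg = (*p0 + *q0 + 1) >> 1;                  /* the pavgw above */
        int d = Clip3(-tc, tc, ((*q0 - *p0) * 4 + (*p1 - *q1) + 4) >> 3);
        int np1 = *p1, nq1 = *q1;
        if (ap < beta) np1 = *p1 + Clip3(-tc0, tc0, (*p2 + avg - 2 * *p1) >> 1);
        if (aq < beta) nq1 = *q1 + Clip3(-tc0, tc0, (*q2 + avg - 2 * *q1) >> 1);
        *p0 = ClipU8(*p0 + d);
        *q0 = ClipU8(*q0 - d);
        *p1 = (uint8_t)np1;
        *q1 = (uint8_t)nq1;
    }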
-
-
-;*******************************************************************************
-;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
-;                                 int32_t iBeta)
-;*******************************************************************************
-
-WELS_EXTERN  DeblockLumaEq4V_sse2
-  
-ALIGN  16
-
-DeblockLumaEq4V_sse2:
-
-	push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H
-	sub	esp, 628				; 00000274H
-	mov	eax, dword [ebp+8]
-	mov	ecx, dword [ebp+12]
-	push	ebx
-	push	esi
-
-	lea	edx, [ecx*4]
-	pxor	xmm0, xmm0
-	movdqa	xmm2, xmm0
-
-	movdqa	xmm0, [ecx+eax]
-	mov	esi, eax
-	sub	esi, edx
-	movdqa	xmm3, [esi]
-	movdqa	xmm5, [eax]
-	push	edi
-	lea	edi, [ecx+ecx]
-	lea	ebx, [ecx+ecx*2]
-	mov	dword [esp+640-600], edi
-	mov	esi, eax
-	sub	esi, edi
-	movdqa	xmm1, [esi]
-	movdqa	 [esp+720-272], xmm0
-	mov	edi, eax
-	sub	edi, ecx
-	movdqa	xmm4, [edi]
-	add	ecx, eax
-	mov	dword [esp+640-596], ecx
-
-	mov	ecx, dword [esp+640-600]
-	movdqa	xmm0, [ecx+eax]
-	movdqa	 [esp+736-272], xmm0
-
-	movdqa	xmm0, [eax+ebx]
-	mov	edx, eax
-	sub	edx, ebx
-
-	movsx	ebx, word [ebp+16]
-	movdqa	xmm6, [edx]
-	add	ecx, eax
-	movdqa	 [esp+752-272], xmm0
-	movd	xmm0, ebx
-
-	movsx	ebx, word [ebp+20]
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
-	movdqa	 [esp+640-320], xmm0
-	movd	xmm0, ebx
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
-
-	movdqa	xmm7, [esp+736-272]
-	punpcklbw xmm7, xmm2
-	movdqa	 [esp+640-416], xmm7
-	movdqa	 [esp+640-512], xmm0
-	movdqa	xmm0, xmm1
-	movdqa	 [esp+672-272], xmm1
-	movdqa	xmm1, xmm4
-	movdqa	 [esp+704-272], xmm5
-	punpcklbw xmm5, xmm2
-	punpcklbw xmm1, xmm2
-
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7
-	punpcklbw xmm0, xmm2
-	movdqa	 [esp+688-272], xmm4
-	movdqa	xmm4, [esp+720-272]
-	movdqa	 [esp+640-480], xmm0
-
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm0
-
-	movdqa	xmm0, [esp+640-512]
-	pabsw	xmm7, xmm7
-	punpcklbw xmm4, xmm2
-	pcmpgtw	xmm0, xmm7
-	movdqa	 [esp+640-384], xmm4
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm4
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+656-272], xmm6
-	punpcklbw xmm6, xmm2
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-48], xmm2
-	movdqa	 [esp+640-368], xmm6
-	movdqa	 [esp+640-144], xmm1
-	movdqa	 [esp+640-400], xmm5
-	pcmpgtw	xmm4, xmm7
-	pand	xmm0, xmm4
-	movdqa	xmm4, [esp+640-320]
-	pcmpgtw	xmm4, [esp+640-560]
-	pand	xmm0, xmm4
-
-	mov	ebx, 2
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, [esp+640-320]
-	psraw	xmm4, 2
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm7
-	movdqa	 [esp+640-576], xmm4
-	pcmpgtw	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4
-
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+640-624], xmm7
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm6
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-544], xmm4
-	movdqa	xmm4, [esp+640-512]
-	movdqa	xmm7, xmm5
-	psubw	xmm7, [esp+640-416]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4
-
-	movdqa	xmm4, [esp+640-544]
-	pandn	xmm4, xmm6
-	movdqa	 [esp+640-16], xmm4
-	mov	ebx, 4
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, xmm3
-	punpcklbw xmm4, xmm2
-	psllw	xmm4, 1
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, [esp+640-480]
-
-	movdqa	xmm6, [esp+640-560]
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm1
-	movdqa	 [esp+640-592], xmm7
-	paddw	xmm4, xmm5
-	paddw	xmm4, xmm7
-	movdqa	xmm7, [esp+640-416]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-80], xmm6
-	movdqa	xmm6, [esp+752-272]
-	punpcklbw xmm6, xmm2
-	psllw	xmm6, 1
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-384]
-
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm6, xmm5
-	paddw	xmm6, xmm1
-	paddw	xmm6, [esp+640-592]
-	psraw	xmm6, 3
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-112], xmm6
-	movdqa	xmm6, [esp+640-544]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-336], xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-528], xmm6
-	movdqa	xmm6, [esp+640-368]
-	paddw	xmm6, xmm7
-	movdqa	xmm7, xmm1
-	psraw	xmm4, 3
-	pand	xmm4, [esp+640-544]
-	paddw	xmm7, xmm5
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-
-	paddw	xmm5, xmm1
-	psraw	xmm6, 2
-	pand	xmm7, xmm6
-
-	movdqa	xmm6, [esp+640-384]
-	movdqa	 [esp+640-64], xmm7
-	movdqa	xmm7, [esp+640-560]
-	pandn	xmm7, xmm6
-	movdqa	 [esp+640-304], xmm7
-	movdqa	xmm7, [esp+640-560]
-	movdqa	 [esp+640-528], xmm7
-	movdqa	xmm7, [esp+640-416]
-	paddw	xmm7, xmm6
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pand	xmm5, xmm7
-	movdqa	 [esp+640-32], xmm5
-
-	movdqa	xmm5, [esp+640-544]
-	movdqa	 [esp+640-528], xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	xmm7, xmm5
-	paddw	xmm7, xmm5
-	movdqa	xmm5, xmm1
-	paddw	xmm5, xmm6
-	paddw	xmm6, [esp+640-592]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pandn	xmm5, xmm7
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm7, xmm1
-	paddw	xmm7, [esp+640-400]
-	movdqa	xmm1, [esp+640-544]
-	movdqa	 [esp+640-352], xmm5
-	movdqa	xmm5, [esp+640-368]
-	psllw	xmm7, 1
-	paddw	xmm7, xmm6
-	paddw	xmm5, xmm7
-
-	movdqa	xmm7, [esp+640-400]
-	psraw	xmm5, 3
-	pand	xmm1, xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	 [esp+640-96], xmm1
-	movdqa	xmm1, [esp+640-560]
-	movdqa	 [esp+640-528], xmm1
-	movdqa	xmm1, [esp+640-384]
-	movdqa	xmm6, xmm1
-	paddw	xmm6, xmm1
-	paddw	xmm1, [esp+640-400]
-	paddw	xmm1, [esp+640-144]
-	paddw	xmm7, xmm5
-	paddw	xmm5, [esp+640-592]
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-	psraw	xmm6, 2
-	psllw	xmm1, 1
-	paddw	xmm1, xmm5
-
-	movdqa	xmm5, [esp+656-272]
-	pandn	xmm7, xmm6
-	movdqa	xmm6, [esp+640-416]
-	paddw	xmm6, xmm1
-	movdqa	xmm1, [esp+640-560]
-	psraw	xmm6, 3
-	pand	xmm1, xmm6
-
-	movdqa	xmm6, [esp+704-272]
-	movdqa	 [esp+640-128], xmm1
-	movdqa	xmm1, [esp+672-272]
-	punpckhbw xmm1, xmm2
-	movdqa	 [esp+640-448], xmm1
-	movdqa	xmm1, [esp+688-272]
-	punpckhbw xmm1, xmm2
-	punpckhbw xmm6, xmm2
-	movdqa	 [esp+640-288], xmm7
-	punpckhbw xmm5, xmm2
-	movdqa	 [esp+640-496], xmm1
-	movdqa	 [esp+640-432], xmm6
-
-	movdqa	xmm7, [esp+720-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-464], xmm7
-
-	movdqa	xmm7, [esp+736-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-528], xmm7
-
-	movdqa	xmm7, xmm6
-
-	psubw	xmm6, [esp+640-464]
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7
-	por	xmm4, [esp+640-16]
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm1
-	psubw	xmm7, [esp+640-448]
-
-	movdqa	xmm1, [esp+640-512]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm1, xmm7
-	movdqa	xmm7, [esp+640-512]
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+640-320]
-	pand	xmm1, xmm7
-	movdqa	xmm7, [esp+640-560]
-	pcmpgtw	xmm6, xmm7
-	pand	xmm1, xmm6
-
-	movdqa	xmm6, [esp+640-576]
-	pcmpgtw	xmm6, xmm7
-
-	movdqa	xmm7, [esp+640-496]
-	punpckhbw xmm3, xmm2
-	movdqa	 [esp+640-560], xmm6
-	movdqa	xmm6, [esp+640-512]
-	psubw	xmm7, xmm5
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
-
-	pand	xmm6, [esp+640-560]
-	movdqa	xmm7, [esp+640-432]
-	psubw	xmm7, [esp+640-528]
-
-	psllw	xmm3, 1
-	movdqa	 [esp+640-544], xmm6
-	movdqa	xmm6, [esp+640-512]
-
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, [esp+640-448]
-	paddw	xmm3, [esp+640-496]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-560], xmm6
-
-	movdqa	xmm6, xmm0
-	pand	xmm6, xmm4
-	movdqa	xmm4, xmm0
-	pandn	xmm4, [esp+640-368]
-	por	xmm6, xmm4
-	movdqa	xmm4, [esp+640-432]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm3, 3
-	pand	xmm3, xmm2
-	pandn	xmm2, xmm5
-	por	xmm3, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm3
-	movdqa	xmm3, [esp+640-64]
-	por	xmm3, [esp+640-336]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm5
-	por	xmm7, xmm2
-
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-480]
-	por	xmm2, xmm3
-	packuswb xmm6, xmm7
-	movdqa	 [esp+640-336], xmm2
-	movdqa	 [esp+656-272], xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	xmm2, xmm5
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm3, xmm1
-	movdqa	xmm7, [esp+640-496]
-	paddw	xmm7, xmm4
-	paddw	xmm2, xmm7
-	paddw	xmm2, [esp+640-624]
-	movdqa	xmm7, [esp+640-544]
-	psraw	xmm2, 2
-	pand	xmm6, xmm2
-	movdqa	xmm2, [esp+640-448]
-	pandn	xmm7, xmm2
-	por	xmm6, xmm7
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm1
-	pandn	xmm6, xmm2
-	paddw	xmm2, [esp+640-496]
-	paddw	xmm2, xmm4
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-336]
-	packuswb xmm6, xmm3
-	psllw	xmm2, 1
-	movdqa	 [esp+672-272], xmm6
-	movdqa	xmm6, [esp+640-96]
-	por	xmm6, [esp+640-352]
-
-	movdqa	xmm3, xmm0
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm0
-	pandn	xmm6, [esp+640-144]
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-352], xmm3
-	movdqa	xmm3, [esp+640-464]
-	paddw	xmm3, [esp+640-592]
-	paddw	xmm2, xmm3
-	movdqa	xmm3, [esp+640-448]
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-496]
-	psraw	xmm5, 3
-	pand	xmm6, xmm5
-	movdqa	xmm5, [esp+640-464]
-	paddw	xmm2, xmm5
-	paddw	xmm5, [esp+640-432]
-	movdqa	xmm4, xmm3
-	paddw	xmm4, xmm3
-	paddw	xmm4, xmm2
-	paddw	xmm4, [esp+640-624]
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm4, 2
-	pandn	xmm2, xmm4
-	por	xmm6, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-496]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm6
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-352]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+688-272], xmm2
-	movdqa	xmm2, [esp+640-128]
-	por	xmm2, [esp+640-288]
-
-	movdqa	xmm4, xmm0
-	pand	xmm4, xmm2
-	paddw	xmm5, xmm6
-	movdqa	xmm2, xmm0
-	pandn	xmm2, [esp+640-400]
-	por	xmm4, xmm2
-	movdqa	xmm2, [esp+640-528]
-	psllw	xmm5, 1
-	paddw	xmm5, xmm3
-	movdqa	xmm3, [esp+640-560]
-	paddw	xmm2, xmm5
-	psraw	xmm2, 3
-	movdqa	 [esp+640-288], xmm4
-	movdqa	xmm4, [esp+640-560]
-	pand	xmm4, xmm2
-	movdqa	xmm2, [esp+640-464]
-	movdqa	xmm5, xmm2
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-432]
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm7, xmm1
-	paddw	xmm5, xmm2
-	paddw	xmm5, [esp+640-624]
-	movdqa	xmm6, [esp+640-560]
-	psraw	xmm5, 2
-	pandn	xmm3, xmm5
-	por	xmm4, xmm3
-	movdqa	xmm3, [esp+640-32]
-	por	xmm3, [esp+640-304]
-	pand	xmm7, xmm4
-	movdqa	xmm4, [esp+640-432]
-	movdqa	xmm5, [esp+640-464]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm4
-	paddw	xmm4, [esp+640-496]
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-288]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+704-272], xmm2
-
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-384]
-	por	xmm2, xmm3
-	movdqa	 [esp+640-304], xmm2
-	movdqa	xmm2, [esp+640-528]
-	movdqa	xmm3, xmm2
-	paddw	xmm3, [esp+640-464]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-624]
-	psraw	xmm3, 2
-	pand	xmm6, xmm3
-	movdqa	xmm3, [esp+640-560]
-	movdqa	xmm4, xmm3
-	pandn	xmm4, xmm5
-	por	xmm6, xmm4
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-304]
-	movdqa	xmm4, xmm1
-	pandn	xmm4, xmm5
-	por	xmm7, xmm4
-
-	movdqa	xmm4, xmm0
-	pandn	xmm0, [esp+640-416]
-	packuswb xmm6, xmm7
-	movdqa	xmm7, [esp+640-112]
-	por	xmm7, [esp+640-80]
-	pand	xmm4, xmm7
-	por	xmm4, xmm0
-	movdqa	xmm0, [esp+752-272]
-	punpckhbw xmm0, [esp+640-48]
-	psllw	xmm0, 1
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm5
-	paddw	xmm0, [esp+640-432]
-	paddw	xmm0, [esp+640-496]
-	paddw	xmm0, [esp+640-592]
-	psraw	xmm0, 3
-	pand	xmm0, xmm3
-	movdqa	xmm7, xmm1
-	pandn	xmm3, xmm2
-	por	xmm0, xmm3
-	pand	xmm7, xmm0
-
-	movdqa	xmm0, [esp+656-272]
-	movdqa	 [edx], xmm0
-
-	movdqa	xmm0, [esp+672-272]
-
-	mov	edx, dword [esp+640-596]
-	movdqa	 [esi], xmm0
-	movdqa	xmm0, [esp+688-272]
-	movdqa	 [edi], xmm0
-	movdqa	xmm0, [esp+704-272]
-
-	pop	edi
-	pandn	xmm1, xmm2
-	movdqa	 [eax], xmm0
-	por	xmm7, xmm1
-	pop	esi
-	packuswb xmm4, xmm7
-	movdqa	 [edx], xmm6
-	movdqa	 [ecx], xmm4
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
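
DeblockLumaEq4V_sse2 is the strong (bS==4) luma filter. The psraw xmm4, 2 / paddw 2 near the top builds the extra gate |p0 - q0| < (alpha >> 2) + 2 that selects between the long 4/5-tap filters and the short fallback. A per-sample scalar sketch (illustrative):

    #include <stdint.h>
    #include <stdlib.h>

    static void LumaEq4Px(uint8_t *p3, uint8_t *p2, uint8_t *p1, uint8_t *p0,
                          uint8_t *q0, uint8_t *q1, uint8_t *q2, uint8_t *q3,
                          int alpha, int beta) {
        int P3 = *p3, P2 = *p2, P1 = *p1, P0 = *p0;
        int Q0 = *q0, Q1 = *q1, Q2 = *q2, Q3 = *q3;
        if (abs(P0 - Q0) >= alpha || abs(P1 - P0) >= beta || abs(Q1 - Q0) >= beta)
            return;
        int strong = abs(P0 - Q0) < (alpha >> 2) + 2;
        if (strong && abs(P2 - P0) < beta) {
            *p0 = (uint8_t)((P2 + 2 * P1 + 2 * P0 + 2 * Q0 + Q1 + 4) >> 3);
            *p1 = (uint8_t)((P2 + P1 + P0 + Q0 + 2) >> 2);
            *p2 = (uint8_t)((2 * P3 + 3 * P2 + P1 + P0 + Q0 + 4) >> 3);
        } else {
            *p0 = (uint8_t)((2 * P1 + P0 + Q1 + 2) >> 2);
        }
        if (strong && abs(Q2 - Q0) < beta) {
            *q0 = (uint8_t)((Q2 + 2 * Q1 + 2 * Q0 + 2 * P0 + P1 + 4) >> 3);
            *q1 = (uint8_t)((Q2 + Q1 + Q0 + P0 + 2) >> 2);
            *q2 = (uint8_t)((2 * Q3 + 3 * Q2 + Q1 + Q0 + P0 + 4) >> 3);
        } else {
            *q0 = (uint8_t)((2 * Q1 + Q0 + P1 + 2) >> 2);
        }
    }

Every output is a weighted average of in-range samples, so no final clip is needed before the packuswb stores.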
-  
-    
-;********************************************************************************
-;
-;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);     
-;
-;********************************************************************************
-
-WELS_EXTERN  DeblockLumaTransposeH2V_sse2
-
-ALIGN  16
-
-DeblockLumaTransposeH2V_sse2:
-    push    ebp
-    push    ebx
-    mov     ebp,   esp
-    and     esp,0FFFFFFF0h
-    sub     esp,   10h    
-    
-    mov     eax,   [ebp + 0Ch]  
-    mov     ecx,   [ebp + 10h]
-    lea     edx,   [eax + ecx * 8]
-    lea     ebx,   [ecx*3]
-    
-    movq    xmm0,  [eax] 
-    movq    xmm7,  [edx]
-    punpcklqdq   xmm0,  xmm7  
-    movq    xmm1,  [eax + ecx]
-    movq    xmm7,  [edx + ecx]
-    punpcklqdq   xmm1,  xmm7
-    movq    xmm2,  [eax + ecx*2] 
-    movq    xmm7,  [edx + ecx*2]
-    punpcklqdq   xmm2,  xmm7
-    movq    xmm3,  [eax + ebx]
-    movq    xmm7,  [edx + ebx]
-    punpcklqdq   xmm3,  xmm7
-    
-    lea     eax,   [eax + ecx * 4]
-    lea     edx,   [edx + ecx * 4]
-    movq    xmm4,  [eax] 
-    movq    xmm7,  [edx]
-    punpcklqdq   xmm4,  xmm7  
-    movq    xmm5,  [eax + ecx]
-    movq    xmm7,  [edx + ecx]
-    punpcklqdq   xmm5,  xmm7
-    movq    xmm6,  [eax + ecx*2] 
-    movq    xmm7,  [edx + ecx*2]
-    punpcklqdq   xmm6,  xmm7
-    
-    movdqa  [esp],   xmm0
-    movq    xmm7,  [eax + ebx]
-    movq    xmm0,  [edx + ebx]
-    punpcklqdq   xmm7,  xmm0
-    movdqa  xmm0,   [esp]
-    
-    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
-    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-    
-    mov    eax,   [ebp + 14h]
-    movdqa  [eax],    xmm4 
-    movdqa  [eax + 10h],  xmm2
-    movdqa  [eax + 20h],  xmm3
-    movdqa  [eax + 30h],  xmm7
-    movdqa  [eax + 40h],  xmm5
-    movdqa  [eax + 50h],  xmm1
-    movdqa  [eax + 60h],  xmm6
-    movdqa  [eax + 70h],  xmm0   
-    
-    mov     esp,   ebp
-    pop     ebx
-    pop     ebp
-    ret
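
DeblockLumaTransposeH2V_sse2 gathers a 16-row by 8-column tile (the punpcklqdq pairs fuse row i with row i+8) and stores its transpose as eight contiguous 16-byte rows, so a vertical-direction filter can run on samples that crossed a horizontal span of memory. The scalar equivalent is just an index swap (sketch):

    #include <stdint.h>

    static void LumaTransposeH2VRef(const uint8_t *pix, int32_t stride,
                                    uint8_t dst[8][16]) {
        for (int r = 0; r < 16; r++)
            for (int c = 0; c < 8; c++)
                dst[c][r] = pix[r * stride + c];
    }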
-    
-    
-    
-;*******************************************************************************************
-;
-;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
-;
-;*******************************************************************************************
-
-WELS_EXTERN   DeblockLumaTransposeV2H_sse2
-
-ALIGN  16
-
-DeblockLumaTransposeV2H_sse2:
-    push     ebp
-    mov      ebp,   esp
-    
-    and     esp,  0FFFFFFF0h
-    sub     esp,   10h  
-    
-    mov      eax,   [ebp + 10h]  
-    mov      ecx,   [ebp + 0Ch]
-    mov      edx,   [ebp + 08h]
-      
-    movdqa   xmm0,  [eax]
-    movdqa   xmm1,  [eax + 10h]
-    movdqa   xmm2,  [eax + 20h]
-    movdqa   xmm3,	[eax + 30h]
-    movdqa   xmm4,	[eax + 40h]
-    movdqa   xmm5,	[eax + 50h]
-    movdqa   xmm6,	[eax + 60h]
-    movdqa   xmm7,	[eax + 70h]
-    
-    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
-    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-    
-    lea      eax,   [ecx * 3]
-    
-    movq     [edx],  xmm4 
-    movq     [edx + ecx],  xmm2
-    movq     [edx + ecx*2],  xmm3
-    movq     [edx + eax],  xmm7
-    
-    lea      edx,   [edx + ecx*4]
-    movq     [edx],  xmm5 
-    movq     [edx + ecx],  xmm1
-    movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0    
-    
-    psrldq    xmm4,   8
-    psrldq    xmm2,   8
-    psrldq    xmm3,   8
-    psrldq    xmm7,   8
-    psrldq    xmm5,   8
-    psrldq    xmm1,   8
-    psrldq    xmm6,   8
-    psrldq    xmm0,   8
-    
-    lea       edx,  [edx + ecx*4]
-    movq     [edx],  xmm4 
-    movq     [edx + ecx],  xmm2
-    movq     [edx + ecx*2],  xmm3
-    movq     [edx + eax],  xmm7
-    
-    lea      edx,   [edx + ecx*4]
-    movq     [edx],  xmm5 
-    movq     [edx + ecx],  xmm1
-    movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0   
-    
-    
-    mov      esp,   ebp
-    pop      ebp
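
DeblockLumaTransposeV2H_sse2 undoes the H2V transpose: the movq stores write the low quadwords of the transposed registers back as rows, then psrldq by 8 exposes the high quadwords for the remaining eight rows. A scalar sketch of the inverse index swap:

    #include <stdint.h>

    static void LumaTransposeV2HRef(uint8_t *pix, int32_t stride,
                                    const uint8_t src[8][16]) {
        for (int r = 0; r < 16; r++)
            for (int c = 0; c < 8; c++)
                pix[r * stride + c] = src[c][r];
    }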
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  deblock.asm
+;*
+;*  Abstract
+;*      edge loop
+;*
+;*  History
+;*      08/07/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata pData
+%else
+SECTION .rodata align=16
+%endif
+
+SECTION .text
+
+;********************************************************************************
+;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                             int32_t iAlpha, int32_t iBeta)
+;********************************************************************************
+WELS_EXTERN   DeblockChromaEq4V_sse2
+
+ALIGN  16
+DeblockChromaEq4V_sse2:
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,68h
+  mov         edx,[ebp+10h]      ;  iStride
+  mov         eax,[ebp+8]        ;  pPixCb
+  mov         ecx,[ebp+0Ch]      ;  pPixCr
+  movq        xmm4,[ecx]
+  movq        xmm5,[edx+ecx]
+  push        esi
+  push        edi
+  lea         esi,[edx+edx]
+  mov         edi,eax
+  sub         edi,esi
+  movq        xmm1,[edi]
+  mov         edi,ecx
+  sub         edi,esi
+  movq        xmm2,[edi]
+  punpcklqdq  xmm1,xmm2
+  mov         esi,eax
+  sub         esi,edx
+  movq        xmm2,[esi]
+  mov         edi,ecx
+  sub         edi,edx
+  movq        xmm3,[edi]
+  punpcklqdq  xmm2,xmm3
+  movq        xmm3,[eax]
+  punpcklqdq  xmm3,xmm4
+  movq        xmm4,[edx+eax]
+  mov       edx, [ebp + 14h]
+  punpcklqdq  xmm4,xmm5
+  movd        xmm5,edx
+  mov       edx, [ebp + 18h]
+  pxor        xmm0,xmm0
+  movdqa      xmm6,xmm5
+  punpcklwd   xmm6,xmm5
+  pshufd      xmm5,xmm6,0
+  movd        xmm6,edx
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      xmm7,xmm1
+  punpckhbw   xmm1,xmm0
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+40h],xmm1
+  movdqa      [esp+60h],xmm7
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+10h],xmm7
+  movdqa      xmm7,xmm3
+  punpcklbw   xmm7,xmm0
+  punpckhbw   xmm3,xmm0
+  movdqa      [esp+50h],xmm7
+  movdqa      xmm7,xmm4
+  punpckhbw   xmm4,xmm0
+  punpckhbw   xmm2,xmm0
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+30h],xmm3
+  movdqa      xmm3,[esp+10h]
+  movdqa      xmm1,xmm3
+  psubw       xmm1,[esp+50h]
+  pabsw       xmm1,xmm1
+  movdqa      [esp+20h],xmm4
+  movdqa      xmm0,xmm5
+  pcmpgtw     xmm0,xmm1
+  movdqa      xmm1,[esp+60h]
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  pand        xmm0,xmm4
+  movdqa      xmm1,xmm7
+  psubw       xmm1,[esp+50h]
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  movdqa      xmm1,xmm2
+  psubw       xmm1,[esp+30h]
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm5,xmm1
+  movdqa      xmm1,[esp+40h]
+  pand        xmm0,xmm4
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  movdqa      xmm1,[esp+20h]
+  psubw       xmm1,[esp+30h]
+  pand        xmm5,xmm4
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm6,xmm1
+  pand        xmm5,xmm6
+  mov         edx,2
+  movsx       edx,dx
+  movd        xmm1,edx
+  movdqa      xmm4,xmm1
+  punpcklwd   xmm4,xmm1
+  pshufd      xmm1,xmm4,0
+  movdqa      xmm4,[esp+60h]
+  movdqa      xmm6,xmm4
+  paddw       xmm6,xmm4
+  paddw       xmm6,xmm3
+  paddw       xmm6,xmm7
+  movdqa      [esp+10h],xmm1
+  paddw       xmm6,[esp+10h]
+  psraw       xmm6,2
+  movdqa      xmm4,xmm0
+  pandn       xmm4,xmm3
+  movdqa      xmm3,[esp+40h]
+  movdqa      xmm1,xmm0
+  pand        xmm1,xmm6
+  por         xmm1,xmm4
+  movdqa      xmm6,xmm3
+  paddw       xmm6,xmm3
+  movdqa      xmm3,[esp+10h]
+  paddw       xmm6,xmm2
+  paddw       xmm6,[esp+20h]
+  paddw       xmm6,xmm3
+  psraw       xmm6,2
+  movdqa      xmm4,xmm5
+  pand        xmm4,xmm6
+  movdqa      xmm6,xmm5
+  pandn       xmm6,xmm2
+  por         xmm4,xmm6
+  packuswb    xmm1,xmm4
+  movdqa      xmm4,[esp+50h]
+  movdqa      xmm6,xmm7
+  paddw       xmm6,xmm7
+  paddw       xmm6,xmm4
+  paddw       xmm6,[esp+60h]
+  paddw       xmm6,xmm3
+  psraw       xmm6,2
+  movdqa      xmm2,xmm0
+  pand        xmm2,xmm6
+  pandn       xmm0,xmm4
+  por         xmm2,xmm0
+  movdqa      xmm0,[esp+20h]
+  movdqa      xmm6,xmm0
+  paddw       xmm6,xmm0
+  movdqa      xmm0,[esp+30h]
+  paddw       xmm6,xmm0
+  paddw       xmm6,[esp+40h]
+  movdqa      xmm4,xmm5
+  paddw       xmm6,xmm3
+  movq        [esi],xmm1
+  psraw       xmm6,2
+  pand        xmm4,xmm6
+  pandn       xmm5,xmm0
+  por         xmm4,xmm5
+  packuswb    xmm2,xmm4
+  movq        [eax],xmm2
+  psrldq      xmm1,8
+  movq        [edi],xmm1
+  pop         edi
+  psrldq      xmm2,8
+  movq        [ecx],xmm2
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+;******************************************************************************
+; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
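+
+; An illustrative reference for the weak (bS<4) chroma filter below, assuming
+; the standard H.264 equations; tc is derived from the per-edge entries of
+; pTC, and the sample names are descriptive only:
+;
+;   d   = Clip3(-tc, tc, (((q0-p0) << 2) + (p1-q1) + 4) >> 3);
+;   if (tc > 0 && |p0-q0| < iAlpha && |p1-p0| < iBeta && |q1-q0| < iBeta) {
+;       p0' = Clip1(p0 + d);
+;       q0' = Clip1(q0 - d);
+;   }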
+
+WELS_EXTERN  DeblockChromaLt4V_sse2
+
+DeblockChromaLt4V_sse2:
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,0E4h
+  push        ebx
+  push        esi
+  mov         esi, [ebp+1Ch]      ;  pTC
+  movsx       ebx, byte [esi+2]
+  push        edi
+  movsx       di,byte [esi+3]
+  mov         word [esp+0Ch],bx
+  movsx       bx,byte  [esi+1]
+  movsx       esi,byte  [esi]
+  mov         word  [esp+0Eh],si
+  movzx       esi,di
+  movd        xmm1,esi
+  movzx       esi,di
+  movd        xmm2,esi
+  mov         si,word  [esp+0Ch]
+  mov         edx, [ebp + 10h]
+  mov         eax, [ebp + 08h]
+  movzx       edi,si
+  movzx       esi,si
+  mov         ecx, [ebp + 0Ch]
+  movd        xmm4,esi
+  movzx       esi,bx
+  movd        xmm5,esi
+  movd        xmm3,edi
+  movzx       esi,bx
+  movd        xmm6,esi
+  mov         si,word [esp+0Eh]
+  movzx       edi,si
+  movzx       esi,si
+  punpcklwd   xmm6,xmm2
+  pxor        xmm0,xmm0
+  movdqa      [esp+40h],xmm0
+  movd        xmm7,edi
+  movd        xmm0,esi
+  lea         esi,[edx+edx]
+  mov         edi,eax
+  sub         edi,esi
+  punpcklwd   xmm5,xmm1
+  movdqa      xmm1,[esp+40h]
+  punpcklwd   xmm0,xmm4
+  movq        xmm4,[edx+ecx]
+  punpcklwd   xmm7,xmm3
+  movq        xmm3,[eax]
+  punpcklwd   xmm0,xmm6
+  movq        xmm6,[edi]
+  punpcklwd   xmm7,xmm5
+  punpcklwd   xmm0,xmm7
+  mov         edi,ecx
+  sub         edi,esi
+  movdqa      xmm2,xmm1
+  psubw       xmm2,xmm0
+  movdqa      [esp+60h],xmm2
+  movq        xmm2, [edi]
+  punpcklqdq  xmm6,xmm2
+  mov         esi,eax
+  sub         esi,edx
+  movq        xmm7,[esi]
+  mov         edi,ecx
+  sub         edi,edx
+  movq        xmm2,[edi]
+  punpcklqdq  xmm7,xmm2
+  movq        xmm2,[ecx]
+  punpcklqdq  xmm3,xmm2
+  movq        xmm2,[edx+eax]
+  movsx       edx,word [ebp + 14h]
+  punpcklqdq  xmm2,xmm4
+  movdqa      [esp+0E0h],xmm2
+  movd        xmm2,edx
+  movsx       edx,word [ebp + 18h]
+  movdqa      xmm4,xmm2
+  punpcklwd   xmm4,xmm2
+  movd        xmm2,edx
+  movdqa      xmm5,xmm2
+  punpcklwd   xmm5,xmm2
+  pshufd      xmm2,xmm5,0
+  movdqa      [esp+50h],xmm2
+  movdqa      xmm2,xmm6
+  punpcklbw   xmm2,xmm1
+  movdqa      [esp+0D0h],xmm3
+  pshufd      xmm4,xmm4,0
+  movdqa      [esp+30h],xmm2
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+80h],xmm6
+  movdqa      xmm6,[esp+0D0h]
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+70h],xmm6
+  movdqa      xmm6, [esp+0E0h]
+  punpckhbw   xmm6,xmm1
+  movdqa     [esp+90h],xmm6
+  movdqa      xmm5, [esp+0E0h]
+  movdqa      xmm2,xmm7
+  punpckhbw   xmm7,xmm1
+  punpcklbw   xmm5,xmm1
+  movdqa       [esp+0A0h],xmm7
+  punpcklbw   xmm3,xmm1
+  mov         edx,4
+  punpcklbw   xmm2,xmm1
+  movsx       edx,dx
+  movd        xmm6,edx
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      xmm7,[esp+30h]
+  movdqa      [esp+20h],xmm6
+  psubw       xmm7,xmm5
+  movdqa      xmm6,xmm0
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1,[esp+60h]
+  movdqa      [esp+40h],xmm6
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm2
+  psllw       xmm6,2
+  paddw       xmm6,xmm7
+  paddw       xmm6, [esp+20h]
+  movdqa      xmm7, [esp+50h]
+  psraw       xmm6,3
+  pmaxsw      xmm1,xmm6
+  movdqa      [esp+10h],xmm0
+  movdqa      xmm6, [esp+10h]
+  pminsw      xmm6,xmm1
+  movdqa      [esp+10h],xmm6
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  movdqa      xmm6,xmm4
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1, [esp+30h]
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm7,xmm1
+  movdqa      xmm1,[esp+50h]
+  pand        xmm6,xmm7
+  movdqa      xmm7,[esp+50h]
+  psubw       xmm5,xmm3
+  pabsw       xmm5,xmm5
+  pcmpgtw     xmm1,xmm5
+  movdqa      xmm5,[esp+80h]
+  psubw       xmm5,[esp+90h]
+  pand        xmm6,xmm1
+  pand        xmm6,[esp+40h]
+  movdqa      xmm1,[esp+10h]
+  pand        xmm1,xmm6
+  movdqa      xmm6,[esp+70h]
+  movdqa      [esp+30h],xmm1
+  movdqa      xmm1,[esp+0A0h]
+  psubw       xmm6,xmm1
+  psllw       xmm6,2
+  paddw       xmm6,xmm5
+  paddw       xmm6,[esp+20h]
+  movdqa      xmm5,[esp+60h]
+  psraw       xmm6,3
+  pmaxsw      xmm5,xmm6
+  pminsw      xmm0,xmm5
+  movdqa      xmm5,[esp+70h]
+  movdqa      xmm6,xmm1
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm4,xmm6
+  movdqa      xmm6,[esp+80h]
+  psubw       xmm6,xmm1
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+90h]
+  pand        xmm4,xmm7
+  movdqa      xmm7,[esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  pand        xmm4,xmm7
+  pand        xmm4,[esp+40h]
+  pand        xmm0,xmm4
+  movdqa      xmm4,[esp+30h]
+  paddw       xmm2,xmm4
+  paddw       xmm1,xmm0
+  packuswb    xmm2,xmm1
+  movq        [esi],xmm2
+  psubw       xmm3,xmm4
+  psubw       xmm5,xmm0
+  packuswb    xmm3,xmm5
+  movq        [eax],xmm3
+  psrldq      xmm2,8
+  movq        [edi],xmm2
+  pop         edi
+  pop         esi
+  psrldq      xmm3,8
+  movq        [ecx],xmm3
+  pop         ebx
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+;***************************************************************************
+;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;          int32_t iAlpha, int32_t iBeta)
+;***************************************************************************
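+
+; The H variant deblocks a vertical edge: 4-byte groups are gathered from 16
+; rows of Cb/Cr, transposed with the punpck sequence below so the edge becomes
+; row-oriented, run through the same strong-filter arithmetic as the V
+; routine, then transposed back and scattered with 4-byte stores:
+;
+;   gather 16x4 bytes -> punpck transpose -> Eq4 filter -> transpose -> store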
+
+WELS_EXTERN     DeblockChromaEq4H_sse2
+
+ALIGN  16
+
+DeblockChromaEq4H_sse2:
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,0C8h
+  mov         ecx,dword [ebp+8]
+  mov         edx,dword [ebp+0Ch]
+  mov         eax,dword [ebp+10h]
+  sub         ecx,2
+  sub         edx,2
+  push        esi
+  lea         esi,[eax+eax*2]
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+4],edx
+  lea         ecx,[ecx+eax*4]
+  lea         edx,[edx+eax*4]
+  lea         eax,[esp+7Ch]
+  push        edi
+  mov         dword [esp+14h],esi
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+0Ch],edx
+  mov         dword [esp+10h],eax
+  mov         esi,dword [esp+1Ch]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+14h]
+  movd        xmm0,dword [esi]
+  movd        xmm1,dword [esi+ecx]
+  movd        xmm2,dword [esi+ecx*2]
+  movd        xmm3,dword [esi+edx]
+  mov         esi,dword  [esp+8]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [esi+ecx]
+  movd        xmm6,dword [esi+ecx*2]
+  movd        xmm7,dword [esi+edx]
+  punpckldq   xmm0,xmm4
+  punpckldq   xmm1,xmm5
+  punpckldq   xmm2,xmm6
+  punpckldq   xmm3,xmm7
+  mov         esi,dword [esp+18h]
+  mov         edi,dword [esp+0Ch]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [edi]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm0,xmm4
+  movd        xmm4,dword [esi+ecx]
+  movd        xmm5,dword [edi+ecx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm1,xmm4
+  movd        xmm4,dword [esi+ecx*2]
+  movd        xmm5,dword [edi+ecx*2]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm2,xmm4
+  movd        xmm4,dword [esi+edx]
+  movd        xmm5,dword [edi+edx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm3,xmm4
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         edi,dword [esp+10h]
+  movdqa      [edi],xmm0
+  movdqa      [edi+10h],xmm5
+  movdqa      [edi+20h],xmm1
+  movdqa      [edi+30h],xmm6
+  movsx       ecx,word [ebp+14h]
+  movsx       edx,word [ebp+18h]
+  movdqa      xmm6,[esp+80h]
+  movdqa      xmm4,[esp+90h]
+  movdqa      xmm5,[esp+0A0h]
+  movdqa      xmm7,[esp+0B0h]
+  pxor        xmm0,xmm0
+  movd        xmm1,ecx
+  movdqa      xmm2,xmm1
+  punpcklwd   xmm2,xmm1
+  pshufd      xmm1,xmm2,0
+  movd        xmm2,edx
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm2,xmm3,0
+  movdqa      xmm3,xmm6
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+60h],xmm6
+  movdqa      xmm6,[esp+90h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+30h],xmm6
+  movdqa      xmm6,[esp+0A0h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+40h],xmm6
+  movdqa      xmm6,[esp+0B0h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+70h],xmm6
+  punpcklbw   xmm7,xmm0
+  punpcklbw   xmm4,xmm0
+  punpcklbw   xmm5,xmm0
+  punpcklbw   xmm3,xmm0
+  movdqa      [esp+50h],xmm7
+  movdqa      xmm6,xmm4
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  movdqa      xmm0,xmm1
+  pcmpgtw     xmm0,xmm6
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm4
+  pabsw       xmm6,xmm6
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pand        xmm0,xmm7
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+30h]
+  psubw       xmm6,[esp+40h]
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm1,xmm6
+  movdqa      xmm6,[esp+60h]
+  psubw       xmm6,[esp+30h]
+  pabsw       xmm6,xmm6
+  pand        xmm0,xmm7
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+70h]
+  psubw       xmm6,[esp+40h]
+  pabsw       xmm6,xmm6
+  pand        xmm1,xmm7
+  pcmpgtw     xmm2,xmm6
+  pand        xmm1,xmm2
+  mov         eax,2
+  movsx       ecx,ax
+  movd        xmm2,ecx
+  movdqa      xmm6,xmm2
+  punpcklwd   xmm6,xmm2
+  pshufd      xmm2,xmm6,0
+  movdqa      [esp+20h],xmm2
+  movdqa      xmm2,xmm3
+  paddw       xmm2,xmm3
+  paddw       xmm2,xmm4
+  paddw       xmm2,[esp+50h]
+  paddw       xmm2,[esp+20h]
+  psraw       xmm2,2
+  movdqa      xmm6,xmm0
+  pand        xmm6,xmm2
+  movdqa      xmm2,xmm0
+  pandn       xmm2,xmm4
+  por         xmm6,xmm2
+  movdqa      xmm2,[esp+60h]
+  movdqa      xmm7,xmm2
+  paddw       xmm7,xmm2
+  paddw       xmm7,[esp+30h]
+  paddw       xmm7,[esp+70h]
+  paddw       xmm7,[esp+20h]
+  movdqa      xmm4,xmm1
+  movdqa      xmm2,xmm1
+  pandn       xmm2,[esp+30h]
+  psraw       xmm7,2
+  pand        xmm4,xmm7
+  por         xmm4,xmm2
+  movdqa      xmm2,[esp+50h]
+  packuswb    xmm6,xmm4
+  movdqa      [esp+90h],xmm6
+  movdqa      xmm6,xmm2
+  paddw       xmm6,xmm2
+  movdqa      xmm2,[esp+20h]
+  paddw       xmm6,xmm5
+  paddw       xmm6,xmm3
+  movdqa      xmm4,xmm0
+  pandn       xmm0,xmm5
+  paddw       xmm6,xmm2
+  psraw       xmm6,2
+  pand        xmm4,xmm6
+  por         xmm4,xmm0
+  movdqa      xmm0,[esp+70h]
+  movdqa      xmm5,xmm0
+  paddw       xmm5,xmm0
+  movdqa      xmm0,[esp+40h]
+  paddw       xmm5,xmm0
+  paddw       xmm5,[esp+60h]
+  movdqa      xmm3,xmm1
+  paddw       xmm5,xmm2
+  psraw       xmm5,2
+  pand        xmm3,xmm5
+  pandn       xmm1,xmm0
+  por         xmm3,xmm1
+  packuswb    xmm4,xmm3
+  movdqa      [esp+0A0h],xmm4
+  mov         esi,dword [esp+10h]
+  movdqa      xmm0,[esi]
+  movdqa      xmm1,[esi+10h]
+  movdqa      xmm2,[esi+20h]
+  movdqa      xmm3,[esi+30h]
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         esi,dword [esp+1Ch]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+14h]
+  mov         edi,dword [esp+8]
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         esi,dword [esp+18h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         edi,dword [esp+0Ch]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  pop         edi
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+;*******************************************************************************
+;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
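+
+; Same gather/transpose/filter/scatter pattern as DeblockChromaEq4H_sse2
+; above, but with the tc-clipped weak-filter core of DeblockChromaLt4V_sse2
+; applied to the transposed rows.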
+
+WELS_EXTERN  DeblockChromaLt4H_sse2
+
+ALIGN  16
+
+DeblockChromaLt4H_sse2:
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,108h
+  mov         ecx,dword [ebp+8]
+  mov         edx,dword [ebp+0Ch]
+  mov         eax,dword [ebp+10h]
+  sub         ecx,2
+  sub         edx,2
+  push        esi
+  lea         esi,[eax+eax*2]
+  mov         dword [esp+10h],ecx
+  mov         dword [esp+4],edx
+  lea         ecx,[ecx+eax*4]
+  lea         edx,[edx+eax*4]
+  lea         eax,[esp+6Ch]
+  push        edi
+  mov         dword [esp+0Ch],esi
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+10h],edx
+  mov         dword [esp+1Ch],eax
+  mov         esi,dword [esp+14h]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+0Ch]
+  movd        xmm0,dword [esi]
+  movd        xmm1,dword [esi+ecx]
+  movd        xmm2,dword [esi+ecx*2]
+  movd        xmm3,dword [esi+edx]
+  mov         esi,dword [esp+8]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [esi+ecx]
+  movd        xmm6,dword [esi+ecx*2]
+  movd        xmm7,dword [esi+edx]
+  punpckldq   xmm0,xmm4
+  punpckldq   xmm1,xmm5
+  punpckldq   xmm2,xmm6
+  punpckldq   xmm3,xmm7
+  mov         esi,dword [esp+18h]
+  mov         edi,dword [esp+10h]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [edi]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm0,xmm4
+  movd        xmm4,dword [esi+ecx]
+  movd        xmm5,dword [edi+ecx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm1,xmm4
+  movd        xmm4,dword [esi+ecx*2]
+  movd        xmm5,dword [edi+ecx*2]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm2,xmm4
+  movd        xmm4,dword [esi+edx]
+  movd        xmm5,dword [edi+edx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm3,xmm4
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         edi,dword [esp+1Ch]
+  movdqa      [edi],xmm0
+  movdqa      [edi+10h],xmm5
+  movdqa      [edi+20h],xmm1
+  movdqa      [edi+30h],xmm6
+  mov         eax,dword [ebp+1Ch]
+  movsx       cx,byte [eax+3]
+  movsx       dx,byte [eax+2]
+  movsx       si,byte [eax+1]
+  movsx       ax,byte [eax]
+  movzx       edi,cx
+  movzx       ecx,cx
+  movd        xmm2,ecx
+  movzx       ecx,dx
+  movzx       edx,dx
+  movd        xmm3,ecx
+  movd        xmm4,edx
+  movzx       ecx,si
+  movzx       edx,si
+  movd        xmm5,ecx
+  pxor        xmm0,xmm0
+  movd        xmm6,edx
+  movzx       ecx,ax
+  movdqa      [esp+60h],xmm0
+  movzx       edx,ax
+  movsx       eax,word [ebp+14h]
+  punpcklwd   xmm6,xmm2
+  movd        xmm1,edi
+  movd        xmm7,ecx
+  movsx       ecx,word [ebp+18h]
+  movd        xmm0,edx
+  punpcklwd   xmm7,xmm3
+  punpcklwd   xmm5,xmm1
+  movdqa      xmm1,[esp+60h]
+  punpcklwd   xmm7,xmm5
+  movdqa      xmm5,[esp+0A0h]
+  punpcklwd   xmm0,xmm4
+  punpcklwd   xmm0,xmm6
+  movdqa      xmm6, [esp+70h]
+  punpcklwd   xmm0,xmm7
+  movdqa      xmm7,[esp+80h]
+  movdqa      xmm2,xmm1
+  psubw       xmm2,xmm0
+  movdqa      [esp+0D0h],xmm2
+  movd        xmm2,eax
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm4,xmm3,0
+  movd        xmm2,ecx
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm2,xmm3,0
+  movdqa      xmm3, [esp+90h]
+  movdqa      [esp+50h],xmm2
+  movdqa      xmm2,xmm6
+  punpcklbw   xmm2,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+40h],xmm2
+  movdqa      [esp+0B0h],xmm6
+  movdqa      xmm6,[esp+90h]
+  movdqa      xmm2,xmm7
+  punpckhbw   xmm7,xmm1
+  punpckhbw   xmm6,xmm1
+  punpcklbw   xmm2,xmm1
+  punpcklbw   xmm3,xmm1
+  punpcklbw   xmm5,xmm1
+  movdqa      [esp+0F0h],xmm7
+  movdqa      [esp+0C0h],xmm6
+  movdqa      xmm6, [esp+0A0h]
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+0E0h],xmm6
+  mov         edx,4
+  movsx       eax,dx
+  movd        xmm6,eax
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      [esp+30h],xmm6
+  movdqa      xmm7, [esp+40h]
+  psubw       xmm7,xmm5
+  movdqa      xmm6,xmm0
+  pcmpgtw     xmm6,xmm1
+  movdqa      [esp+60h],xmm6
+  movdqa      xmm1, [esp+0D0h]
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm2
+  psllw       xmm6,2
+  paddw       xmm6,xmm7
+  paddw       xmm6,[esp+30h]
+  psraw       xmm6,3
+  pmaxsw      xmm1,xmm6
+  movdqa      xmm7,[esp+50h]
+  movdqa      [esp+20h],xmm0
+  movdqa      xmm6, [esp+20h]
+  pminsw      xmm6,xmm1
+  movdqa      [esp+20h],xmm6
+  movdqa      xmm6,xmm4
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1, [esp+40h]
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm7,xmm1
+  movdqa      xmm1, [esp+50h]
+  pand        xmm6,xmm7
+  movdqa      xmm7, [esp+50h]
+  psubw       xmm5,xmm3
+  pabsw       xmm5,xmm5
+  pcmpgtw     xmm1,xmm5
+  movdqa      xmm5, [esp+0B0h]
+  psubw       xmm5,[esp+0E0h]
+  pand        xmm6,xmm1
+  pand        xmm6, [esp+60h]
+  movdqa      xmm1, [esp+20h]
+  pand        xmm1,xmm6
+  movdqa      xmm6, [esp+0C0h]
+  movdqa      [esp+40h],xmm1
+  movdqa      xmm1, [esp+0F0h]
+  psubw       xmm6,xmm1
+  psllw       xmm6,2
+  paddw       xmm6,xmm5
+  paddw       xmm6, [esp+30h]
+  movdqa      xmm5, [esp+0D0h]
+  psraw       xmm6,3
+  pmaxsw      xmm5,xmm6
+  pminsw      xmm0,xmm5
+  movdqa      xmm5,[esp+0C0h]
+  movdqa      xmm6,xmm1
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm4,xmm6
+  movdqa      xmm6,[esp+0B0h]
+  psubw       xmm6,xmm1
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6, [esp+0E0h]
+  pand        xmm4,xmm7
+  movdqa      xmm7, [esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  pand        xmm4,xmm7
+  pand        xmm4,[esp+60h]
+  pand        xmm0,xmm4
+  movdqa      xmm4, [esp+40h]
+  paddw       xmm2,xmm4
+  paddw       xmm1,xmm0
+  psubw       xmm3,xmm4
+  psubw       xmm5,xmm0
+  packuswb    xmm2,xmm1
+  packuswb    xmm3,xmm5
+  movdqa      [esp+80h],xmm2
+  movdqa      [esp+90h],xmm3
+  mov         esi,dword [esp+1Ch]
+  movdqa      xmm0, [esi]
+  movdqa      xmm1, [esi+10h]
+  movdqa      xmm2, [esi+20h]
+  movdqa      xmm3, [esi+30h]
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         esi,dword [esp+14h]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+0Ch]
+  mov         edi,dword [esp+8]
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         esi,dword [esp+18h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         edi,dword [esp+10h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  pop         edi
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+
+
+;*******************************************************************************
+;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+;                                 int32_t iBeta, int8_t * pTC)
+;*******************************************************************************
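+
+; An illustrative reference for the weak (bS<4) luma filter below, assuming
+; the standard H.264 equations, with ap = |p2-p0|, aq = |q2-q0| and tc0
+; taken from pTC (descriptive names only):
+;
+;   tc  = tc0 + (ap < iBeta) + (aq < iBeta)
+;   d   = Clip3(-tc, tc, (((q0-p0) << 2) + (p1-q1) + 4) >> 3)
+;   p0' = Clip1(p0 + d);  q0' = Clip1(q0 - d)
+;   if (ap < iBeta) p1' = p1 + Clip3(-tc0, tc0, (p2 + ((p0+q0+1)>>1) - 2*p1) >> 1)
+;   if (aq < iBeta) q1' = q1 + Clip3(-tc0, tc0, (q2 + ((p0+q0+1)>>1) - 2*q1) >> 1)
+;
+; The pavgw below computes the rounded (p0+q0+1)>>1 average, and the
+; pmaxsw/pminsw pairs implement the Clip3 clamps.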
+
+
+WELS_EXTERN  DeblockLumaLt4V_sse2
+
+ALIGN  16
+
+DeblockLumaLt4V_sse2:
+    push	ebp
+	mov	ebp, esp
+	and	esp, -16				; fffffff0H
+	sub	esp, 420				; 000001a4H
+	mov	eax, dword [ebp+8]
+	mov	ecx, dword [ebp+12]
+
+	pxor	xmm0, xmm0
+	push	ebx
+	mov	edx, dword [ebp+24]
+	movdqa	[esp+424-384], xmm0
+	push	esi
+
+	lea	esi, [ecx+ecx*2]
+	push	edi
+	mov	edi, eax
+	sub	edi, esi
+	movdqa	xmm0, [edi]
+
+	lea	esi, [ecx+ecx]
+	movdqa	[esp+432-208], xmm0
+	mov	edi, eax
+	sub	edi, esi
+	movdqa	xmm0, [edi]
+	movdqa	[esp+448-208], xmm0
+
+	mov	ebx, eax
+	sub	ebx, ecx
+	movdqa	xmm0, [ebx]
+	movdqa	[esp+464-208], xmm0
+
+	movdqa	xmm0, [eax]
+
+	add	ecx, eax
+	movdqa	[esp+480-208], xmm0
+	movdqa	xmm0, [ecx]
+	mov	dword [esp+432-404], ecx
+
+	movsx	ecx, word [ebp+16]
+	movdqa	[esp+496-208], xmm0
+	movdqa	xmm0, [esi+eax]
+
+	movsx	si, byte [edx]
+	movdqa	[esp+512-208], xmm0
+	movd	xmm0, ecx
+	movsx	ecx, word [ebp+20]
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	pshufd	xmm0, xmm1, 0
+	movdqa	[esp+432-112], xmm0
+	movd	xmm0, ecx
+	movsx	cx, byte [edx+1]
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	mov	dword [esp+432-408], ebx
+	movzx	ebx, cx
+	pshufd	xmm0, xmm1, 0
+	movd	xmm1, ebx
+	movzx	ebx, cx
+	movd	xmm2, ebx
+	movzx	ebx, cx
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, si
+	movd	xmm5, ecx
+	movzx	ecx, si
+	movd	xmm6, ecx
+	movzx	ecx, si
+	movd	xmm7, ecx
+	movzx	ecx, si
+	movdqa	[esp+432-336], xmm0
+	movd	xmm0, ecx
+
+	movsx	cx, byte [edx+3]
+	movsx	dx, byte [edx+2]
+	movd	xmm3, ebx
+	punpcklwd xmm0, xmm4
+	movzx	esi, cx
+	punpcklwd xmm6, xmm2
+	punpcklwd xmm5, xmm1
+	punpcklwd xmm0, xmm6
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	punpcklwd xmm0, xmm7
+	movdqa	[esp+432-400], xmm0
+	movd	xmm0, esi
+	movzx	esi, cx
+	movd	xmm2, esi
+	movzx	esi, cx
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, dx
+	movd	xmm3, esi
+	movd	xmm5, ecx
+	punpcklwd xmm5, xmm0
+
+	movdqa	xmm0, [esp+432-384]
+	movzx	ecx, dx
+	movd	xmm6, ecx
+	movzx	ecx, dx
+	movzx	edx, dx
+	punpcklwd xmm6, xmm2
+	movd	xmm7, ecx
+	movd	xmm1, edx
+
+	movdqa	xmm2, [esp+448-208]
+	punpcklbw xmm2, xmm0
+
+	mov	ecx, 4
+	movsx	edx, cx
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	movdqa	xmm5, [esp+496-208]
+	movdqa	xmm3, [esp+464-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-240], xmm5
+	movdqa	xmm5, [esp+512-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-352], xmm5
+	punpcklwd xmm1, xmm4
+	movdqa	xmm4, [esp+432-208]
+	punpcklwd xmm1, xmm6
+	movdqa	xmm6, [esp+480-208]
+	punpcklwd xmm1, xmm7
+	punpcklbw xmm6, xmm0
+	punpcklbw xmm3, xmm0
+	punpcklbw xmm4, xmm0
+	movdqa	xmm7, xmm3
+	psubw	xmm7, xmm4
+	pabsw	xmm7, xmm7
+	movdqa	[esp+432-272], xmm4
+	movdqa	xmm4, [esp+432-336]
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-352]
+	pabsw	xmm7, xmm7
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5
+	movdqa	xmm5, xmm3
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5
+	movdqa	xmm5, [esp+432-400]
+	psubw	xmm5, [esp+432-288]
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5
+	movdqa	xmm5, xmm6
+	psubw	xmm5, xmm3
+	movdqa	[esp+432-32], xmm6
+	psubw	xmm6, [esp+432-240]
+	movdqa	xmm7, xmm5
+	movdqa	[esp+432-384], xmm5
+	movdqa	xmm5, [esp+432-112]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm5, xmm7
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6
+
+	pand	xmm5, xmm7
+	movdqa	xmm6, xmm3
+	psubw	xmm6, xmm2
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-400]
+	pand	xmm5, xmm7
+	movdqa	xmm7, xmm6
+	pcmpeqw	xmm6, xmm0
+	pcmpgtw	xmm7, xmm0
+	por	xmm7, xmm6
+	pand	xmm5, xmm7
+	movdqa	[esp+432-320], xmm5
+	movd	xmm5, edx
+	movdqa	xmm6, xmm5
+	punpcklwd xmm6, xmm5
+	pshufd	xmm5, xmm6, 0
+	movdqa	[esp+432-336], xmm5
+	movdqa	xmm5, [esp+432-224]
+	movdqa	[esp+432-368], xmm5
+	movdqa	xmm6, xmm0
+	psubw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-384]
+	psllw	xmm5, 2
+	movdqa	xmm7, xmm2
+	psubw	xmm7, [esp+432-240]
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+432-336]
+	movdqa	xmm5, [esp+432-368]
+	psraw	xmm7, 3
+	pmaxsw	xmm6, xmm7
+	pminsw	xmm5, xmm6
+
+	pand	xmm5, [esp+432-320]
+	movdqa	xmm6, [esp+432-400]
+	movdqa	[esp+432-64], xmm5
+	movdqa	[esp+432-384], xmm6
+	movdqa	xmm5, xmm0
+	psubw	xmm5, xmm6
+	movdqa	[esp+432-368], xmm5
+	movdqa	xmm6, xmm5
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm5, [esp+432-304]
+	movdqa	xmm7, xmm2
+	paddw	xmm7, xmm2
+	psubw	xmm5, xmm7
+	psraw	xmm5, 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-384]
+	pminsw	xmm5, xmm6
+
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-288]
+	movdqa	xmm6, [esp+432-240]
+	movdqa	[esp+432-96], xmm5
+	movdqa	xmm5, [esp+432-352]
+	paddw	xmm5, [esp+432-304]
+	movdqa	xmm7, xmm6
+	paddw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-368]
+	psubw	xmm5, xmm7
+
+	movdqa	xmm7, [esp+496-208]
+	psraw	xmm5, 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-400]
+	pminsw	xmm5, xmm6
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-256]
+	movdqa	xmm6, [esp+448-208]
+	punpckhbw xmm7, xmm0
+	movdqa	[esp+432-352], xmm7
+
+	movdqa	xmm7, [esp+512-208]
+	punpckhbw xmm6, xmm0
+	movdqa	[esp+432-48], xmm5
+	movdqa	xmm5, [esp+432-208]
+	movdqa	[esp+432-368], xmm6
+	movdqa	xmm6, [esp+464-208]
+	punpckhbw xmm7, xmm0
+	punpckhbw xmm5, xmm0
+	movdqa	[esp+432-384], xmm7
+	punpckhbw xmm6, xmm0
+	movdqa	[esp+432-400], xmm6
+
+	movdqa	xmm7, [esp+432-400]
+	movdqa	xmm6, [esp+480-208]
+	psubw	xmm7, xmm5
+	movdqa	[esp+432-16], xmm5
+	pabsw	xmm7, xmm7
+	punpckhbw xmm6, xmm0
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5
+
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-384]
+	pabsw	xmm7, xmm7
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5
+
+	movdqa	xmm5, [esp+432-400]
+	movdqa	[esp+432-80], xmm6
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5
+
+	movdqa	xmm5, xmm1
+	psubw	xmm5, [esp+432-288]
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5
+	movdqa	xmm5, xmm6
+	psubw	xmm5, [esp+432-400]
+	psubw	xmm6, [esp+432-352]
+	movdqa	[esp+432-272], xmm5
+	movdqa	xmm7, xmm5
+	movdqa	xmm5, [esp+432-112]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm5, xmm7
+	movdqa	xmm7, xmm4
+	pabsw	xmm6, xmm6
+	pcmpgtw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-368]
+
+	pand	xmm5, xmm7
+	movdqa	xmm7, [esp+432-400]
+	psubw	xmm7, xmm6
+	psubw	xmm6, [esp+432-352]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7
+	pand	xmm5, xmm4
+
+	paddw	xmm2, [esp+432-96]
+	movdqa	xmm4, xmm1
+	pcmpgtw	xmm4, xmm0
+	movdqa	xmm7, xmm1
+	pcmpeqw	xmm7, xmm0
+	por	xmm4, xmm7
+	pand	xmm5, xmm4
+	movdqa	xmm4, [esp+432-224]
+	movdqa	[esp+432-320], xmm5
+	movdqa	xmm5, [esp+432-272]
+	movdqa	xmm7, xmm0
+	psubw	xmm7, xmm4
+	psubw	xmm0, xmm1
+	psllw	xmm5, 2
+	paddw	xmm6, xmm5
+	paddw	xmm6, [esp+432-336]
+	movdqa	xmm5, [esp+432-368]
+	movdqa	[esp+432-336], xmm0
+	psraw	xmm6, 3
+	pmaxsw	xmm7, xmm6
+	pminsw	xmm4, xmm7
+	pand	xmm4, [esp+432-320]
+	movdqa	xmm6, xmm0
+	movdqa	xmm0, [esp+432-16]
+	paddw	xmm0, [esp+432-304]
+	movdqa	[esp+432-272], xmm4
+	movdqa	xmm4, [esp+432-368]
+	paddw	xmm4, xmm4
+	psubw	xmm0, xmm4
+
+	movdqa	xmm4, [esp+432-64]
+	psraw	xmm0, 1
+	pmaxsw	xmm6, xmm0
+	movdqa	xmm0, [esp+432-400]
+	movdqa	xmm7, xmm1
+	pminsw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-320]
+	pand	xmm7, xmm6
+	pand	xmm7, [esp+432-288]
+	paddw	xmm5, xmm7
+	packuswb xmm2, xmm5
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm0, xmm5
+	paddw	xmm3, xmm4
+	packuswb xmm3, xmm0
+
+	movdqa	xmm0, [esp+432-32]
+	psubw	xmm0, xmm4
+	movdqa	xmm4, [esp+432-80]
+	psubw	xmm4, xmm5
+
+	movdqa	xmm5, [esp+432-240]
+	paddw	xmm5, [esp+432-48]
+	packuswb xmm0, xmm4
+	movdqa	xmm4, [esp+432-384]
+	paddw	xmm4, [esp+432-304]
+	movdqa	[esp+480-208], xmm0
+	movdqa	xmm0, [esp+432-352]
+	movdqa	xmm7, xmm0
+	paddw	xmm0, xmm0
+
+	mov	ecx, dword [esp+432-408]
+
+	mov	edx, dword [esp+432-404]
+	psubw	xmm4, xmm0
+	movdqa	xmm0, [esp+432-336]
+	movdqa	[edi], xmm2
+	psraw	xmm4, 1
+	pmaxsw	xmm0, xmm4
+	pminsw	xmm1, xmm0
+	movdqa	xmm0, [esp+480-208]
+
+	pop	edi
+	pand	xmm1, xmm6
+	pand	xmm1, [esp+428-256]
+	movdqa	[ecx], xmm3
+	paddw	xmm7, xmm1
+	pop	esi
+	packuswb xmm5, xmm7
+	movdqa	[eax], xmm0
+	movdqa	[edx], xmm5
+	pop	ebx
+	mov	esp, ebp
+	pop	ebp
+	ret
+
+
+;*******************************************************************************
+;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+;                                 int32_t iBeta)
+;*******************************************************************************
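+
+; An illustrative reference for the strong (bS==4) luma filter below, assuming
+; the standard H.264 equations (descriptive names only); everything is gated
+; by |p0-q0| < iAlpha, |p1-p0| < iBeta and |q1-q0| < iBeta:
+;
+;   if (|p0-q0| < (iAlpha >> 2) + 2 && |p2-p0| < iBeta) {
+;       p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
+;       p1' = (p2 + p1 + p0 + q0 + 2) >> 2
+;       p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
+;   } else {
+;       p0' = (2*p1 + p0 + q1 + 2) >> 2
+;   }
+;   (and symmetrically for q0'/q1'/q2', gated by |q2-q0| < iBeta)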
+
+WELS_EXTERN  DeblockLumaEq4V_sse2
+
+ALIGN  16
+
+DeblockLumaEq4V_sse2:
+
+	push	ebp
+	mov	ebp, esp
+	and	esp, -16				; fffffff0H
+	sub	esp, 628				; 00000274H
+	mov	eax, dword [ebp+8]
+	mov	ecx, dword [ebp+12]
+	push	ebx
+	push	esi
+
+	lea	edx, [ecx*4]
+	pxor	xmm0, xmm0
+	movdqa	xmm2, xmm0
+
+	movdqa	xmm0, [ecx+eax]
+	mov	esi, eax
+	sub	esi, edx
+	movdqa	xmm3, [esi]
+	movdqa	xmm5, [eax]
+	push	edi
+	lea	edi, [ecx+ecx]
+	lea	ebx, [ecx+ecx*2]
+	mov	dword [esp+640-600], edi
+	mov	esi, eax
+	sub	esi, edi
+	movdqa	xmm1, [esi]
+	movdqa	 [esp+720-272], xmm0
+	mov	edi, eax
+	sub	edi, ecx
+	movdqa	xmm4, [edi]
+	add	ecx, eax
+	mov	dword [esp+640-596], ecx
+
+	mov	ecx, dword [esp+640-600]
+	movdqa	xmm0, [ecx+eax]
+	movdqa	 [esp+736-272], xmm0
+
+	movdqa	xmm0, [eax+ebx]
+	mov	edx, eax
+	sub	edx, ebx
+
+	movsx	ebx, word [ebp+16]
+	movdqa	xmm6, [edx]
+	add	ecx, eax
+	movdqa	 [esp+752-272], xmm0
+	movd	xmm0, ebx
+
+	movsx	ebx, word [ebp+20]
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0
+	movdqa	 [esp+640-320], xmm0
+	movd	xmm0, ebx
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0
+
+	movdqa	xmm7, [esp+736-272]
+	punpcklbw xmm7, xmm2
+	movdqa	 [esp+640-416], xmm7
+	movdqa	 [esp+640-512], xmm0
+	movdqa	xmm0, xmm1
+	movdqa	 [esp+672-272], xmm1
+	movdqa	xmm1, xmm4
+	movdqa	 [esp+704-272], xmm5
+	punpcklbw xmm5, xmm2
+	punpcklbw xmm1, xmm2
+
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm1
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-560], xmm7
+	punpcklbw xmm0, xmm2
+	movdqa	 [esp+688-272], xmm4
+	movdqa	xmm4, [esp+720-272]
+	movdqa	 [esp+640-480], xmm0
+
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm0
+
+	movdqa	xmm0, [esp+640-512]
+	pabsw	xmm7, xmm7
+	punpcklbw xmm4, xmm2
+	pcmpgtw	xmm0, xmm7
+	movdqa	 [esp+640-384], xmm4
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm4
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+656-272], xmm6
+	punpcklbw xmm6, xmm2
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-48], xmm2
+	movdqa	 [esp+640-368], xmm6
+	movdqa	 [esp+640-144], xmm1
+	movdqa	 [esp+640-400], xmm5
+	pcmpgtw	xmm4, xmm7
+	pand	xmm0, xmm4
+	movdqa	xmm4, [esp+640-320]
+	pcmpgtw	xmm4, [esp+640-560]
+	pand	xmm0, xmm4
+
+	mov	ebx, 2
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, [esp+640-320]
+	psraw	xmm4, 2
+	pshufd	xmm7, xmm7, 0
+	paddw	xmm4, xmm7
+	movdqa	 [esp+640-576], xmm4
+	pcmpgtw	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4
+
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+640-624], xmm7
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm6
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-544], xmm4
+	movdqa	xmm4, [esp+640-512]
+	movdqa	xmm7, xmm5
+	psubw	xmm7, [esp+640-416]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4
+
+	movdqa	xmm4, [esp+640-544]
+	pandn	xmm4, xmm6
+	movdqa	 [esp+640-16], xmm4
+	mov	ebx, 4
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, xmm3
+	punpcklbw xmm4, xmm2
+	psllw	xmm4, 1
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6
+	paddw	xmm4, [esp+640-480]
+
+	movdqa	xmm6, [esp+640-560]
+	pshufd	xmm7, xmm7, 0
+	paddw	xmm4, xmm1
+	movdqa	 [esp+640-592], xmm7
+	paddw	xmm4, xmm5
+	paddw	xmm4, xmm7
+	movdqa	xmm7, [esp+640-416]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-80], xmm6
+	movdqa	xmm6, [esp+752-272]
+	punpcklbw xmm6, xmm2
+	psllw	xmm6, 1
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-384]
+
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm6, xmm5
+	paddw	xmm6, xmm1
+	paddw	xmm6, [esp+640-592]
+	psraw	xmm6, 3
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-112], xmm6
+	movdqa	xmm6, [esp+640-544]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-336], xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-528], xmm6
+	movdqa	xmm6, [esp+640-368]
+	paddw	xmm6, xmm7
+	movdqa	xmm7, xmm1
+	psraw	xmm4, 3
+	pand	xmm4, [esp+640-544]
+	paddw	xmm7, xmm5
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+
+	paddw	xmm5, xmm1
+	psraw	xmm6, 2
+	pand	xmm7, xmm6
+
+	movdqa	xmm6, [esp+640-384]
+	movdqa	 [esp+640-64], xmm7
+	movdqa	xmm7, [esp+640-560]
+	pandn	xmm7, xmm6
+	movdqa	 [esp+640-304], xmm7
+	movdqa	xmm7, [esp+640-560]
+	movdqa	 [esp+640-528], xmm7
+	movdqa	xmm7, [esp+640-416]
+	paddw	xmm7, xmm6
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2
+	pand	xmm5, xmm7
+	movdqa	 [esp+640-32], xmm5
+
+	movdqa	xmm5, [esp+640-544]
+	movdqa	 [esp+640-528], xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	xmm7, xmm5
+	paddw	xmm7, xmm5
+	movdqa	xmm5, xmm1
+	paddw	xmm5, xmm6
+	paddw	xmm6, [esp+640-592]
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2
+	pandn	xmm5, xmm7
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm7, xmm1
+	paddw	xmm7, [esp+640-400]
+	movdqa	xmm1, [esp+640-544]
+	movdqa	 [esp+640-352], xmm5
+	movdqa	xmm5, [esp+640-368]
+	psllw	xmm7, 1
+	paddw	xmm7, xmm6
+	paddw	xmm5, xmm7
+
+	movdqa	xmm7, [esp+640-400]
+	psraw	xmm5, 3
+	pand	xmm1, xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	 [esp+640-96], xmm1
+	movdqa	xmm1, [esp+640-560]
+	movdqa	 [esp+640-528], xmm1
+	movdqa	xmm1, [esp+640-384]
+	movdqa	xmm6, xmm1
+	paddw	xmm6, xmm1
+	paddw	xmm1, [esp+640-400]
+	paddw	xmm1, [esp+640-144]
+	paddw	xmm7, xmm5
+	paddw	xmm5, [esp+640-592]
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+	psraw	xmm6, 2
+	psllw	xmm1, 1
+	paddw	xmm1, xmm5
+
+	movdqa	xmm5, [esp+656-272]
+	pandn	xmm7, xmm6
+	movdqa	xmm6, [esp+640-416]
+	paddw	xmm6, xmm1
+	movdqa	xmm1, [esp+640-560]
+	psraw	xmm6, 3
+	pand	xmm1, xmm6
+
+	movdqa	xmm6, [esp+704-272]
+	movdqa	 [esp+640-128], xmm1
+	movdqa	xmm1, [esp+672-272]
+	punpckhbw xmm1, xmm2
+	movdqa	 [esp+640-448], xmm1
+	movdqa	xmm1, [esp+688-272]
+	punpckhbw xmm1, xmm2
+	punpckhbw xmm6, xmm2
+	movdqa	 [esp+640-288], xmm7
+	punpckhbw xmm5, xmm2
+	movdqa	 [esp+640-496], xmm1
+	movdqa	 [esp+640-432], xmm6
+
+	movdqa	xmm7, [esp+720-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-464], xmm7
+
+	movdqa	xmm7, [esp+736-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-528], xmm7
+
+	movdqa	xmm7, xmm6
+
+	psubw	xmm6, [esp+640-464]
+	psubw	xmm7, xmm1
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-560], xmm7
+	por	xmm4, [esp+640-16]
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm1
+	psubw	xmm7, [esp+640-448]
+
+	movdqa	xmm1, [esp+640-512]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm1, xmm7
+	movdqa	xmm7, [esp+640-512]
+	pcmpgtw	xmm7, xmm6
+	movdqa	xmm6, [esp+640-320]
+	pand	xmm1, xmm7
+	movdqa	xmm7, [esp+640-560]
+	pcmpgtw	xmm6, xmm7
+	pand	xmm1, xmm6
+
+	movdqa	xmm6, [esp+640-576]
+	pcmpgtw	xmm6, xmm7
+
+	movdqa	xmm7, [esp+640-496]
+	punpckhbw xmm3, xmm2
+	movdqa	 [esp+640-560], xmm6
+	movdqa	xmm6, [esp+640-512]
+	psubw	xmm7, xmm5
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm6, xmm7
+
+	pand	xmm6, [esp+640-560]
+	movdqa	xmm7, [esp+640-432]
+	psubw	xmm7, [esp+640-528]
+
+	psllw	xmm3, 1
+	movdqa	 [esp+640-544], xmm6
+	movdqa	xmm6, [esp+640-512]
+
+	movdqa	xmm2, [esp+640-544]
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, [esp+640-448]
+	paddw	xmm3, [esp+640-496]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm6, xmm7
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-560], xmm6
+
+	movdqa	xmm6, xmm0
+	pand	xmm6, xmm4
+	movdqa	xmm4, xmm0
+	pandn	xmm4, [esp+640-368]
+	por	xmm6, xmm4
+	movdqa	xmm4, [esp+640-432]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-592]
+	psraw	xmm3, 3
+	pand	xmm3, xmm2
+	pandn	xmm2, xmm5
+	por	xmm3, xmm2
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm3
+	movdqa	xmm3, [esp+640-64]
+	por	xmm3, [esp+640-336]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm5
+	por	xmm7, xmm2
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-480]
+	por	xmm2, xmm3
+	packuswb xmm6, xmm7
+	movdqa	 [esp+640-336], xmm2
+	movdqa	 [esp+656-272], xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	xmm2, xmm5
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm3, xmm1
+	movdqa	xmm7, [esp+640-496]
+	paddw	xmm7, xmm4
+	paddw	xmm2, xmm7
+	paddw	xmm2, [esp+640-624]
+	movdqa	xmm7, [esp+640-544]
+	psraw	xmm2, 2
+	pand	xmm6, xmm2
+	movdqa	xmm2, [esp+640-448]
+	pandn	xmm7, xmm2
+	por	xmm6, xmm7
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm1
+	pandn	xmm6, xmm2
+	paddw	xmm2, [esp+640-496]
+	paddw	xmm2, xmm4
+	por	xmm3, xmm6
+	movdqa	xmm6, [esp+640-336]
+	packuswb xmm6, xmm3
+	psllw	xmm2, 1
+	movdqa	 [esp+672-272], xmm6
+	movdqa	xmm6, [esp+640-96]
+	por	xmm6, [esp+640-352]
+
+	movdqa	xmm3, xmm0
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm0
+	pandn	xmm6, [esp+640-144]
+	por	xmm3, xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-352], xmm3
+	movdqa	xmm3, [esp+640-464]
+	paddw	xmm3, [esp+640-592]
+	paddw	xmm2, xmm3
+	movdqa	xmm3, [esp+640-448]
+	paddw	xmm5, xmm2
+	movdqa	xmm2, [esp+640-496]
+	psraw	xmm5, 3
+	pand	xmm6, xmm5
+	movdqa	xmm5, [esp+640-464]
+	paddw	xmm2, xmm5
+	paddw	xmm5, [esp+640-432]
+	movdqa	xmm4, xmm3
+	paddw	xmm4, xmm3
+	paddw	xmm4, xmm2
+	paddw	xmm4, [esp+640-624]
+	movdqa	xmm2, [esp+640-544]
+	paddw	xmm3, [esp+640-592]
+	psraw	xmm4, 2
+	pandn	xmm2, xmm4
+	por	xmm6, xmm2
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-496]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm6
+	por	xmm7, xmm2
+	movdqa	xmm2, [esp+640-352]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+688-272], xmm2
+	movdqa	xmm2, [esp+640-128]
+	por	xmm2, [esp+640-288]
+
+	movdqa	xmm4, xmm0
+	pand	xmm4, xmm2
+	paddw	xmm5, xmm6
+	movdqa	xmm2, xmm0
+	pandn	xmm2, [esp+640-400]
+	por	xmm4, xmm2
+	movdqa	xmm2, [esp+640-528]
+	psllw	xmm5, 1
+	paddw	xmm5, xmm3
+	movdqa	xmm3, [esp+640-560]
+	paddw	xmm2, xmm5
+	psraw	xmm2, 3
+	movdqa	 [esp+640-288], xmm4
+	movdqa	xmm4, [esp+640-560]
+	pand	xmm4, xmm2
+	movdqa	xmm2, [esp+640-464]
+	movdqa	xmm5, xmm2
+	paddw	xmm5, xmm2
+	movdqa	xmm2, [esp+640-432]
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm7, xmm1
+	paddw	xmm5, xmm2
+	paddw	xmm5, [esp+640-624]
+	movdqa	xmm6, [esp+640-560]
+	psraw	xmm5, 2
+	pandn	xmm3, xmm5
+	por	xmm4, xmm3
+	movdqa	xmm3, [esp+640-32]
+	por	xmm3, [esp+640-304]
+	pand	xmm7, xmm4
+	movdqa	xmm4, [esp+640-432]
+	movdqa	xmm5, [esp+640-464]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm4
+	paddw	xmm4, [esp+640-496]
+	por	xmm7, xmm2
+	movdqa	xmm2, [esp+640-288]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+704-272], xmm2
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-384]
+	por	xmm2, xmm3
+	movdqa	 [esp+640-304], xmm2
+	movdqa	xmm2, [esp+640-528]
+	movdqa	xmm3, xmm2
+	paddw	xmm3, [esp+640-464]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-624]
+	psraw	xmm3, 2
+	pand	xmm6, xmm3
+	movdqa	xmm3, [esp+640-560]
+	movdqa	xmm4, xmm3
+	pandn	xmm4, xmm5
+	por	xmm6, xmm4
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-304]
+	movdqa	xmm4, xmm1
+	pandn	xmm4, xmm5
+	por	xmm7, xmm4
+
+	movdqa	xmm4, xmm0
+	pandn	xmm0, [esp+640-416]
+	packuswb xmm6, xmm7
+	movdqa	xmm7, [esp+640-112]
+	por	xmm7, [esp+640-80]
+	pand	xmm4, xmm7
+	por	xmm4, xmm0
+	movdqa	xmm0, [esp+752-272]
+	punpckhbw xmm0, [esp+640-48]
+	psllw	xmm0, 1
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm5
+	paddw	xmm0, [esp+640-432]
+	paddw	xmm0, [esp+640-496]
+	paddw	xmm0, [esp+640-592]
+	psraw	xmm0, 3
+	pand	xmm0, xmm3
+	movdqa	xmm7, xmm1
+	pandn	xmm3, xmm2
+	por	xmm0, xmm3
+	pand	xmm7, xmm0
+
+	movdqa	xmm0, [esp+656-272]
+	movdqa	 [edx], xmm0
+
+	movdqa	xmm0, [esp+672-272]
+
+	mov	edx, dword [esp+640-596]
+	movdqa	 [esi], xmm0
+	movdqa	xmm0, [esp+688-272]
+	movdqa	 [edi], xmm0
+	movdqa	xmm0, [esp+704-272]
+
+	pop	edi
+	pandn	xmm1, xmm2
+	movdqa	 [eax], xmm0
+	por	xmm7, xmm1
+	pop	esi
+	packuswb xmm4, xmm7
+	movdqa	 [edx], xmm6
+	movdqa	 [ecx], xmm4
+	pop	ebx
+	mov	esp, ebp
+	pop	ebp
+	ret
+
+
+;********************************************************************************
+;
+;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
+;
+;********************************************************************************
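+
+; A sketch of the data movement, assuming SSE2_TransTwo8x8B (pulled in via
+; asm_inc.asm) performs two interleaved 8x8 byte transposes:
+;
+;   gather : 16 rows x 8 bytes from pPixY; rows i and i+8 are paired into
+;            one xmm register via movq + punpcklqdq
+;   shuffle: SSE2_TransTwo8x8B turns the 16x8 tile into 8 rows of 16 bytes
+;   store  : 8 aligned movdqa writes to the contiguous pDst buffer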
+
+WELS_EXTERN  DeblockLumaTransposeH2V_sse2
+
+ALIGN  16
+
+DeblockLumaTransposeH2V_sse2:
+    push    ebp
+    push    ebx
+    mov     ebp,   esp
+    and     esp,0FFFFFFF0h
+    sub     esp,   10h
+
+    mov     eax,   [ebp + 0Ch]
+    mov     ecx,   [ebp + 10h]
+    lea     edx,   [eax + ecx * 8]
+    lea     ebx,   [ecx*3]
+
+    movq    xmm0,  [eax]
+    movq    xmm7,  [edx]
+    punpcklqdq   xmm0,  xmm7
+    movq    xmm1,  [eax + ecx]
+    movq    xmm7,  [edx + ecx]
+    punpcklqdq   xmm1,  xmm7
+    movq    xmm2,  [eax + ecx*2]
+    movq    xmm7,  [edx + ecx*2]
+    punpcklqdq   xmm2,  xmm7
+    movq    xmm3,  [eax + ebx]
+    movq    xmm7,  [edx + ebx]
+    punpcklqdq   xmm3,  xmm7
+
+    lea     eax,   [eax + ecx * 4]
+    lea     edx,   [edx + ecx * 4]
+    movq    xmm4,  [eax]
+    movq    xmm7,  [edx]
+    punpcklqdq   xmm4,  xmm7
+    movq    xmm5,  [eax + ecx]
+    movq    xmm7,  [edx + ecx]
+    punpcklqdq   xmm5,  xmm7
+    movq    xmm6,  [eax + ecx*2]
+    movq    xmm7,  [edx + ecx*2]
+    punpcklqdq   xmm6,  xmm7
+
+    movdqa  [esp],   xmm0
+    movq    xmm7,  [eax + ebx]
+    movq    xmm0,  [edx + ebx]
+    punpcklqdq   xmm7,  xmm0
+    movdqa  xmm0,   [esp]
+
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+
+    mov    eax,   [ebp + 14h]
+    movdqa  [eax],    xmm4
+    movdqa  [eax + 10h],  xmm2
+    movdqa  [eax + 20h],  xmm3
+    movdqa  [eax + 30h],  xmm7
+    movdqa  [eax + 40h],  xmm5
+    movdqa  [eax + 50h],  xmm1
+    movdqa  [eax + 60h],  xmm6
+    movdqa  [eax + 70h],  xmm0
+
+    mov     esp,   ebp
+    pop     ebx
+    pop     ebp
+    ret
+
+
+
+;*******************************************************************************************
+;
+;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
+;
+;*******************************************************************************************
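+
+; The inverse of DeblockLumaTransposeH2V_sse2 above: 8 contiguous 16-byte rows
+; are loaded from pSrc, transposed with the same macro, and scattered back as
+; 16 rows x 8 bytes at pPixY (movq stores the low quadwords, then psrldq
+; shifts the high quadwords down for the second group of 8 rows).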
+
+WELS_EXTERN   DeblockLumaTransposeV2H_sse2
+
+ALIGN  16
+
+DeblockLumaTransposeV2H_sse2:
+    push     ebp
+    mov      ebp,   esp
+
+    and     esp,  0FFFFFFF0h
+    sub     esp,   10h
+
+    mov      eax,   [ebp + 10h]
+    mov      ecx,   [ebp + 0Ch]
+    mov      edx,   [ebp + 08h]
+
+    movdqa   xmm0,  [eax]
+    movdqa   xmm1,  [eax + 10h]
+    movdqa   xmm2,  [eax + 20h]
+    movdqa   xmm3,  [eax + 30h]
+    movdqa   xmm4,  [eax + 40h]
+    movdqa   xmm5,  [eax + 50h]
+    movdqa   xmm6,  [eax + 60h]
+    movdqa   xmm7,  [eax + 70h]
+
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+
+    lea      eax,   [ecx * 3]
+
+    movq     [edx],  xmm4
+    movq     [edx + ecx],  xmm2
+    movq     [edx + ecx*2],  xmm3
+    movq     [edx + eax],  xmm7
+
+    lea      edx,   [edx + ecx*4]
+    movq     [edx],  xmm5
+    movq     [edx + ecx],  xmm1
+    movq     [edx + ecx*2],  xmm6
+    movq     [edx + eax],  xmm0
+
+    psrldq    xmm4,   8
+    psrldq    xmm2,   8
+    psrldq    xmm3,   8
+    psrldq    xmm7,   8
+    psrldq    xmm5,   8
+    psrldq    xmm1,   8
+    psrldq    xmm6,   8
+    psrldq    xmm0,   8
+
+    lea       edx,  [edx + ecx*4]
+    movq     [edx],  xmm4
+    movq     [edx + ecx],  xmm2
+    movq     [edx + ecx*2],  xmm3
+    movq     [edx + eax],  xmm7
+
+    lea      edx,   [edx + ecx*4]
+    movq     [edx],  xmm5
+    movq     [edx + ecx],  xmm1
+    movq     [edx + ecx*2],  xmm6
+    movq     [edx + eax],  xmm0
+
+
+    mov      esp,   ebp
+    pop      ebp
     ret
\ No newline at end of file
--- a/codec/encoder/core/asm/expand_picture.asm
+++ b/codec/encoder/core/asm/expand_picture.asm
@@ -153,11 +153,11 @@
 	lea %1, [%1+%2]
 %endmacro
 
-%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]		
+%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]
 	; ebx [width/16(8)]
 	; esi [pSrc+0], edi [pSrc-1], ecx [-stride], 32(16)		; top
 	; eax [pSrc+(h-1)*stride], ebp [pSrc+(h+31)*stride], 32(16)	; bottom
-		
+
 %if %1 == 32		; for luma
 	sar ebx, 04h 	; width / 16(8) pixels
 .top_bottom_loops:
@@ -171,7 +171,7 @@
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_end16x4_sse2 edi, ecx, xmm0, a
-	
+
 	; bottom
 	movdqa xmm1, [eax] 		; last line of picture pData
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
@@ -182,15 +182,15 @@
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-		
+
 	lea esi, [esi+16]		; top pSrc
 	lea edi, [edi+16]		; top dst
 	lea eax, [eax+16]		; bottom pSrc
 	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?	
-	
+	neg ecx 			; positive/negative stride needed for next loop?
+
 	dec ebx
-	jnz near .top_bottom_loops		
+	jnz near .top_bottom_loops
 %elif %1 == 16	; for chroma ??
 	mov edx, ebx
 	sar ebx, 04h 	; (width / 16) pixels
@@ -200,21 +200,21 @@
 	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_end16x4_sse2 edi, ecx, xmm0, a	
-	
+	mov_line_end16x4_sse2 edi, ecx, xmm0, a
+
 	; bottom
 	movdqa xmm1, [eax] 		; last line of picture pData
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_end16x4_sse2 ebp, ecx, xmm1, a	
-		
+	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
+
 	lea esi, [esi+16]		; top pSrc
 	lea edi, [edi+16]		; top dst
 	lea eax, [eax+16]		; bottom pSrc
 	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?	
-	
+	neg ecx 			; positive/negative stride needed for next loop?
+
 	dec ebx
 	jnz near .top_bottom_loops
 
@@ -241,13 +241,13 @@
 %endif
 %endmacro
 
-%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a	
+%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
 	; ecx [height]
 	; esi [pSrc+0], 	   edi [pSrc-32], edx [stride], 32(16)	; left
 	; ebx [pSrc+(w-1)], ebp [pSrc+w], 32(16)			; right
 ;	xor eax, eax 	; for pixel pData (uint8_t)		; make sure eax=0 at least high 24 bits of eax = 0
-	
-%if %1 == 32		; for luma	
+
+%if %1 == 32		; for luma
 .left_right_loops:
 	; left
 	mov al, byte [esi]		; pixel pData for left border
@@ -254,37 +254,37 @@
 	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdqa [edi], xmm0
 	movdqa [edi+16], xmm0
-	
+
 	; right
 	mov al, byte [ebx]
 	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdqa [ebp], xmm1
 	movdqa [ebp+16], xmm1
-	
+
 	lea esi, [esi+edx]		; left pSrc
 	lea edi, [edi+edx]		; left dst
 	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst	
-	
+	lea ebp, [ebp+edx]		; right dst
+
 	dec ecx
-	jnz near .left_right_loops		
-%elif %1 == 16	; for chroma ??	
+	jnz near .left_right_loops
+%elif %1 == 16	; for chroma ??
 .left_right_loops:
 	; left
 	mov al, byte [esi]		; pixel pData for left border
 	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [edi], xmm0	
-	
+	movdqa [edi], xmm0
+
 	; right
 	mov al, byte [ebx]
 	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdq%2 [ebp], xmm1								; might not be aligned 16 bytes in case chroma planes
-	
+
 	lea esi, [esi+edx]		; left pSrc
 	lea edi, [edi+edx]		; left dst
 	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst	
-	
+	lea ebp, [ebp+edx]		; right dst
+
 	dec ecx
 	jnz near .left_right_loops
 %endif
@@ -337,25 +337,25 @@
 	; TL
 	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?	
+	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 	mov_line_end16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 
 	; TR
 	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
 	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?	
+	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
 	mov_line_end16x4_sse2 ebp, ecx, xmm4, %2	; dst, stride, xmm?
 
 	; BL
 	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?	
+	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 	mov_line_end16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 
 	; BR
 	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?	
+	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 	mov_line_end16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 %endif
 %endmacro
@@ -373,7 +373,7 @@
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -385,10 +385,10 @@
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; stride	
+	mov ecx, edx							; stride
 	neg ecx 								; -stride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*stride
 	lea eax, [esi+eax]						; last line of picture pData
@@ -396,16 +396,16 @@
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 32 * stride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; width-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; width
-	exp_top_bottom_sse2	32	
-	
+	exp_top_bottom_sse2	32
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@@ -417,7 +417,7 @@
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
@@ -424,7 +424,7 @@
 	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	32, a
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -434,7 +434,7 @@
 	; xmm3,..,xmm6 cross pData initialized above; perform the padding below
 	mov eax, -32							; luma=-32, chroma=-16
 	neg ecx										; -stride
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
 	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
@@ -442,19 +442,19 @@
 	mov ecx, [esp+28]					; stride
 	imul edx, ecx							; (height+32(16)) * stride
 	lea eax, [edi+edx]						; last line of bottom-left border
-	lea ebx, [ebp+edx]						; last line of bottom-right border	
+	lea ebx, [ebp+edx]						; last line of bottom-right border
 	neg ecx										; -stride
 	; for left & right border expanding
-	exp_cross_sse2		32, a	
-	
+	exp_cross_sse2		32, a
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret
 
 ALIGN 16
@@ -470,7 +470,7 @@
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -482,10 +482,10 @@
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; stride	
+	mov ecx, edx							; stride
 	neg ecx 								; -stride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*stride
 	lea eax, [esi+eax]						; last line of picture pData
@@ -493,16 +493,16 @@
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 16 * stride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; width-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; width
-	exp_top_bottom_sse2	16	
-	
+	exp_top_bottom_sse2	16
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@@ -514,7 +514,7 @@
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
@@ -521,7 +521,7 @@
 	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	16, a
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -531,9 +531,9 @@
 	; xmm3,..,xmm6 cross pData initialized above; perform the padding below
 	mov eax, -16							; chroma=-16
 	neg ecx										; -stride
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]				
+	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
 	mov ecx, [esp+28]						; stride
 	add edx, 16							; height+16, luma=32, chroma=16
@@ -543,15 +543,15 @@
 	neg ecx										; -stride
 	; for left & right border expanding
 	exp_cross_sse2		16, a
-	
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret
 
 ALIGN 16
@@ -567,7 +567,7 @@
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -579,10 +579,10 @@
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; stride	
+	mov ecx, edx							; stride
 	neg ecx 								; -stride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*stride
 	lea eax, [esi+eax]						; last line of picture pData
@@ -590,16 +590,16 @@
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 16 * stride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; width-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; width
-	exp_top_bottom_sse2	16	
-	
+	exp_top_bottom_sse2	16
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@@ -611,7 +611,7 @@
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
@@ -618,7 +618,7 @@
 	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	16, u
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -628,9 +628,9 @@
 	; xmm3,..,xmm6 cross pData initialized above; perform the padding below
 	neg ecx									; -stride
 	mov eax, -16							; chroma=-16
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]						
+	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
 	mov ecx, [esp+28]						; stride
 	add edx, 16							; height+16, luma=32, chroma=16
@@ -640,14 +640,14 @@
 	neg ecx									; -stride
 	; for left & right border expanding
 	exp_cross_sse2		16, u
-	
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret
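
The expand routines above all implement the same replicate-padding scheme at different border widths (32 pixels for luma, 16 for chroma): the left/right borders repeat each row's edge pixel, and the top/bottom borders repeat the padded first and last rows, which also fills the four corners. A scalar C sketch of that scheme, with illustrative names rather than code from this patch:

#include <stdint.h>
#include <string.h>

/* Replicate-pad one plane: left/right columns first, then copy the
 * already-padded first/last rows outward, which covers the corners.
 * pad is 32 for a luma plane, 16 for a chroma plane. */
static void ExpandPlane(uint8_t *p, int32_t w, int32_t h,
                        int32_t stride, int32_t pad) {
  for (int32_t y = 0; y < h; y++) {
    uint8_t *row = p + y * stride;
    memset(row - pad, row[0], (size_t)pad);     /* left border  */
    memset(row + w, row[w - 1], (size_t)pad);   /* right border */
  }
  uint8_t *top = p - pad;                       /* padded first row */
  uint8_t *bot = p + (h - 1) * stride - pad;    /* padded last row  */
  for (int32_t y = 1; y <= pad; y++) {
    memcpy(top - y * stride, top, (size_t)(w + 2 * pad)); /* top + corners    */
    memcpy(bot + y * stride, bot, (size_t)(w + 2 * pad)); /* bottom + corners */
  }
}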
 
--- a/codec/encoder/core/asm/intra_pred.asm
+++ b/codec/encoder/core/asm/intra_pred.asm
@@ -95,13 +95,13 @@
 	punpcklbw	%1,	%3
 	movdqa		%3,	%1
 	punpcklbw	%1,	%3
-	
+
 	;add			%4,	%5
 	movd		%2,	[%4+%5-1]
 	movdqa		%3,	%2
 	punpcklbw	%2,	%3
 	movdqa		%3,	%2
-	punpcklbw	%2,	%3	
+	punpcklbw	%2,	%3
 	punpckldq	%1,	%2
 %endmacro
 
@@ -126,24 +126,24 @@
 		movd	%2,	[%5+%6]
 		punpcklbw %3,	%2
 		punpcklwd %1,	%3
-		lea		%5,	[%5+2*%6]	
+		lea		%5,	[%5+2*%6]
 		movd	%4,	[%5]
 		movd	%2,	[%5+%6]
 		punpcklbw %4,	%2
-		lea		%5,	[%5+2*%6]	
+		lea		%5,	[%5+2*%6]
 		movd	%3,	[%5]
 		movd	%2,	[%5+%6]
 		lea		%5,	[%5+2*%6]
 		punpcklbw %3,	%2
 		punpcklwd %4,	%3
-		punpckhdq %1,	%4	
-%endmacro	
+		punpckhdq %1,	%4
+%endmacro
 
 %macro  SUMW_HORIZON 3
 	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
 	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04 
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26 
+	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
 	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
 	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
 	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
@@ -173,7 +173,7 @@
 		movd	%2,	[%5+%6]
 		punpcklbw %3,	%2
 		punpckhwd %1,	%3
-		lea		%5,	[%5+2*%6]			
+		lea		%5,	[%5+2*%6]
 %endmacro
 
 %macro LOAD_2_LEFT_AND_ADD 0
@@ -197,7 +197,7 @@
 ALIGN 16
 ;***********************************************************************
 ;   void __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;   
+;
 ;	pred must align to 16
 ;***********************************************************************
 WelsI4x4LumaPredH_sse2:
@@ -207,11 +207,11 @@
 	movzx		edx,	byte [eax-1]
 	movd		xmm0,	edx
 	pmuludq		xmm0,	[mmx_01bytes]
-	
+
 	movzx		edx,	byte [eax+ecx-1]
 	movd		xmm1,	edx
 	pmuludq		xmm1,	[mmx_01bytes]
-	
+
 	unpcklps	xmm0,	xmm1
 
 	lea			eax,	[eax+ecx*2]
@@ -218,19 +218,19 @@
 	movzx		edx,	byte [eax-1]
 	movd		xmm2,	edx
 	pmuludq		xmm2,	[mmx_01bytes]
-	
+
 	movzx		edx,	byte [eax+ecx-1]
-	movd		xmm3,	edx	
+	movd		xmm3,	edx
 	pmuludq		xmm3,	[mmx_01bytes]
-	
+
 	unpcklps	xmm2,	xmm3
 	unpcklpd	xmm0,	xmm2
-	
+
 	mov			edx,	[esp+4]			;pred
 	movdqa		[edx],	xmm0
-	
+
 	ret
-	
+
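
The predictor above uses a byte-broadcast idiom that recurs throughout these files: multiplying a zero-extended pixel by the [mmx_01bytes] constant (0x01010101) replicates it across a dword; the DC predictors later do the same with imul 0x01010101. A scalar equivalent of the whole 4x4 horizontal predictor (a sketch, names illustrative):

#include <stdint.h>
#include <string.h>

/* 4x4 horizontal intra prediction: every row repeats its left neighbour.
 * b * 0x01010101u replicates one byte into all four bytes of a dword,
 * mirroring the pmuludq-by-[mmx_01bytes] idiom above. */
static void I4x4PredH(uint8_t pred[16], const uint8_t *ref, int32_t stride) {
  for (int i = 0; i < 4; i++) {
    uint32_t v = ref[i * stride - 1] * 0x01010101u; /* left neighbour of row i */
    memcpy(pred + 4 * i, &v, 4);
  }
}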
 ;***********************************************************************
 ; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
@@ -241,9 +241,9 @@
 		mov		ecx,	[esp + pushsize + 12]
 		sub		esi,	1
 		sub		esi,	ecx
-		
+
 		;for H
-		pxor	xmm7,	xmm7	
+		pxor	xmm7,	xmm7
 		movq	xmm0,	[esi]
 		movdqa	xmm5,	[sse2_plane_dec]
 		punpcklbw xmm0,	xmm7
@@ -253,7 +253,7 @@
 		punpcklbw xmm1,	xmm7
 		pmullw	xmm1,	xmm6
 		psubw	xmm1,	xmm0
-		
+
 		SUMW_HORIZON	xmm1,xmm0,xmm2
 		movd    eax,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
 		movsx	eax,	ax
@@ -261,26 +261,26 @@
 		add		eax,	32
 		sar		eax,	6			; b = (5 * H + 32) >> 6;
 		SSE2_Copy8Times	xmm1, eax	; xmm1 = b,b,b,b,b,b,b,b
-		
-		movzx	edx,	BYTE [esi+16]	
+
+		movzx	edx,	BYTE [esi+16]
 		sub	esi, 3
 		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, esi, ecx
-			
+
 		add		esi,	3
 		movzx	eax,	BYTE [esi+8*ecx]
 		add		edx,	eax
 		shl		edx,	4			;	a = (left[15*stride] + top[15]) << 4;
-		
+
 		sub	esi, 3
 		add		esi,	ecx
 		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, esi, ecx
-		pxor	xmm4,	xmm4	
+		pxor	xmm4,	xmm4
 		punpckhbw xmm0,	xmm4
 		pmullw	xmm0,	xmm5
 		punpckhbw xmm7,	xmm4
 		pmullw	xmm7,	xmm6
 		psubw	xmm7,	xmm0
-		
+
 		SUMW_HORIZON   xmm7,xmm0,xmm2
 		movd    eax,   xmm7			; V
 		movsx	eax,	ax
@@ -288,17 +288,17 @@
 		imul	eax,	5
 		add		eax,	32
 		sar		eax,	6				; c = (5 * V + 32) >> 6;
-		SSE2_Copy8Times	xmm4, eax		; xmm4 = c,c,c,c,c,c,c,c		
-		
+		SSE2_Copy8Times	xmm4, eax		; xmm4 = c,c,c,c,c,c,c,c
+
 		mov		esi,	[esp + pushsize + 4]
 		add		edx,	16
 		imul	eax,	-7
-		add		edx,	eax				; s = a + 16 + (-7)*c		
-		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s		
-		
+		add		edx,	eax				; s = a + 16 + (-7)*c
+		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s
+
 		xor		eax,	eax
 		movdqa	xmm5,	[sse2_plane_inc_minus]
-		
+
 get_i16x16_luma_pred_plane_sse2_1:
 		movdqa	xmm2,	xmm1
 		pmullw	xmm2,	xmm5
@@ -307,7 +307,7 @@
 		movdqa	xmm3,	xmm1
 		pmullw	xmm3,	xmm6
 		paddw	xmm3,	xmm0
-		psraw	xmm3,	5	
+		psraw	xmm3,	5
 		packuswb xmm2,	xmm3
 		movdqa	[esi],	xmm2
 		paddw	xmm0,	xmm4
@@ -314,13 +314,13 @@
 		add		esi,	16
 		inc		eax
 		cmp		eax,	16
-		jnz get_i16x16_luma_pred_plane_sse2_1					
-		
+		jnz get_i16x16_luma_pred_plane_sse2_1
+
 		pop		esi
 		ret
-		
-		
-		
+
+
+
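
The loop above evaluates the 16x16 plane predictor row by row with the gradients b and c broadcast across xmm registers. The same arithmetic in scalar form, following the H/V/a/b/c comments above (a sketch):

#include <stdint.h>

static uint8_t Clip255(int32_t v) { return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

/* 16x16 plane prediction:
 *   H = sum (i+1)*(top[8+i] - top[6-i]),  V likewise down the left column,
 *   a = 16*(left[15] + top[15]), b = (5*H+32)>>6, c = (5*V+32)>>6,
 *   pred[y][x] = clip((a + b*(x-7) + c*(y-7) + 16) >> 5). */
static void I16x16PredPlane(uint8_t *pred, const uint8_t *ref, int32_t stride) {
  const uint8_t *top = ref - stride;
  const uint8_t *left = ref - 1;
  int32_t H = 0, V = 0;
  for (int i = 0; i < 8; i++) {
    H += (i + 1) * (top[8 + i] - top[6 - i]);
    V += (i + 1) * (left[(8 + i) * stride] - left[(6 - i) * stride]);
  }
  int32_t a = 16 * (left[15 * stride] + top[15]);
  int32_t b = (5 * H + 32) >> 6;
  int32_t c = (5 * V + 32) >> 6;
  for (int y = 0; y < 16; y++)
    for (int x = 0; x < 16; x++)
      pred[y * 16 + x] = Clip255((a + b * (x - 7) + c * (y - 7) + 16) >> 5);
}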
 ;***********************************************************************
 ; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
@@ -327,7 +327,7 @@
 
 %macro SSE2_PRED_H_16X16_TWO_LINE 1
     lea     eax,	[eax+ecx*2]
-    
+
     COPY_16_TIMES	eax,	xmm0
     movdqa			[edx+%1],	xmm0
    COPY_16_TIMESS eax,	xmm0,	ecx
@@ -340,13 +340,13 @@
     mov     edx, [esp+4]    ; pred
     mov     eax, [esp+8]	; pRef
     mov     ecx, [esp+12]   ; stride
-    
+
     COPY_16_TIMES eax,	xmm0
     movdqa  [edx],		xmm0
     COPY_16_TIMESS eax,	xmm0,	ecx
     movdqa  [edx+0x10],	xmm0
-    
-	SSE2_PRED_H_16X16_TWO_LINE   0x20 
+
+	SSE2_PRED_H_16X16_TWO_LINE   0x20
 	SSE2_PRED_H_16X16_TWO_LINE   0x40
 	SSE2_PRED_H_16X16_TWO_LINE   0x60
 	SSE2_PRED_H_16X16_TWO_LINE   0x80
@@ -353,9 +353,9 @@
 	SSE2_PRED_H_16X16_TWO_LINE   0xa0
 	SSE2_PRED_H_16X16_TWO_LINE   0xc0
 	SSE2_PRED_H_16X16_TWO_LINE   0xe0
-   
+
     ret
-    
+
 ;***********************************************************************
 ; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
@@ -364,10 +364,10 @@
     mov     edx, [esp+4]    ; pred
     mov     eax, [esp+8]	; pRef
     mov     ecx, [esp+12]   ; stride
-    
+
     sub     eax, ecx
     movdqa  xmm0, [eax]
-    
+
     movdqa  [edx], xmm0
     movdqa  [edx+10h], xmm0
     movdqa  [edx+20h], xmm0
@@ -378,15 +378,15 @@
     movdqa  [edx+70h], xmm0
     movdqa  [edx+80h], xmm0
     movdqa  [edx+90h], xmm0
-    movdqa  [edx+160], xmm0 
+    movdqa  [edx+160], xmm0
 	movdqa  [edx+176], xmm0
     movdqa  [edx+192], xmm0
     movdqa  [edx+208], xmm0
     movdqa  [edx+224], xmm0
     movdqa  [edx+240], xmm0
-    
+
     ret
-    
+
 ;***********************************************************************
 ; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
@@ -398,8 +398,8 @@
 		mov		ecx,	[esp + pushsize + 12]	;stride
 		sub		esi,	1
 		sub		esi,	ecx
-		
-		pxor	mm7,	mm7	
+
+		pxor	mm7,	mm7
 		movq	mm0,	[esi]
 		movq	mm5,	[sse2_plane_dec_c]
 		punpcklbw mm0,	mm7
@@ -409,7 +409,7 @@
 		punpcklbw mm1,	mm7
 		pmullw	mm1,	mm6
 		psubw	mm1,	mm0
-		
+
 		movq2dq xmm1,   mm1
 		pxor    xmm2,   xmm2
 		SUMW_HORIZON	xmm1,xmm0,xmm2
@@ -419,7 +419,7 @@
 		add		eax,	16
 		sar		eax,	5			; b = (17 * H + 16) >> 5;
 		SSE2_Copy8Times	xmm1, eax	; mm1 = b,b,b,b,b,b,b,b
-		
+
 		movzx	edx,	BYTE [esi+8]
 		sub	esi, 3
 		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, esi, ecx
@@ -428,17 +428,17 @@
 		movzx	eax,	BYTE [esi+4*ecx]
 		add		edx,	eax
 		shl		edx,	4			; a = (left[7*stride] + top[7]) << 4;
-		
+
 		sub	esi, 3
 		add		esi,	ecx
 		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, esi, ecx
-		pxor	mm4,	mm4	
+		pxor	mm4,	mm4
 		punpckhbw mm0,	mm4
 		pmullw	mm0,	mm5
 		punpckhbw mm7,	mm4
 		pmullw	mm7,	mm6
 		psubw	mm7,	mm0
-		
+
 		movq2dq xmm7,   mm7
 		pxor    xmm2,   xmm2
 		SUMW_HORIZON	xmm7,xmm0,xmm2
@@ -448,17 +448,17 @@
 		imul	eax,	17
 		add		eax,	16
 		sar		eax,	5				; c = (17 * V + 16) >> 5;
-		SSE2_Copy8Times	xmm4, eax		; mm4 = c,c,c,c,c,c,c,c		
-		
+		SSE2_Copy8Times	xmm4, eax		; mm4 = c,c,c,c,c,c,c,c
+
 		mov		esi,	[esp + pushsize + 4]
 		add		edx,	16
 		imul	eax,	-3
-		add		edx,	eax				; s = a + 16 + (-3)*c		
-		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s		
-		
+		add		edx,	eax				; s = a + 16 + (-3)*c
+		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s
+
 		xor		eax,	eax
 		movdqa	xmm5,	[sse2_plane_mul_b_c]
-		
+
 get_i_chroma_pred_plane_sse2_1:
 		movdqa	xmm2,	xmm1
 		pmullw	xmm2,	xmm5
@@ -470,12 +470,12 @@
 		add		esi,	8
 		inc		eax
 		cmp		eax,	8
-		jnz get_i_chroma_pred_plane_sse2_1					
-		
+		jnz get_i_chroma_pred_plane_sse2_1
+
 		pop		esi
 		WELSEMMS
-		ret	
-		
+		ret
+
 ALIGN 16
 ;***********************************************************************
 ;	0 |1 |2 |3 |4 |
@@ -487,13 +487,13 @@
 ;	pred[7] = ([6]+[0]*2+[1]+2)/4
 ;
 ;   void __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;   
+;
 ;***********************************************************************
-WelsI4x4LumaPredDDR_mmx:	
+WelsI4x4LumaPredDDR_mmx:
 	mov			edx,[esp+4]			;pred
 	mov         eax,[esp+8]			;pRef
 	mov			ecx,[esp+12]		;stride
-	
+
 	movq        mm1,[eax+ecx-8]		;get value of [11]; reading 8 bytes back keeps the movq load cheap, mm1[8] = [11]
 	movq        mm2,[eax-8]			;get value of [6], mm2[8] = [6]
 	sub			eax, ecx			;move eax to the line above the current block (position of [1])
@@ -520,17 +520,17 @@
 	pand        mm1,[mmx_01bytes]	;set the odd bit
 	psubusb     mm3,mm1				;decrease 1 from odd bytes
 	pavgb       mm2,mm3				;mm2=(([11]+[21]+1)/2+1+[16])/2
-	
-	movd        [edx+12],mm2 
-	psrlq       mm2,8 
-	movd        [edx+8],mm2 
-	psrlq       mm2,8 
-	movd        [edx+4],mm2 
-	psrlq       mm2,8 
+
+	movd        [edx+12],mm2
+	psrlq       mm2,8
+	movd        [edx+8],mm2
+	psrlq       mm2,8
+	movd        [edx+4],mm2
+	psrlq       mm2,8
 	movd        [edx],mm2
 	WELSEMMS
 	ret
-	
+
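
DDR above (and the HD/HU/VR/DDL/VL neighbours below) all need the 4-tap value (l + 2*c + r + 2) >> 2, but pavgb only provides the rounding average (a+b+1)>>1. The recurring pxor / pand [mmx_01bytes] / psubusb sequence turns the inner average into a floor average so that two pavgb steps compose exactly. The identity in scalar form (a sketch):

#include <stdint.h>

/* (l + 2*c + r + 2) >> 2 from two rounding averages:
 *   inner  = ((l + r + 1) >> 1) - ((l ^ r) & 1)   == floor((l + r) / 2)
 *   result = (inner + c + 1) >> 1                 == (l + 2*c + r + 2) >> 2
 * psubusb never underflows: the correction is 1 only when l+r is odd,
 * and then the rounding average is at least 1. */
static uint8_t Filter121(uint8_t l, uint8_t c, uint8_t r) {
  uint8_t inner = (uint8_t)(((l + r + 1) >> 1) - ((l ^ r) & 1));
  return (uint8_t)((inner + c + 1) >> 1);
}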
 ALIGN 16
 ;***********************************************************************
 ;	0 |1 |2 |3 |4 |
@@ -542,44 +542,44 @@
 ;	pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
 ;
 ;   void __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;   
+;
 ;***********************************************************************
-WelsI4x4LumaPredDc_sse2:	
+WelsI4x4LumaPredDc_sse2:
 	mov         eax,[esp+8]			;pRef
 	mov			ecx,[esp+12]		;stride
 	push		ebx
-		
+
 	movzx		edx,	byte [eax-1h]
-	
+
 	sub			eax,	ecx
 	movd		xmm0,	[eax]
 	pxor		xmm1,	xmm1
 	psadbw		xmm0,	xmm1
-	
+
 	movd		ebx,	xmm0
 	add			ebx,	edx
-	
+
 	movzx		edx,	byte [eax+ecx*2-1h]
 	add			ebx,	edx
-	
+
 	lea			eax,	[eax+ecx*2-1]
 	movzx		edx,	byte [eax+ecx]
 	add			ebx,	edx
-	
+
 	movzx		edx,	byte [eax+ecx*2]
 	add			ebx,	edx
 	add			ebx,	4
 	sar			ebx,	3
 	imul		ebx,	0x01010101
-	
+
 	mov			edx,	[esp+8]			;pred
 	movd		xmm0,	ebx
 	pshufd		xmm0,	xmm0,	0
 	movdqa		[edx],	xmm0
-				
+
 	pop ebx
-	ret	
-	
+	ret
+
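
Scalar form of the DC predictor above: sum the four top neighbours (done with psadbw against zero), add the four left neighbours, round, shift, and broadcast (the imul 0x01010101 / pshufd pair). A sketch:

#include <stdint.h>

/* 4x4 DC prediction: (top[0..3] + left[0..3] + 4) >> 3 over the block. */
static void I4x4PredDc(uint8_t pred[16], const uint8_t *ref, int32_t stride) {
  int32_t s = 4;
  for (int i = 0; i < 4; i++)
    s += ref[i - stride] + ref[i * stride - 1];
  uint8_t dc = (uint8_t)(s >> 3);
  for (int i = 0; i < 16; i++)
    pred[i] = dc;
}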
 ALIGN 16
 ;***********************************************************************
 ;	void __cdecl WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
@@ -588,7 +588,7 @@
 %macro MMX_PRED_H_8X8_ONE_LINE 4
 	movq		%1,		[%3-8]
 	psrlq		%1,		38h
-	
+
 	;pmuludq		%1,		[mmx_01bytes]		;extend to 4 bytes
 	pmullw		%1,		[mmx_01bytes]
 	pshufw		%1,		%1,	0
@@ -598,7 +598,7 @@
 %macro MMX_PRED_H_8X8_ONE_LINEE 4
 	movq		%1,		[%3+ecx-8]
 	psrlq		%1,		38h
-	
+
 	;pmuludq		%1,		[mmx_01bytes]		;extend to 4 bytes
 	pmullw		%1,		[mmx_01bytes]
 	pshufw		%1,		%1,	0
@@ -610,34 +610,34 @@
 	mov			edx,	[esp+4]			;pred
 	mov         eax,	[esp+8]			;pRef
 	mov			ecx,	[esp+12]		;stride
-	
+
 	movq		mm0,	[eax-8]
 	psrlq		mm0,	38h
-	
+
 	;pmuludq		mm0,	[mmx_01bytes]		;extend to 4 bytes
 	pmullw		mm0,		[mmx_01bytes]
 	pshufw		mm0,	mm0,	0
 	movq		[edx],	mm0
-	
+
 	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+8
-	
+
 	lea			eax,[eax+ecx*2]
 	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax,edx+16
-	
+
 	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+24
-	
+
 	lea			eax,[eax+ecx*2]
 	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax,edx+32
-	
+
 	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+40
-	
+
 	lea			eax,[eax+ecx*2]
 	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax,edx+48
 
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+56		
+	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+56
 	WELSEMMS
-	ret	
-	
+	ret
+
 ALIGN 16
 ;***********************************************************************
 ;	void __cdecl WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
@@ -648,12 +648,12 @@
 	mov			edx,	[esp+4]			;pred
 	mov         eax,	[esp+8]			;pRef
 	mov			ecx,	[esp+12]		;stride
-	
+
 	sub			eax,	ecx
 	movd		xmm0,	[eax]
 	pshufd		xmm0,	xmm0,	0
 	movdqa		[edx],	xmm0
-	ret	
+	ret
 
 ALIGN 16
 ;***********************************************************************
@@ -665,7 +665,7 @@
 	mov			edx,		[esp+4]			;pred
 	mov         eax,		[esp+8]			;pRef
 	mov			ecx,		[esp+12]		;stride
-	
+
 	sub			eax,		ecx
 	movq		xmm0,		[eax]
 	movdqa		xmm1,		xmm0
@@ -676,8 +676,8 @@
 	movdqa		[edx+32],	xmm0
 	movdqa		[edx+48],	xmm0
 	ret
-	
-	
+
+
 	ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -703,13 +703,13 @@
 
 ;   f = (2 + l1 + (l0<<1) + lt)>>2
 ;   h = (2 + l2 + (l1<<1) + l0)>>2
-;   j = (2 + l3 + (l2<<1) + l1)>>2   
+;   j = (2 + l3 + (l2<<1) + l1)>>2
 ;   [b a f e h g j i] + [d c b a] --> mov to memory
-;   
+;
 ;   void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredHD_mmx
-WelsI4x4LumaPredHD_mmx:	
+WelsI4x4LumaPredHD_mmx:
 	mov			edx, [esp+4]			; pred
 	mov         eax, [esp+8]			; pRef
 	mov			ecx, [esp+12]           ; stride
@@ -716,16 +716,16 @@
 	sub         eax, ecx
 	movd        mm0, [eax-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
 	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
-	
-	movd        mm1, [eax+2*ecx-4]        
-	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1	
+
+	movd        mm1, [eax+2*ecx-4]
+	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1
 	lea         eax, [eax+2*ecx]
-	movd        mm2, [eax+2*ecx-4]        
+	movd        mm2, [eax+2*ecx-4]
 	punpcklbw   mm2, [eax+ecx-4]        ; mm2[7] = l2, mm2[6] = l3
 	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
 	psrlq       mm2, 20h
 	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
-	
+
 	movq        mm1, mm0
 	psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
 	movq        mm2, mm0
@@ -733,17 +733,17 @@
 	movq        mm3, mm2
 	movq        mm4, mm1
 	pavgb       mm1, mm0
-	
+
 	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
 	pand        mm4, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm1, mm4				; decrease 1 from odd bytes
-	
+
 	pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
-	
+
 	movq        mm4, mm0
 	pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
 	punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
-	
+
 	psrlq       mm2, 20h
 	psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
 	movq        mm4, mm3
@@ -750,7 +750,7 @@
 	psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  j]
 	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
 	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
-	
+
 	movd        [edx], mm2
 	movd        [edx+12], mm3
 	psrlq       mm3, 10h
@@ -759,9 +759,9 @@
 	movd        [edx+4], mm3
 	WELSEMMS
 	ret
-	
-	
-	
+
+
+
 ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -784,17 +784,17 @@
 ;   b = (2 + l0 + (l1<<1) + l2)>>2
 ;   d = (2 + l1 + (l2<<1) + l3)>>2
 ;   f = (2 + l2 + (l3<<1) + l3)>>2
- 
+
 ;   [g g f e d c b a] + [g g g g] --> mov to memory
-;   
+;
 ;   void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredHU_mmx
-WelsI4x4LumaPredHU_mmx:	
+WelsI4x4LumaPredHU_mmx:
 	mov			edx, [esp+4]			; pred
 	mov         eax, [esp+8]			; pRef
 	mov			ecx, [esp+12]           ; stride
-	
+
 	movd        mm0, [eax-4]            ; mm0[3] = l0
 	punpcklbw   mm0, [eax+ecx-4]        ; mm0[7] = l1, mm0[6] = l0
 	lea         eax, [eax+2*ecx]
@@ -802,38 +802,38 @@
 	movd        mm4, [eax+ecx-4]        ; mm4[3] = l3
 	punpcklbw   mm2, mm4
 	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
-	
+
 	psrlq       mm4, 18h
 	psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
 	psrlq       mm0, 8h
 	pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
-	
+
 	movq        mm1, mm0
 	psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
 	movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
 	pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
-	
+
 	movq        mm2, mm0
 	psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
 	movq        mm5, mm2
 	pavgb       mm2, mm0
-	
+
 	pxor        mm5, mm0				; find odd value in the lowest bit of each byte
 	pand        mm5, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm2, mm5				; decrease 1 from odd bytes
-	
+
 	pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
-	
+
 	psrlq       mm2, 8h
 	pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
-	
+
 	punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
 	punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
 	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
-	
+
 	psrlq       mm4, 20h
 	movd        [edx+12], mm4
-	
+
 	movd        [edx], mm1
 	psrlq       mm1, 10h
 	movd        [edx+4], mm1
@@ -841,9 +841,9 @@
 	movd        [edx+8], mm1
 	WELSEMMS
 	ret
-	
-	
-	
+
+
+
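
The a..g table above maps onto output rows [a b c d], [c d e f], [e f g g] and [g g g g]; as scalar C (a sketch):

#include <stdint.h>

/* 4x4 horizontal-up: built from the left column l0..l3 only; positions
 * past the end of the column saturate to g = l3. */
static void I4x4PredHU(uint8_t pred[16], const uint8_t *ref, int32_t stride) {
  int l[4];
  for (int i = 0; i < 4; i++)
    l[i] = ref[i * stride - 1];
  uint8_t v[8]; /* a b c d e f g g */
  for (int i = 0; i < 3; i++) {
    int n = (i + 2 > 3) ? l[3] : l[i + 2];          /* clamp beyond l3 */
    v[2 * i]     = (uint8_t)((l[i] + l[i + 1] + 1) >> 1);
    v[2 * i + 1] = (uint8_t)((l[i] + 2 * l[i + 1] + n + 2) >> 2);
  }
  v[6] = v[7] = (uint8_t)l[3];
  for (int y = 0; y < 4; y++)
    for (int x = 0; x < 4; x++) {
      int i = 2 * y + x;
      pred[y * 4 + x] = (i > 7) ? (uint8_t)l[3] : v[i];
    }
}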
 ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -869,12 +869,12 @@
 
 ;   h = (2 + t1 + (t2<<1) + t3)>>2
 ;   i = (2 + lt + (l0<<1) + l1)>>2
-;   j = (2 + l0 + (l1<<1) + l2)>>2   
-;   
+;   j = (2 + l0 + (l1<<1) + l2)>>2
+;
 ;   void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredVR_mmx
-WelsI4x4LumaPredVR_mmx:	
+WelsI4x4LumaPredVR_mmx:
 	mov			edx, [esp+4]			; pred
 	mov         eax, [esp+8]			; pRef
 	mov			ecx, [esp+12]           ; stride
@@ -881,57 +881,57 @@
 	sub         eax, ecx
 	movq        mm0, [eax-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
 	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
-	
-	movd        mm1, [eax+2*ecx-4]        
-	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1	
+
+	movd        mm1, [eax+2*ecx-4]
+	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1
 	lea         eax, [eax+2*ecx]
 	movq        mm2, [eax+ecx-8]        ; mm2[7] = l2
 	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
 	psrlq       mm2, 28h
 	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
-	
+
 	movq        mm1, mm0
 	psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
 	pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
-	
+
 	movq        mm2, mm0
 	psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
 	movq        mm3, mm2
 	pavgb       mm2, mm0
-	
+
 	pxor        mm3, mm0				; find odd value in the lowest bit of each byte
 	pand        mm3, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm2, mm3				; decrease 1 from odd bytes
-	
+
 	movq        mm3, mm0
 	psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
 	pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
 	movq        mm2, mm3
-	
+
 	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
 	movd        [edx], mm1
-	
+
 	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
 	movd        [edx+4], mm2
-	
+
 	movq        mm4, mm3
 	psllq       mm4, 20h
 	psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
-	
+
 	movq        mm5, mm3
 	psllq       mm5, 28h
 	psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
-	
+
 	psllq       mm1, 8h
 	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
 	movd        [edx+8], mm4
-	
+
 	psllq       mm2, 8h
 	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
 	movd        [edx+12], mm5
 	WELSEMMS
 	ret
-	
+
 ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -954,13 +954,13 @@
 ;   e = (2 + t4 + t6 + (t5<<1))>>2
 ;   f = (2 + t5 + t7 + (t6<<1))>>2
 ;   g = (2 + t6 + t7 + (t7<<1))>>2
- 
+
 ;   [g f e d c b a] --> mov to memory
-;   
+;
 ;   void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredDDL_mmx
-WelsI4x4LumaPredDDL_mmx:	
+WelsI4x4LumaPredDDL_mmx:
 	mov			edx, [esp+4]			; pred
 	mov         eax, [esp+8]			; pRef
 	mov			ecx, [esp+12]           ; stride
@@ -968,11 +968,11 @@
 	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
 	movq        mm1, mm0
 	movq        mm2, mm0
-	
+
 	movq        mm3, mm0
 	psrlq       mm3, 38h
 	psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
-	
+
 	psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
 	psrlq       mm2, 8h
 	pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
@@ -982,9 +982,9 @@
 	pxor        mm3, mm2				; find odd value in the lowest bit of each byte
 	pand        mm3, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm1, mm3				; decrease 1 from odd bytes
-	
+
 	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
-	
+
 	psrlq       mm0, 8h
 	movd        [edx], mm0
 	psrlq       mm0, 8h
@@ -995,8 +995,8 @@
 	movd        [edx+12], mm0
 	WELSEMMS
 	ret
-	
-	
+
+
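
In scalar form the x..g table above collapses to one 3-tap filter along the top row, with the sample past t7 clamped (the g case). A sketch:

#include <stdint.h>

/* 4x4 diagonal-down-left: pred[y][x] = (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2
 * with i = x + y, reading t[0..7] from the row above; t[8] clamps to t[7]. */
static void I4x4PredDDL(uint8_t pred[16], const uint8_t *ref, int32_t stride) {
  const uint8_t *t = ref - stride;
  for (int y = 0; y < 4; y++)
    for (int x = 0; x < 4; x++) {
      int i = x + y;
      int t2 = (i + 2 > 7) ? t[7] : t[i + 2];
      pred[y * 4 + x] = (uint8_t)((t[i] + 2 * t[i + 1] + t2 + 2) >> 2);
    }
}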
 ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -1022,46 +1022,46 @@
 ;   g = (2 + t2 + (t3<<1) + t4)>>2
 ;   h = (2 + t3 + (t4<<1) + t5)>>2
 ;   j = (2 + t4 + (t5<<1) + t6)>>2
- 
+
 ;   [i d c b a] + [j h g f e] --> mov to memory
-;   
+;
 ;   void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredVL_mmx
-WelsI4x4LumaPredVL_mmx:	
+WelsI4x4LumaPredVL_mmx:
 	mov			edx, [esp+4]			; pred
 	mov         eax, [esp+8]			; pRef
 	mov			ecx, [esp+12]           ; stride
-	
+
 	sub         eax, ecx
 	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
 	movq        mm1, mm0
 	movq        mm2, mm0
-	
+
 	psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
 	psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
 
 	movq        mm3, mm1
 	pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
-	
+
 	movq        mm4, mm2
-	pavgb       mm2, mm0	
+	pavgb       mm2, mm0
 	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
 	pand        mm4, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm2, mm4				; decrease 1 from odd bytes
-	
+
 	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
-	
+
 	movd        [edx], mm3
 	psrlq       mm3, 8h
 	movd        [edx+8], mm3
-	
+
 	movd        [edx+4], mm2
 	psrlq       mm2, 8h
 	movd        [edx+12], mm2
 	WELSEMMS
 	ret
-	
+
 ALIGN 16
 ;***********************************************************************
 ;
@@ -1068,14 +1068,14 @@
 ;   void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsIChromaPredDc_sse2
-WelsIChromaPredDc_sse2:	
+WelsIChromaPredDc_sse2:
 	push        ebx
 	mov         eax, [esp+12]			; pRef
 	mov			ecx, [esp+16]           ; stride
-	
+
 	sub         eax, ecx
 	movq        mm0, [eax]
-	
+
 	;xor         ebx, ebx
 	;movzx		edx, byte [eax+ecx-0x01] ; l1
 	movzx		ebx, byte [eax+ecx-0x01] ; l1
@@ -1089,7 +1089,7 @@
 	movzx		edx, byte [eax-0x01]     ; l4
 	add			ebx, edx
 	movd        mm1, ebx                 ; mm1 = l1+l2+l3+l4
-	
+
 	;xor         ebx, ebx
 	;movzx		edx, byte [eax+ecx-0x01] ; l5
 	movzx		ebx, byte [eax+ecx-0x01] ; l5
@@ -1103,7 +1103,7 @@
 	movzx		edx, byte [eax-0x01]     ; l8
 	add			ebx, edx
 	movd        mm2, ebx                 ; mm2 = l5+l6+l7+l8
-	
+
 	movq        mm3, mm0
 	psrlq       mm0, 0x20
 	psllq       mm3, 0x20
@@ -1110,56 +1110,56 @@
 	psrlq       mm3, 0x20
 	pxor		mm4, mm4
 	psadbw		mm0, mm4
-	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2	
-	
+	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+
 	paddq       mm3, mm1
 	movq        mm1, mm2
 	paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-	
+
 	movq        mm4, [mmx_0x02]
-	
+
 	paddq       mm0, mm4
 	psrlq       mm0, 0x02
-	
+
 	paddq       mm2, mm4
 	psrlq       mm2, 0x02
-	
+
 	paddq       mm3, mm4
 	paddq       mm3, mm4
 	psrlq       mm3, 0x03
-	
+
 	paddq       mm1, mm4
 	paddq       mm1, mm4
 	psrlq       mm1, 0x03
-	
+
 	pmuludq     mm0, [mmx_01bytes]
 	pmuludq     mm3, [mmx_01bytes]
 	psllq       mm0, 0x20
 	pxor        mm0, mm3                 ; mm0 = m_up
-	
+
 	pmuludq     mm2, [mmx_01bytes]
 	pmuludq     mm1, [mmx_01bytes]
 	psllq       mm1, 0x20
 	pxor        mm1, mm2                 ; mm2 = m_down
-	
+
 	mov         edx, [esp+8]			 ; pRef
-	
+
 	movq        [edx], mm0
 	movq        [edx+0x08], mm0
 	movq        [edx+0x10], mm0
 	movq        [edx+0x18], mm0
-	
+
 	movq        [edx+0x20], mm1
 	movq        [edx+0x28], mm1
 	movq        [edx+0x30], mm1
 	movq        [edx+0x38], mm1
-	
+
 	pop         ebx
 	WELSEMMS
 	ret
-	
-	
-	
+
+
+
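
The m_up/m_down values built above hold four per-quadrant DC values: top-left and bottom-right average both neighbour groups, top-right uses only the top row, bottom-left only the left column. A scalar sketch:

#include <stdint.h>

/* 8x8 chroma DC prediction, one DC value per 4x4 quadrant. */
static void IChromaPredDc(uint8_t pred[64], const uint8_t *ref, int32_t stride) {
  int32_t t0 = 0, t1 = 0, l0 = 0, l1 = 0;
  for (int i = 0; i < 4; i++) {
    t0 += ref[i - stride];           /* top, left half   */
    t1 += ref[i + 4 - stride];       /* top, right half  */
    l0 += ref[i * stride - 1];       /* left, upper half */
    l1 += ref[(i + 4) * stride - 1]; /* left, lower half */
  }
  uint8_t dc[4] = {
    (uint8_t)((t0 + l0 + 4) >> 3),   /* top-left     */
    (uint8_t)((t1 + 2) >> 2),        /* top-right    */
    (uint8_t)((l1 + 2) >> 2),        /* bottom-left  */
    (uint8_t)((t1 + l1 + 4) >> 3),   /* bottom-right */
  };
  for (int y = 0; y < 8; y++)
    for (int x = 0; x < 8; x++)
      pred[y * 8 + x] = dc[(y >> 2) * 2 + (x >> 2)];
}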
 ALIGN 16
 ;***********************************************************************
 ;
@@ -1166,11 +1166,11 @@
 ;   void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI16x16LumaPredDc_sse2
-WelsI16x16LumaPredDc_sse2:	
+WelsI16x16LumaPredDc_sse2:
 	push        ebx
 	mov         eax, [esp+12]			; pRef
 	mov			ecx, [esp+16]           ; stride
-	
+
 	sub         eax, ecx
 	movdqa      xmm0, [eax]             ; read one row
 	pxor		xmm1, xmm1
@@ -1180,7 +1180,7 @@
 	pslldq      xmm0, 0x08
 	psrldq      xmm0, 0x08
 	paddw       xmm0, xmm1
-	
+
 	;xor         ebx, ebx
 	;movzx		edx, byte [eax+ecx-0x01]
 	movzx		ebx, byte [eax+ecx-0x01]
@@ -1201,7 +1201,7 @@
 	psrld       xmm0, 0x05
 	pmuludq     xmm0, [mmx_01bytes]
 	pshufd      xmm0, xmm0, 0
-	
+
 	mov         edx, [esp+8]			; pred
 	movdqa      [edx], xmm0
 	movdqa      [edx+0x10], xmm0
@@ -1219,7 +1219,7 @@
 	movdqa      [edx+0xd0], xmm0
 	movdqa      [edx+0xe0], xmm0
 	movdqa      [edx+0xf0], xmm0
-	
+
 	pop         ebx
 
 	ret
@@ -1226,7 +1226,7 @@
 
 ;***********************************************************************
 ;
-;int32_t WelsSmpleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc, 
+;int32_t WelsSmpleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
 ;                             uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
 ;
 ;***********************************************************************
@@ -1238,7 +1238,7 @@
 	push      edi
 	mov       eax,  [esp+24];p_enc
 	mov       ebx,  [esp+28];linesize_enc
-	
+
 	; load source 4x4 samples and Hadamard transform
     movd      xmm0, [eax]
     movd      xmm1, [eax+ebx]
@@ -1247,16 +1247,16 @@
     movd      xmm3, [eax+ebx]
     punpckldq xmm0, xmm2
     punpckldq xmm1, xmm3
-       
+
     pxor      xmm6, xmm6
     punpcklbw xmm0, xmm6
     punpcklbw xmm1, xmm6
-    
+
     movdqa    xmm2, xmm0
     paddw     xmm0, xmm1
     psubw     xmm2, xmm1
     SSE2_XSawp  qdq, xmm0, xmm2, xmm3
-    
+
     movdqa    xmm4, xmm0
     paddw     xmm0, xmm3
     psubw     xmm4, xmm3
@@ -1264,7 +1264,7 @@
     movdqa    xmm2, xmm0
     punpcklwd xmm0, xmm4
     punpckhwd xmm4, xmm2
-    
+
 	SSE2_XSawp  dq,  xmm0, xmm4, xmm3
 	SSE2_XSawp  qdq, xmm0, xmm3, xmm5
 
@@ -1271,14 +1271,14 @@
     movdqa    xmm7, xmm0
     paddw     xmm0, xmm5
     psubw     xmm7, xmm5
-    
+
 	SSE2_XSawp  qdq,  xmm0, xmm7, xmm1
-    
+
     ; Hadamard transform results are saved in xmm0 and xmm2
     movdqa    xmm2, xmm0
     paddw     xmm0, xmm1
     psubw     xmm2, xmm1
-  	
+
 	; load top boundary samples: [a b c d]
     mov       eax,  [esp+16];p_dec
 	sub		  eax,	[esp+20];linesize_dec
@@ -1286,7 +1286,7 @@
 	movzx     edx,  byte [eax+1]
 	movzx     esi,  byte [eax+2]
 	movzx     edi,  byte [eax+3]
-	
+
 	; get the transform results of top boundary samples: [a b c d]
 	add       edx, ecx ; edx = a + b
 	add       edi, esi ; edi = c + d
@@ -1300,7 +1300,7 @@
 	add       esi, ecx ; esi = (a - b) + (c - d)
 	add       ecx, ecx
 	sub       ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]
-	
+
 	movdqa    xmm6, xmm0
 	movdqa    xmm7, xmm2
 	movd      xmm5, edi ; store the edi for DC mode
@@ -1312,16 +1312,16 @@
 	pinsrw    xmm4, edx, 0
 	pinsrw    xmm4, ecx, 4
 	psllw     xmm4, 2
-	
+
 	; get the satd of H
 	psubw     xmm0, xmm3
 	psubw     xmm2, xmm4
-	
+
 	WELS_AbsW  xmm0, xmm1
 	WELS_AbsW  xmm2, xmm1
     paddusw        xmm0, xmm2
     SUMW_HORIZON1  xmm0, xmm1 ; satd of V is stored in xmm0
-	
+
 	; load left boundary samples: [a b c d]'
     mov       eax,  [esp+16]
 	mov       ebx,  [esp+20]
@@ -1330,7 +1330,7 @@
 	lea       eax , [eax+2*ebx]
 	movzx     esi,  byte [eax-1]
 	movzx     edi,  byte [eax+ebx-1]
-	
+
 	; get the transform results of left boundary samples: [a b c d]'
 	add       edx, ecx ; edx = a + b
 	add       edi, esi ; edi = c + d
@@ -1344,14 +1344,14 @@
 	add       esi, ecx ; esi = (a - b) + (c - d)
 	add       ecx, ecx
 	sub       ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]'
-	
-	; store the transform results in xmm3	
+
+	; store the transform results in xmm3
     movd      xmm3, edi
 	pinsrw    xmm3, edx, 1
 	pinsrw    xmm3, ecx, 2
 	pinsrw    xmm3, esi, 3
 	psllw     xmm3, 2
-	
+
 	; get the satd of V
 	movdqa    xmm2, xmm6
 	movdqa    xmm4, xmm7
@@ -1368,7 +1368,7 @@
 	psrlw     xmm1, 3
 	movdqa    xmm5, xmm1
 	psllw     xmm1, 4
-	
+
     ; get the satd of DC
     psubw          xmm6, xmm1
     WELS_AbsW  xmm6, xmm1
@@ -1375,7 +1375,7 @@
 	WELS_AbsW  xmm7, xmm1
     paddusw        xmm6, xmm7
     SUMW_HORIZON1  xmm6, xmm1 ; satd of DC is stored in xmm6
-    
+
     ; comparing order: DC H V
     mov       edx, [esp+32]
     movd      eax, xmm6
@@ -1394,9 +1394,9 @@
     jg near   not_dc
     cmp       ax, si
     jg near   not_dc_h
-    
+
     ; for DC mode
-    movd      ebx, xmm5 
+    movd      ebx, xmm5
     imul      ebx, 0x01010101
     movd	  xmm5, ebx
 	pshufd    xmm5, xmm5, 0
@@ -1407,11 +1407,11 @@
     pop       esi
     pop       ebx
     ret
-    
+
 not_dc:
     cmp       di, si
     jg near   not_dc_h
-    
+
     ; for H mode
     SSE_DB_1_2REG  xmm6, xmm7
     mov       eax,  [esp+16]
@@ -1422,20 +1422,20 @@
 
 	movzx     ecx,  byte [eax+ebx-1]
 	movd      xmm1, ecx
-    pmuludq   xmm1, xmm6 
+    pmuludq   xmm1, xmm6
 %if 1
     punpckldq xmm0, xmm1
-%else    
+%else
 	unpcklps  xmm0,	xmm1
 %endif
 	lea       eax,	[eax+ebx*2]
 	movzx	  ecx,	byte [eax-1]
 	movd	  xmm2,	ecx
-    pmuludq   xmm2, xmm6  
+    pmuludq   xmm2, xmm6
 
 	movzx	  ecx,	byte [eax+ebx-1]
-	movd	  xmm3,	ecx	
-    pmuludq   xmm3, xmm6  
+	movd	  xmm3,	ecx
+    pmuludq   xmm3, xmm6
 %if 1
     punpckldq  xmm2, xmm3
     punpcklqdq xmm0, xmm2
@@ -1442,13 +1442,13 @@
 %else
 	unpcklps  xmm2,	xmm3
 	unpcklpd  xmm0,	xmm2
-%endif	
+%endif
 	movdqa	  [edx],xmm0
-	
+
 	mov       eax, edi
     mov       ebx, [esp+36]
 	mov       dword [ebx], 0x01
-    
+
     pop       edi
     pop       esi
     pop       ebx
@@ -1460,14 +1460,14 @@
 	movd	  xmm0,	[eax]
 	pshufd	  xmm0,	xmm0, 0
 	movdqa	  [edx],xmm0
-	
+
 	mov       eax, esi
     mov       ebx, [esp+36]
 	mov       dword [ebx], 0x00
-    
+
     pop       edi
     pop       esi
     pop       ebx
     ret
-    
+
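
WelsSmpleSatdThree4x4_sse2 above compares the DC/H/V candidates in the Hadamard (SATD) domain: the source block is transformed once, each predictor's transform is formed cheaply from the boundary sums, and the winner is the smallest transform-domain SAD. For reference, an unnormalised scalar 4x4 SATD between two blocks (a sketch; the routine above never materialises the prediction like this):

#include <stdint.h>
#include <stdlib.h>

/* 4x4 SATD: Hadamard transform of the residual, then sum of magnitudes.
 * Scale is irrelevant when only comparing candidate modes. */
static int32_t Satd4x4(const uint8_t *a, int32_t sa,
                       const uint8_t *b, int32_t sb) {
  int32_t d[16], m[16], satd = 0;
  for (int y = 0; y < 4; y++)
    for (int x = 0; x < 4; x++)
      d[y * 4 + x] = a[y * sa + x] - b[y * sb + x];
  for (int y = 0; y < 4; y++) {  /* horizontal butterflies */
    int32_t s0 = d[y * 4 + 0] + d[y * 4 + 3], s1 = d[y * 4 + 1] + d[y * 4 + 2];
    int32_t s2 = d[y * 4 + 0] - d[y * 4 + 3], s3 = d[y * 4 + 1] - d[y * 4 + 2];
    m[y * 4 + 0] = s0 + s1; m[y * 4 + 1] = s0 - s1;
    m[y * 4 + 2] = s2 + s3; m[y * 4 + 3] = s2 - s3;
  }
  for (int x = 0; x < 4; x++) {  /* vertical butterflies */
    int32_t s0 = m[x] + m[12 + x], s1 = m[4 + x] + m[8 + x];
    int32_t s2 = m[x] - m[12 + x], s3 = m[4 + x] - m[8 + x];
    satd += abs(s0 + s1) + abs(s0 - s1) + abs(s2 + s3) + abs(s2 - s3);
  }
  return satd;
}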
 
--- a/codec/encoder/core/asm/intra_pred_util.asm
+++ b/codec/encoder/core/asm/intra_pred_util.asm
@@ -32,7 +32,7 @@
 ;*  intra_pred_util.asm
 ;*
 ;*  Abstract
-;*      mmxext/sse for WelsFillingPred8to16, WelsFillingPred8x2to16 and 
+;*      mmxext/sse for WelsFillingPred8to16, WelsFillingPred8x2to16 and
 ;*		WelsFillingPred1to16 etc.
 ;*
 ;*  History
@@ -84,7 +84,7 @@
 	movq mm0, [ecx]
 	movq [eax  ], mm0
 	movq [eax+8], mm0
-	
+
 	WELSEMMS
 	ret
 
@@ -100,16 +100,16 @@
 	movq mm1, [ecx+8]
 	movq [eax  ], mm0
 	movq [eax+8], mm1
-	
+
 	WELSEMMS
 
 	ret
 
 %macro butterfly_1to8_mmx	3	; mm? for dst, mm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
-	mov %3h, %3l	
-	movd %2, e%3x		; i.e, 1% = eax (=b0)	
-	pshufw %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0	
-%endmacro 
+	mov %3h, %3l
+	movd %2, e%3x		; i.e. %2 = eax (=b0)
+	pshufw %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
 
 ALIGN 16
 ;***********************************************************************----------------
@@ -120,10 +120,10 @@
 
 	mov cl, byte [esp+8]	; v
 	butterfly_1to8_mmx	mm0, mm1, c	; mm? for dst, mm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
-	
+
 	movq [eax  ], mm0
 	movq [eax+8], mm0
-	
+
 	WELSEMMS
 
 	ret
@@ -136,9 +136,9 @@
 	mov eax, [esp+4]	; pred
 	mov ecx, [esp+8]	; v
 
-	movdqa xmm0, [ecx]	
-	movdqa [eax], xmm0	
-	
+	movdqa xmm0, [ecx]
+	movdqa [eax], xmm0
+
 	ret
 
 ALIGN 16
@@ -150,7 +150,7 @@
 
 	mov cl, byte [esp+8]	; v
 	butterfly_1to16_sse	xmm0, xmm1, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	
+
 	movdqa [eax], xmm0
-	
+
 	ret
--- a/codec/encoder/core/asm/mb_copy.asm
+++ b/codec/encoder/core/asm/mb_copy.asm
@@ -32,7 +32,7 @@
 ;*  mb_copy.asm
 ;*
 ;*  Abstract
-;*      mb_copy 
+;*      mb_copy
 ;*
 ;*
 ;*********************************************************************************************/
@@ -52,9 +52,9 @@
 WELS_EXTERN WelsCopy16x16_sse2
 WELS_EXTERN WelsCopy16x16NotAligned_sse2
 WELS_EXTERN WelsCopy8x8_mmx
-WELS_EXTERN WelsCopy16x8NotAligned_sse2	; 
-WELS_EXTERN WelsCopy8x16_mmx		; 
-WELS_EXTERN UpdateMbMv_sse2		; 
+WELS_EXTERN WelsCopy16x8NotAligned_sse2	;
+WELS_EXTERN WelsCopy8x16_mmx		;
+WELS_EXTERN UpdateMbMv_sse2		;
 
 ;***********************************************************************
 ; void WelsCopy16x16_sse2(	uint8_t* Dst,
@@ -66,7 +66,7 @@
 WelsCopy16x16_sse2:
 	push esi
 	push edi
-	push ebx	
+	push ebx
 
 	mov edi, [esp+16]	; Dst
 	mov eax, [esp+20]	; iStrideD
@@ -107,7 +107,7 @@
 	movdqa xmm5, [esi+ecx]
 	movdqa xmm6, [esi+2*ecx]
 	movdqa xmm7, [esi+edx]
-	
+
 	movdqa [edi], xmm0
 	movdqa [edi+eax], xmm1
 	movdqa [edi+2*eax], xmm2
@@ -116,7 +116,7 @@
 	movdqa [edi], xmm4
 	movdqa [edi+eax], xmm5
 	movdqa [edi+2*eax], xmm6
-	movdqa [edi+ebx], xmm7	
+	movdqa [edi+ebx], xmm7
 
 	pop ebx
 	pop edi
@@ -134,7 +134,7 @@
 WelsCopy16x16NotAligned_sse2:
 	push esi
 	push edi
-	push ebx	
+	push ebx
 
 	mov edi, [esp+16]	; Dst
 	mov eax, [esp+20]	; iStrideD
@@ -175,7 +175,7 @@
 	movdqu xmm5, [esi+ecx]
 	movdqu xmm6, [esi+2*ecx]
 	movdqu xmm7, [esi+edx]
-	
+
 	movdqa [edi], xmm0
 	movdqa [edi+eax], xmm1
 	movdqa [edi+2*eax], xmm2
@@ -184,8 +184,8 @@
 	movdqa [edi], xmm4
 	movdqa [edi+eax], xmm5
 	movdqa [edi+2*eax], xmm6
-	movdqa [edi+ebx], xmm7	
-	
+	movdqa [edi+ebx], xmm7
+
 	pop ebx
 	pop edi
 	pop esi
@@ -202,7 +202,7 @@
 WelsCopy16x8NotAligned_sse2:
 	push esi
 	push edi
-	push ebx	
+	push ebx
 
 	mov edi, [esp+16]	; Dst
 	mov eax, [esp+20]	; iStrideD
@@ -220,7 +220,7 @@
 	movdqu xmm4, [esi]
 	movdqu xmm5, [esi+ecx]
 	movdqu xmm6, [esi+2*ecx]
-	movdqu xmm7, [esi+edx]	
+	movdqu xmm7, [esi+edx]
 
 	movdqa [edi], xmm0
 	movdqa [edi+eax], xmm1
@@ -231,7 +231,7 @@
 	movdqa [edi+eax], xmm5
 	movdqa [edi+2*eax], xmm6
 	movdqa [edi+ebx], xmm7
-	
+
 	pop ebx
 	pop edi
 	pop esi
@@ -245,7 +245,7 @@
 ;                       int32_t  iStrideS )
 ;***********************************************************************
 ALIGN 16
-WelsCopy8x16_mmx:	
+WelsCopy8x16_mmx:
 	push ebx
 
 	mov eax, [esp + 8 ]           ;Dst
@@ -253,60 +253,60 @@
 	mov ebx, [esp + 16]           ;Src
 	mov edx, [esp + 20]           ;iStrideS
 
-	movq mm0, [ebx]	
-	movq mm1, [ebx+edx]	
+	movq mm0, [ebx]
+	movq mm1, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm2, [ebx]	
-	movq mm3, [ebx+edx]	
+	movq mm2, [ebx]
+	movq mm3, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm4, [ebx]	
-	movq mm5, [ebx+edx]	
+	movq mm4, [ebx]
+	movq mm5, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm6, [ebx]	
-	movq mm7, [ebx+edx]	
+	movq mm6, [ebx]
+	movq mm7, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	
-	movq [eax], mm0	
-	movq [eax+ecx], mm1	
+
+	movq [eax], mm0
+	movq [eax+ecx], mm1
 	lea eax, [eax+2*ecx]
-	movq [eax], mm2	
+	movq [eax], mm2
 	movq [eax+ecx], mm3
 	lea eax, [eax+2*ecx]
-	movq [eax], mm4	
+	movq [eax], mm4
 	movq [eax+ecx], mm5
 	lea eax, [eax+2*ecx]
-	movq [eax], mm6	
+	movq [eax], mm6
 	movq [eax+ecx], mm7
 	lea eax, [eax+2*ecx]
 
-	movq mm0, [ebx]	
-	movq mm1, [ebx+edx]	
+	movq mm0, [ebx]
+	movq mm1, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm2, [ebx]	
-	movq mm3, [ebx+edx]	
+	movq mm2, [ebx]
+	movq mm3, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm4, [ebx]	
-	movq mm5, [ebx+edx]	
+	movq mm4, [ebx]
+	movq mm5, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm6, [ebx]	
-	movq mm7, [ebx+edx]		
-	
-	movq [eax], mm0	
-	movq [eax+ecx], mm1	
+	movq mm6, [ebx]
+	movq mm7, [ebx+edx]
+
+	movq [eax], mm0
+	movq [eax+ecx], mm1
 	lea eax, [eax+2*ecx]
-	movq [eax], mm2	
+	movq [eax], mm2
 	movq [eax+ecx], mm3
 	lea eax, [eax+2*ecx]
-	movq [eax], mm4	
+	movq [eax], mm4
 	movq [eax+ecx], mm5
 	lea eax, [eax+2*ecx]
-	movq [eax], mm6	
-	movq [eax+ecx], mm7	
+	movq [eax], mm6
+	movq [eax+ecx], mm7
 
 	WELSEMMS
-	pop ebx	
+	pop ebx
 	ret
-	
+
 ;***********************************************************************
 ; void WelsCopy8x8_mmx(  uint8_t* Dst,
 ;                        int32_t  iStrideD,
@@ -314,7 +314,7 @@
 ;                        int32_t  iStrideS )
 ;***********************************************************************
 ALIGN 16
-WelsCopy8x8_mmx:	
+WelsCopy8x8_mmx:
 	push ebx
 	push esi
 	mov eax, [esp + 12]           ;Dst
@@ -343,7 +343,7 @@
 	lea esi, [esi+2*ebx]
 	movq mm6, [esi]
 	movq mm7, [esi+ebx]
-	
+
 	movq [eax], mm0
 	movq [eax+ecx], mm1
 	lea eax, [eax+2*ecx]
@@ -355,12 +355,12 @@
 	lea eax, [eax+2*ecx]
 	movq [eax], mm6
 	movq [eax+ecx], mm7
-		
+
 	WELSEMMS
-	pop esi	
+	pop esi
 	pop ebx
 	ret
-	
+
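
All of the WelsCopy* variants reduce to a strided row copy; the SSE2 versions differ only in movdqa versus movdqu (aligned vs. unaligned access) and in how many rows they unroll. A scalar sketch:

#include <stdint.h>
#include <string.h>

/* Strided block copy: the scalar shape of WelsCopy16x16/16x8/8x16/8x8. */
static void CopyBlock(uint8_t *dst, int32_t ds,
                      const uint8_t *src, int32_t ss,
                      int32_t w, int32_t h) {
  for (int32_t y = 0; y < h; y++)
    memcpy(dst + y * ds, src + y * ss, (size_t)w);
}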
 ; (dunhuang@cisco), 12/21/2011
 ;***********************************************************************
 ; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
@@ -417,8 +417,8 @@
 WELS_EXTERN McCopyWidthEq4_mmx
 WELS_EXTERN McCopyWidthEq8_mmx
 WELS_EXTERN McCopyWidthEq16_sse2
-                          
 
+
 ALIGN 16
 ;***********************************************************************
 ; void PixelAvgWidthEq8_mmx( uint8_t *dst,  int32_t iDstStride,
@@ -432,19 +432,19 @@
     push        esi
     push        edi
 
-    mov         edi, [esp+20]       
-    mov         esi, [esp+28]       
-    mov         edx, [esp+36]       
-    mov         ebp, [esp+24]       
-    mov         eax, [esp+32]       
-    mov         ebx, [esp+40]       
-    mov         ecx, [esp+44]       
+    mov         edi, [esp+20]
+    mov         esi, [esp+28]
+    mov         edx, [esp+36]
+    mov         ebp, [esp+24]
+    mov         eax, [esp+32]
+    mov         ebx, [esp+40]
+    mov         ecx, [esp+44]
 	sar			ecx, 2
 .height_loop:
-	movq        mm0, [esi]	
+	movq        mm0, [esi]
     pavgb       mm0, [edx]
     movq        [edi], mm0
-	movq		mm1, [esi+eax]		
+	movq		mm1, [esi+eax]
 	pavgb		mm1, [edx+ebx]
 	movq		[edi+ebp], mm1
 	lea         edi, [edi+2*ebp]
@@ -451,19 +451,19 @@
 	lea         esi, [esi+2*eax]
 	lea         edx, [edx+2*ebx]
 
-	movq        mm2, [esi]	
+	movq        mm2, [esi]
 	pavgb       mm2, [edx]
     movq        [edi], mm2
-	movq		mm3, [esi+eax]	
+	movq		mm3, [esi+eax]
 	pavgb		mm3, [edx+ebx]
 	movq		[edi+ebp], mm3
 	lea         edi, [edi+2*ebp]
 	lea         esi, [esi+2*eax]
 	lea         edx, [edx+2*ebx]
-	
+
 	dec         ecx
     jne         .height_loop
-	
+
 	WELSEMMS
     pop         edi
     pop         esi
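
The PixelAvg* routines implement the bi-prediction merge: a per-byte rounding average, which is exactly what pavgb computes. A scalar sketch of the width-8 case (names illustrative):

#include <stdint.h>

/* dst = (src1 + src2 + 1) >> 1 per byte, 8 pixels per row. */
static void PixelAvgWidth8(uint8_t *dst, int32_t ds,
                           const uint8_t *a, int32_t as,
                           const uint8_t *b, int32_t bs, int32_t h) {
  for (int32_t y = 0; y < h; y++) {
    for (int x = 0; x < 8; x++)
      dst[x] = (uint8_t)((a[x] + b[x] + 1) >> 1);
    dst += ds; a += as; b += bs;
  }
}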
@@ -485,19 +485,19 @@
     push        esi
     push        edi
 
-    mov         edi, [esp+20]       
-    mov         esi, [esp+28]       
-    mov         edx, [esp+36]       
-    mov         ebp, [esp+24]       
-    mov         eax, [esp+32]       
-    mov         ebx, [esp+40]       
-    mov         ecx, [esp+44]       
+    mov         edi, [esp+20]
+    mov         esi, [esp+28]
+    mov         edx, [esp+36]
+    mov         ebp, [esp+24]
+    mov         eax, [esp+32]
+    mov         ebx, [esp+40]
+    mov         ecx, [esp+44]
 	sar			ecx, 2
 .height_loop:
 	movdqu      xmm0, [esi]
 	movdqu      xmm1, [edx]
 	movdqu      xmm2, [esi+eax]
-	movdqu      xmm3, [edx+ebx]	
+	movdqu      xmm3, [edx+ebx]
 	pavgb       xmm0, xmm1
 	pavgb       xmm2, xmm3
 	movdqu      [edi], xmm0
@@ -504,12 +504,12 @@
 	movdqu      [edi+ebp], xmm2
 	lea			edi, [edi+2*ebp]
 	lea			esi, [esi+2*eax]
-	lea			edx, [edx+2*ebx]	
+	lea			edx, [edx+2*ebx]
 
 	movdqu      xmm4, [esi]
 	movdqu      xmm5, [edx]
 	movdqu      xmm6, [esi+eax]
-	movdqu      xmm7, [edx+ebx]	
+	movdqu      xmm7, [edx+ebx]
 	pavgb       xmm4, xmm5
 	pavgb       xmm6, xmm7
 	movdqu      [edi], xmm4
@@ -516,11 +516,11 @@
 	movdqu      [edi+ebp], xmm6
 	lea         edi, [edi+2*ebp]
 	lea         esi, [esi+2*eax]
-    lea         edx, [edx+2*ebx]	
-    
+    lea         edx, [edx+2*ebx]
+
 	dec         ecx
 	jne         .height_loop
-	
+
     pop         edi
     pop         esi
     pop         ebx
@@ -540,7 +540,7 @@
     dec    dword [esp+4]
     jg     avg_w16_align_0_ssse3
     ret
-    
+
     ALIGN 64
 avg_w16_align_1_ssse3:
     movdqa  xmm1, [ebx+16]
@@ -555,7 +555,7 @@
     jg     avg_w16_align_1_ssse3
     ret
 
-  
+
 ALIGN 16
 ;***********************************************************************
 ; void PixelAvgWidthEq16_ssse3(uint8_t *pDst,  int32_t iDstStride,
@@ -574,7 +574,7 @@
     mov         ebx, [esp+28]       ; src1
     mov         ecx, [esp+36]       ; src2
     mov         esi, [esp+24]       ; i_dst_stride
-    
+
      %define avg_w16_offset (avg_w16_align_1_ssse3-avg_w16_align_0_ssse3)
     mov edx, ebx
     and edx, 0x01
@@ -582,11 +582,11 @@
     lea ebp, [avg_w16_offset]
     imul ebp, edx
     lea edx, [ebp+eax]
-    
-    mov eax, [esp+32]  
-    mov ebp, [esp+44] 
+
+    mov eax, [esp+32]
+    mov ebp, [esp+44]
     push ebp
-    mov ebp, [esp+44]	
+    mov ebp, [esp+44]
     and ebx, 0xfffffff0
     call edx
 	pop		   ebp
@@ -607,7 +607,7 @@
     push    edi
     push    ebx
 
-    
+
     mov esi,  [esp+16]
     mov eax, [esp+20]
     mov edi,  [esp+24]
@@ -617,12 +617,12 @@
 .height_loop:
 	mov ebx, [esi]
 	mov [edi], ebx
-	
+
 	add esi, eax
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	WELSEMMS   
+	WELSEMMS
 	pop	   ebx
     pop     edi
     pop     esi
@@ -650,12 +650,12 @@
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	
-	WELSEMMS   
+
+	WELSEMMS
     pop     edi
     pop     esi
     ret
-	
+
 ALIGN 16
 ;***********************************************************************
 ;   void McCopyWidthEq16_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
@@ -664,11 +664,11 @@
     push    esi
     push    edi
 
-    mov     esi, [esp+12]       
-    mov     eax, [esp+16]       
-    mov     edi, [esp+20]       
-    mov     edx, [esp+24]       
-    mov     ecx, [esp+28]       
+    mov     esi, [esp+12]
+    mov     eax, [esp+16]
+    mov     edi, [esp+20]
+    mov     edx, [esp+24]
+    mov     ecx, [esp+28]
 
 ALIGN 4
 .height_loop:
@@ -681,7 +681,7 @@
     lea     esi, [esi+eax*2]
     lea     edi, [edi+edx*2]
     jnz     .height_loop
-  
+
     pop     edi
     pop     esi
     ret
--- a/codec/encoder/core/asm/mc_chroma.asm
+++ b/codec/encoder/core/asm/mc_chroma.asm
@@ -1,317 +1,317 @@
-;*!
-;* \copy
-;*     Copyright (c)  2004-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mc_chroma.asm
-;*
-;*  Abstract
-;*      mmx motion compensation for chroma
-;*
-;*  History
-;*      10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:
-	dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:
-	dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src, 
-;							int32_t iSrcStride, 
-;							uint8_t *pDst, 
-;							int32_t iDstStride, 
-;							uint8_t *pABCD, 
-;							int32_t iHeigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:
-	push esi
-	push edi
-	push ebx
-	
-	mov eax, [esp +12 + 20]
-	movd mm3, [eax]
-	WELS_Zero mm7
-	punpcklbw mm3, mm3
-	movq      mm4, mm3
-	punpcklwd mm3, mm3       
-	punpckhwd mm4, mm4		 
-	
-	movq	  mm5, mm3
-	punpcklbw mm3, mm7
-	punpckhbw mm5, mm7
-	
-	movq	  mm6, mm4
-	punpcklbw mm4, mm7
-	punpckhbw mm6, mm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
-	lea ebx, [esi + eax]
-	movd mm0, [esi]
-	movd mm1, [esi+1]
-	punpcklbw mm0, mm7
-	punpcklbw mm1, mm7
-.xloop:
-	
-	pmullw mm0, mm3
-	pmullw mm1, mm5
-	paddw  mm0, mm1
-	
-	movd  mm1, [ebx]
-	punpcklbw mm1, mm7
-	movq mm2, mm1
-	pmullw mm1, mm4
-	paddw mm0, mm1
-	
-	movd mm1, [ebx+1]
-	punpcklbw mm1, mm7
-	movq mm7, mm1
-	pmullw mm1,mm6
-	paddw mm0, mm1
-	movq mm1,mm7
-
-	paddw mm0, [h264_d0x20_mmx]
-	psrlw mm0, 6
-	
-	WELS_Zero mm7
-	packuswb mm0, mm7
-	movd [edi], mm0	
-
-	movq mm0, mm2
-	
-	lea edi, [edi +edx  ]
-	lea ebx, [ebx + eax]
-
-	dec ecx
-	jnz near .xloop
-	WELSEMMS
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc, 
-;						int32_t iSrcStride, 
-;						uint8_t *pDst, 
-;						int32_t iDstStride, 
-;						uint8_t *pABCD, 
-;						int32_t iheigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:
-	push esi
-	push edi
-	push ebx
-	
-	mov eax, [esp +12 + 20]
-	movd xmm3, [eax]
-	WELS_Zero xmm7
-	punpcklbw  xmm3, xmm3
-	punpcklwd  xmm3, xmm3
-	
-	movdqa	   xmm4, xmm3
-	punpckldq  xmm3, xmm3
-	punpckhdq  xmm4, xmm4
-	movdqa     xmm5, xmm3
-	movdqa	   xmm6, xmm4
-	
-	punpcklbw  xmm3, xmm7
-	punpckhbw  xmm5, xmm7
-	punpcklbw  xmm4, xmm7
-	punpckhbw  xmm6, xmm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
-	lea ebx, [esi + eax]
-	movq xmm0, [esi]
-	movq xmm1, [esi+1]
-	punpcklbw xmm0, xmm7
-	punpcklbw xmm1, xmm7
-.xloop:
-	
-	pmullw xmm0, xmm3
-	pmullw xmm1, xmm5
-	paddw  xmm0, xmm1
-	
-	movq  xmm1, [ebx]
-	punpcklbw xmm1, xmm7
-	movdqa xmm2, xmm1
-	pmullw xmm1, xmm4
-	paddw xmm0, xmm1
-	
-	movq xmm1, [ebx+1]
-	punpcklbw xmm1, xmm7
-	movdqa xmm7, xmm1
-	pmullw xmm1, xmm6
-	paddw xmm0, xmm1
-	movdqa xmm1,xmm7
-
-	paddw xmm0, [h264_d0x20_sse2]
-	psrlw xmm0, 6
-	
-	WELS_Zero xmm7
-	packuswb xmm0, xmm7
-	movq [edi], xmm0	
-
-	movdqa xmm0, xmm2
-	
-	lea edi, [edi +edx  ]
-	lea ebx, [ebx + eax]
-
-	dec ecx
-	jnz near .xloop
-	
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-
-
-ALIGN 16
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-;						 int32_t iSrcStride, 
-;                        uint8_t *pDst,  
-;                        int32_t iDstStride,
-;                        uint8_t *pABCD,
-;					     int32_t iHeight);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:
-	push ebx
-	push esi
-	push edi
-		
-	mov eax, [esp + 12 + 20]
-
-    pxor      xmm7, xmm7
-    movd   xmm5, [eax]   
-    punpcklwd xmm5, xmm5  
-    punpckldq xmm5, xmm5 
-    movdqa    xmm6, xmm5
-    punpcklqdq xmm5, xmm5
-    punpckhqdq xmm6, xmm6    
-    
-	mov eax, [esp + 12 + 4]   
-	mov edx, [esp + 12 + 8]   
-	mov esi, [esp + 12 + 12]  
-	mov edi, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-    
-    sub esi, edi
-    sub esi, edi
-	movdqa xmm7, [h264_d0x20_sse2]
-
-	movdqu xmm0, [eax]
-	movdqa xmm1, xmm0
-	psrldq xmm1, 1
-	punpcklbw xmm0, xmm1
-	
-.hloop_chroma:	
-	lea	esi, [esi+2*edi]
-	
-	movdqu xmm2, [eax+edx]
-	movdqa xmm3, xmm2
-	psrldq xmm3, 1
-	punpcklbw xmm2, xmm3
-	movdqa      xmm4, xmm2
-	
-    pmaddubsw  xmm0, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm0, xmm2
-    paddw      xmm0, xmm7
-	psrlw      xmm0, 6
-    packuswb   xmm0, xmm0
-    movq       [esi],xmm0	
-    
-    lea eax, [eax+2*edx]
-    movdqu xmm2, [eax]
-    movdqa xmm3, xmm2
-    psrldq xmm3, 1
-    punpcklbw xmm2, xmm3
-    movdqa      xmm0, xmm2
-    
-    pmaddubsw  xmm4, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm4, xmm2
-    paddw      xmm4, xmm7
-	psrlw      xmm4, 6
-    packuswb   xmm4, xmm4
-    movq       [esi+edi],xmm4	
-	
-	sub ecx, 2
-	jnz .hloop_chroma
-	pop edi
-	pop esi
-	pop ebx
-
-	ret
-
-
+;*!
+;* \copy
+;*     Copyright (c)  2004-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mc_chroma.asm
+;*
+;*  Abstract
+;*      mmx motion compensation for chroma
+;*
+;*  History
+;*      10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (rounding values)
+;***********************************************************************
+
+ALIGN 16
+h264_d0x20_sse2:
+	dw 32,32,32,32,32,32,32,32
+ALIGN 16
+h264_d0x20_mmx:
+	dw 32,32,32,32
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( uint8_t *pSrc,
+;							int32_t iSrcStride,
+;							uint8_t *pDst,
+;							int32_t iDstStride,
+;							uint8_t *pABCD,
+;							int32_t iHeight );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq4_mmx
+McChromaWidthEq4_mmx:
+	push esi
+	push edi
+	push ebx
+
+	mov eax, [esp +12 + 20]
+	movd mm3, [eax]
+	WELS_Zero mm7
+	punpcklbw mm3, mm3
+	movq      mm4, mm3
+	punpcklwd mm3, mm3
+	punpckhwd mm4, mm4
+
+	movq	  mm5, mm3
+	punpcklbw mm3, mm7
+	punpckhbw mm5, mm7
+
+	movq	  mm6, mm4
+	punpcklbw mm4, mm7
+	punpckhbw mm6, mm7
+
+	mov esi, [esp +12+ 4]
+	mov eax, [esp + 12 + 8]
+	mov edi, [esp + 12 + 12]
+	mov edx, [esp + 12 + 16]
+    mov ecx, [esp + 12 + 24]
+
+	lea ebx, [esi + eax]
+	movd mm0, [esi]
+	movd mm1, [esi+1]
+	punpcklbw mm0, mm7
+	punpcklbw mm1, mm7
+.xloop:
+
+	pmullw mm0, mm3
+	pmullw mm1, mm5
+	paddw  mm0, mm1
+
+	movd  mm1, [ebx]
+	punpcklbw mm1, mm7
+	movq mm2, mm1
+	pmullw mm1, mm4
+	paddw mm0, mm1
+
+	movd mm1, [ebx+1]
+	punpcklbw mm1, mm7
+	movq mm7, mm1
+	pmullw mm1,mm6
+	paddw mm0, mm1
+	movq mm1,mm7
+
+	paddw mm0, [h264_d0x20_mmx]
+	psrlw mm0, 6
+
+	WELS_Zero mm7
+	packuswb mm0, mm7
+	movd [edi], mm0
+
+	movq mm0, mm2
+
+	lea edi, [edi +edx  ]
+	lea ebx, [ebx + eax]
+
+	dec ecx
+	jnz near .xloop
+	WELSEMMS
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( uint8_t *pSrc,
+;						int32_t iSrcStride,
+;						uint8_t *pDst,
+;						int32_t iDstStride,
+;						uint8_t *pABCD,
+;						int32_t iHeight );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq8_sse2
+McChromaWidthEq8_sse2:
+	push esi
+	push edi
+	push ebx
+
+	mov eax, [esp +12 + 20]
+	movd xmm3, [eax]
+	WELS_Zero xmm7
+	punpcklbw  xmm3, xmm3
+	punpcklwd  xmm3, xmm3
+
+	movdqa	   xmm4, xmm3
+	punpckldq  xmm3, xmm3
+	punpckhdq  xmm4, xmm4
+	movdqa     xmm5, xmm3
+	movdqa	   xmm6, xmm4
+
+	punpcklbw  xmm3, xmm7
+	punpckhbw  xmm5, xmm7
+	punpcklbw  xmm4, xmm7
+	punpckhbw  xmm6, xmm7
+
+	mov esi, [esp +12+ 4]
+	mov eax, [esp + 12 + 8]
+	mov edi, [esp + 12 + 12]
+	mov edx, [esp + 12 + 16]
+    mov ecx, [esp + 12 + 24]
+
+	lea ebx, [esi + eax]
+	movq xmm0, [esi]
+	movq xmm1, [esi+1]
+	punpcklbw xmm0, xmm7
+	punpcklbw xmm1, xmm7
+.xloop:
+
+	pmullw xmm0, xmm3
+	pmullw xmm1, xmm5
+	paddw  xmm0, xmm1
+
+	movq  xmm1, [ebx]
+	punpcklbw xmm1, xmm7
+	movdqa xmm2, xmm1
+	pmullw xmm1, xmm4
+	paddw xmm0, xmm1
+
+	movq xmm1, [ebx+1]
+	punpcklbw xmm1, xmm7
+	movdqa xmm7, xmm1
+	pmullw xmm1, xmm6
+	paddw xmm0, xmm1
+	movdqa xmm1,xmm7
+
+	paddw xmm0, [h264_d0x20_sse2]
+	psrlw xmm0, 6
+
+	WELS_Zero xmm7
+	packuswb xmm0, xmm7
+	movq [edi], xmm0
+
+	movdqa xmm0, xmm2
+
+	lea edi, [edi +edx  ]
+	lea ebx, [ebx + eax]
+
+	dec ecx
+	jnz near .xloop
+
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+
+
+ALIGN 16
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
+;						 int32_t iSrcStride,
+;                        uint8_t *pDst,
+;                        int32_t iDstStride,
+;                        uint8_t *pABCD,
+;					     int32_t iHeight);
+;***********************************************************************
+WELS_EXTERN McChromaWidthEq8_ssse3
+McChromaWidthEq8_ssse3:
+	push ebx
+	push esi
+	push edi
+
+	mov eax, [esp + 12 + 20]
+
+    pxor      xmm7, xmm7
+    movd   xmm5, [eax]
+    punpcklwd xmm5, xmm5
+    punpckldq xmm5, xmm5
+    movdqa    xmm6, xmm5
+    punpcklqdq xmm5, xmm5
+    punpckhqdq xmm6, xmm6
+
+	mov eax, [esp + 12 + 4]
+	mov edx, [esp + 12 + 8]
+	mov esi, [esp + 12 + 12]
+	mov edi, [esp + 12 + 16]
+    mov ecx, [esp + 12 + 24]
+
+    sub esi, edi
+    sub esi, edi
+	movdqa xmm7, [h264_d0x20_sse2]
+
+	movdqu xmm0, [eax]
+	movdqa xmm1, xmm0
+	psrldq xmm1, 1
+	punpcklbw xmm0, xmm1
+
+.hloop_chroma:
+	lea	esi, [esi+2*edi]
+
+	movdqu xmm2, [eax+edx]
+	movdqa xmm3, xmm2
+	psrldq xmm3, 1
+	punpcklbw xmm2, xmm3
+	movdqa      xmm4, xmm2
+
+    pmaddubsw  xmm0, xmm5
+    pmaddubsw  xmm2, xmm6
+    paddw      xmm0, xmm2
+    paddw      xmm0, xmm7
+	psrlw      xmm0, 6
+    packuswb   xmm0, xmm0
+    movq       [esi],xmm0
+
+    lea eax, [eax+2*edx]
+    movdqu xmm2, [eax]
+    movdqa xmm3, xmm2
+    psrldq xmm3, 1
+    punpcklbw xmm2, xmm3
+    movdqa      xmm0, xmm2
+
+    pmaddubsw  xmm4, xmm5
+    pmaddubsw  xmm2, xmm6
+    paddw      xmm4, xmm2
+    paddw      xmm4, xmm7
+	psrlw      xmm4, 6
+    packuswb   xmm4, xmm4
+    movq       [esi+edi],xmm4
+
+	sub ecx, 2
+	jnz .hloop_chroma
+	pop edi
+	pop esi
+	pop ebx
+
+	ret
+
+
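
The three McChromaWidthEq* routines above compute the same bilinear H.264 chroma interpolation at different vector widths. The four byte weights packed in pABCD always sum to 64, which is why h264_d0x20_mmx/h264_d0x20_sse2 hold the rounding value 32 and the result is shifted right by 6; the SSSE3 variant additionally interleaves each source pixel with its right neighbour so that pmaddubsw applies a weight pair in a single instruction. A minimal plain-C sketch of the operation (illustrative only; the function name and the iWidth parameter are hypothetical):

#include <stdint.h>

static void McChromaRef_c (const uint8_t *pSrc, int32_t iSrcStride,
                           uint8_t *pDst, int32_t iDstStride,
                           const uint8_t *pABCD, int32_t iWidth, int32_t iHeight) {
  const int a = pABCD[0], b = pABCD[1], c = pABCD[2], d = pABCD[3]; /* a+b+c+d == 64 */
  for (int y = 0; y < iHeight; y++) {
    for (int x = 0; x < iWidth; x++) {
      pDst[x] = (uint8_t) ((a * pSrc[x]              + b * pSrc[x + 1] +
                            c * pSrc[x + iSrcStride] + d * pSrc[x + iSrcStride + 1] +
                            32) >> 6);               /* h264_d0x20 rounding, psrlw 6 */
    }
    pSrc += iSrcStride;
    pDst += iDstStride;
  }
}

The mmx/sse2 loops also keep the unpacked lower source row in a register (mm2/xmm2) so each row is loaded and widened only once per iteration.
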
--- a/codec/encoder/core/asm/mc_luma.asm
+++ b/codec/encoder/core/asm/mc_luma.asm
@@ -91,10 +91,10 @@
 
 ALIGN 16
 ;***********************************************************************
-; void McHorVer20WidthEq16_sse2(  uint8_t *pSrc, 
-;								int32_t iSrcStride, 
-;								uint8_t *pDst, 
-;								int32_t iDstStride, 
+; void McHorVer20WidthEq16_sse2(  uint8_t *pSrc,
+;								int32_t iSrcStride,
+;								uint8_t *pDst,
+;								int32_t iDstStride,
 ;								int32_t iHeight
 ;                      );
 ;***********************************************************************
@@ -101,19 +101,19 @@
 McHorVer20WidthEq16_sse2:
 	push	esi
 	push	edi
-	
 
-	mov esi, [esp + 12]         
-	mov eax, [esp + 16]         
-	mov edi, [esp + 20]         
-	mov ecx, [esp + 28]         
-	mov edx, [esp + 24]			
-	sub esi, 2                  
-	
+
+	mov esi, [esp + 12]
+	mov eax, [esp + 16]
+	mov edi, [esp + 20]
+	mov ecx, [esp + 28]
+	mov edx, [esp + 24]
+	sub esi, 2
+
 	WELS_Zero  xmm7
 	movdqa xmm6, [h264_w0x10_1]
 .y_loop:
-	
+
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@@ -126,7 +126,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -152,7 +152,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3+8]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -165,8 +165,8 @@
 	psraw xmm0, 5
 	packuswb xmm0, xmm7
 	movq [edi+8], xmm0
-	
-	
+
+
 	add esi, eax
 	add edi, edx
 	dec ecx
@@ -178,9 +178,9 @@
 
 ALIGN 16
 ;***********************************************************************
-; void McHorVer22Width8HorFirst_sse2( uint8_t *pSrc, 
-;									int32_t iSrcStride, 
-;									uint8_t* pTap,	
+; void McHorVer22Width8HorFirst_sse2( uint8_t *pSrc,
+;									int32_t iSrcStride,
+;									uint8_t* pTap,
 ;									int32_t iTapStride,
 ;									int32_t iHeight);
 ;***********************************************************************
@@ -193,11 +193,11 @@
 	mov edi, [esp+24]		;tap
 	mov edx, [esp+28]	;tap_stride
 	mov ebx, [esp+32]	;i_height
-	pxor xmm7, xmm7	
-	
+	pxor xmm7, xmm7
+
 	sub esi, eax				;;;;;;;;need 5 more lines.
 	sub esi, eax
-		
+
 .yloop_width_8:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
@@ -211,7 +211,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -221,7 +221,7 @@
 	psllw xmm4, 2
 	paddw xmm0, xmm4
 	movdqa [edi], xmm0
-		
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@@ -230,12 +230,12 @@
 	pop edi
 	pop esi
 	ret
-	
+
 ;***********************************************************************
-; void McHorVer02WidthEq8_sse2( uint8_t *pSrc, 
-;                       int32_t iSrcStride, 
-;                       uint8_t *pDst, 
-;                       int32_t iDstStride, 
+; void McHorVer02WidthEq8_sse2( uint8_t *pSrc,
+;                       int32_t iSrcStride,
+;                       uint8_t *pDst,
+;                       int32_t iDstStride,
 ;                       int32_t iHeight )
 ;***********************************************************************
 ALIGN 16
@@ -242,18 +242,18 @@
 McHorVer02WidthEq8_sse2:
 	push esi
 	push edi
-	
-	mov esi, [esp + 12]           
-	mov edx, [esp + 16]	          
-	mov edi, [esp + 20]           
-	mov eax, [esp + 24]           
-	mov ecx, [esp + 28]           
 
+	mov esi, [esp + 12]
+	mov edx, [esp + 16]
+	mov edi, [esp + 20]
+	mov eax, [esp + 24]
+	mov ecx, [esp + 28]
+
 	sub esi, edx
 	sub esi, edx
 
 	WELS_Zero xmm7
-			
+
 	SSE_LOAD_8P xmm0, xmm7, [esi]
 	SSE_LOAD_8P xmm1, xmm7, [esi+edx]
 	lea esi, [esi+2*edx]
@@ -262,8 +262,8 @@
 	lea esi, [esi+2*edx]
 	SSE_LOAD_8P xmm4, xmm7, [esi]
 	SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-	
-.start:	
+
+.start:
 	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .xx_exit
@@ -273,7 +273,7 @@
 	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
 	dec ecx
 	jz near .xx_exit
-	
+
 	lea edi, [edi+2*eax]
 	SSE_LOAD_8P xmm7, xmm0, [esi+edx]
 	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
@@ -356,11 +356,11 @@
 
 
 ;***********************************************************************
-; void McHorVer02_sse2(	uint8_t *pSrc, 
-;                       int32_t iSrcStride, 
-;                       uint8_t *pDst, 
+; void McHorVer02_sse2(	uint8_t *pSrc,
+;                       int32_t iSrcStride,
+;                       uint8_t *pDst,
 ;                       int32_t iDstStride,
-;						int32_t iWidth, 
+;						int32_t iWidth,
 ;                       int32_t iHeight )
 ;***********************************************************************
 ALIGN 16
@@ -368,19 +368,19 @@
 	push esi
 	push edi
 	push ebx
-	
-	mov esi, [esp + 16]           
-	mov edx, [esp + 20]	          
-	mov edi, [esp + 24]           
-	mov eax, [esp + 28]           
-	mov ecx, [esp + 36]           
-	mov ebx, [esp + 32]			  
+
+	mov esi, [esp + 16]
+	mov edx, [esp + 20]
+	mov edi, [esp + 24]
+	mov eax, [esp + 28]
+	mov ecx, [esp + 36]
+	mov ebx, [esp + 32]
 	shr ebx, 3
 	sub esi, edx
 	sub esi, edx
-	
-.xloop:	
-	WELS_Zero xmm7			
+
+.xloop:
+	WELS_Zero xmm7
 	SSE_LOAD_8P xmm0, xmm7, [esi]
 	SSE_LOAD_8P xmm1, xmm7, [esi+edx]
 	lea esi, [esi+2*edx]
@@ -389,7 +389,7 @@
 	lea esi, [esi+2*edx]
 	SSE_LOAD_8P xmm4, xmm7, [esi]
 	SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-	
+
 	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	lea esi, [esi+2*edx]
@@ -402,8 +402,8 @@
 	movdqa xmm5,xmm6
 	add edi, eax
 	sub esi, edx
-	
-.start:	
+
+.start:
 	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .x_loop_dec
@@ -413,7 +413,7 @@
 	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*eax]
 	SSE_LOAD_8P xmm7, xmm0, [esi+edx]
 	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
@@ -454,16 +454,16 @@
 	SSE_LOAD_8P xmm5, xmm6, [esi+edx]
 	jmp near .start
 
-.x_loop_dec:	
+.x_loop_dec:
 	dec ebx
 	jz  near .xx_exit
-	mov esi, [esp + 16]           
-	mov edi, [esp + 24]           
+	mov esi, [esp + 16]
+	mov edi, [esp + 24]
 	sub esi, edx
 	sub esi, edx
 	add esi, 8
 	add edi, 8
-	mov ecx, [esp + 36] 
+	mov ecx, [esp + 36]
 	jmp near .xloop
 
 .xx_exit:
@@ -473,12 +473,12 @@
 	ret
 
 
-ALIGN 16                  
+ALIGN 16
 ;***********************************************************************
-; void McHorVer20_sse2(		uint8_t *pSrc, 
-;                       int32_t iSrcStride, 
-;						uint8_t *pDst, 
-;						int32_t iDstStride, 
+; void McHorVer20_sse2(		uint8_t *pSrc,
+;                       int32_t iSrcStride,
+;						uint8_t *pDst,
+;						int32_t iDstStride,
 ;						int32_t iWidth,
 ;						int32_t iHeight
 ;                      );
@@ -487,19 +487,19 @@
 	push esi
 	push edi
 	push ebx
-	mov esi, [esp+16]     
-	mov eax, [esp+20]	
-	mov edi, [esp+24]	
-	mov edx, [esp+28]	
-	mov ecx, [esp+32]	
-	mov ebx, [esp+36]	
+	mov esi, [esp+16]
+	mov eax, [esp+20]
+	mov edi, [esp+24]
+	mov edx, [esp+28]
+	mov ecx, [esp+32]
+	mov ebx, [esp+36]
 	sub esi, 2
-	pxor xmm7, xmm7	
-	
+	pxor xmm7, xmm7
+
 	cmp ecx, 9
-	jne near .width_17	
-	
-.yloop_width_9:	
+	jne near .width_17
+
+.yloop_width_9:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@@ -512,7 +512,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	movdqa xmm7, xmm2
 	paddw   xmm7, xmm3
 	movdqa xmm6, xmm4
@@ -526,12 +526,12 @@
 	paddw xmm0, [h264_w0x10_1]
 	psraw  xmm0, 5
 	packuswb xmm0, xmm0
-	movd [edi], xmm0	
-	
+	movd [edi], xmm0
+
 	pxor  xmm7, xmm7
 	movq xmm0, [esi+6]
 	punpcklbw xmm0, xmm7
-	
+
 	paddw xmm4, xmm1
 	paddw xmm5, xmm3
 	psllw xmm5, 2
@@ -543,8 +543,8 @@
 	paddw xmm2, [h264_w0x10_1]
 	psraw  xmm2, 5
 	packuswb xmm2, xmm2
-	movq [edi+1], xmm2	
-		
+	movq [edi+1], xmm2
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@@ -553,8 +553,8 @@
 	pop edi
 	pop esi
 	ret
-	
-	
+
+
 .width_17:
 .yloop_width_17:
 	movq xmm0, [esi]
@@ -569,7 +569,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -582,7 +582,7 @@
 	psraw  xmm0, 5
 	packuswb xmm0, xmm0
 	movq [edi], xmm0
-		
+
 	movq xmm0, [esi+8]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5+8]
@@ -595,7 +595,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3+8]
 	punpcklbw xmm5, xmm7
-	
+
 	movdqa xmm7, xmm2
 	paddw   xmm7, xmm3
 	movdqa xmm6, xmm4
@@ -610,12 +610,12 @@
 	psraw  xmm0, 5
 	packuswb xmm0, xmm0
 	movd [edi+8], xmm0
-	
-	
+
+
 	pxor  xmm7, xmm7
 	movq xmm0, [esi+6+8]
 	punpcklbw xmm0, xmm7
-	
+
 	paddw xmm4, xmm1
 	paddw xmm5, xmm3
 	psllw xmm5, 2
@@ -627,7 +627,7 @@
 	paddw xmm2, [h264_w0x10_1]
 	psraw  xmm2, 5
 	packuswb xmm2, xmm2
-	movq [edi+9], xmm2		
+	movq [edi+9], xmm2
 	add esi, eax
 	add edi, edx
 	dec ebx
@@ -636,14 +636,14 @@
 	pop edi
 	pop esi
 	ret
-	
-	
 
+
+
 ALIGN 16
 ;***********************************************************************
 ;void McHorVer22HorFirst_sse2
-;							(uint8_t *pSrc, 
-;							int32_t iSrcStride, 
+;							(uint8_t *pSrc,
+;							int32_t iSrcStride,
 ;							uint8_t * pTap,
 ;							int32_t iTapStride,
 ;							int32_t iWidth,int32_t iHeight);
@@ -652,21 +652,21 @@
 	push esi
 	push edi
 	push ebx
-	mov esi, [esp+16]     
-	mov eax, [esp+20]	
-	mov edi, [esp+24]	
-	mov edx, [esp+28]	
-	mov ecx, [esp+32]	
-	mov ebx, [esp+36]	
-	pxor xmm7, xmm7	
-	
+	mov esi, [esp+16]
+	mov eax, [esp+20]
+	mov edi, [esp+24]
+	mov edx, [esp+28]
+	mov ecx, [esp+32]
+	mov ebx, [esp+36]
+	pxor xmm7, xmm7
+
 	sub esi, eax				;;;;;;;;need 5 more lines.
 	sub esi, eax
-	
+
 	cmp ecx, 9
-	jne near .width_17	
-	
-.yloop_width_9:	
+	jne near .width_17
+
+.yloop_width_9:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@@ -679,7 +679,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	movdqa xmm7, xmm2
 	paddw   xmm7, xmm3
 	movdqa xmm6, xmm4
@@ -690,12 +690,12 @@
 	paddw xmm0, xmm6
 	psllw xmm6, 2
 	paddw xmm0, xmm6
-	movd [edi], xmm0	
-	
+	movd [edi], xmm0
+
 	pxor  xmm7, xmm7
 	movq xmm0, [esi+6]
 	punpcklbw xmm0, xmm7
-	
+
 	paddw xmm4, xmm1
 	paddw xmm5, xmm3
 	psllw xmm5, 2
@@ -704,9 +704,9 @@
 	paddw xmm2, xmm5
 	psllw xmm5, 2
 	paddw xmm2, xmm5
-	movq [edi+2], xmm2	
-	movhps [edi+2+8], xmm2	
-	
+	movq [edi+2], xmm2
+	movhps [edi+2+8], xmm2
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@@ -715,8 +715,8 @@
 	pop edi
 	pop esi
 	ret
-	
-	
+
+
 .width_17:
 .yloop_width_17:
 	movq xmm0, [esi]
@@ -731,7 +731,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -741,7 +741,7 @@
 	psllw xmm4, 2
 	paddw xmm0, xmm4
 	movdqa [edi], xmm0
-		
+
 	movq xmm0, [esi+8]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5+8]
@@ -754,7 +754,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3+8]
 	punpcklbw xmm5, xmm7
-	
+
 	movdqa xmm7, xmm2
 	paddw   xmm7, xmm3
 	movdqa xmm6, xmm4
@@ -766,12 +766,12 @@
 	psllw xmm6, 2
 	paddw xmm0, xmm6
 	movd [edi+16], xmm0
-	
-	
+
+
 	pxor  xmm7, xmm7
 	movq xmm0, [esi+6+8]
 	punpcklbw xmm0, xmm7
-	
+
 	paddw xmm4, xmm1
 	paddw xmm5, xmm3
 	psllw xmm5, 2
@@ -780,9 +780,9 @@
 	paddw xmm2, xmm5
 	psllw xmm5, 2
 	paddw xmm2, xmm5
-	movq [edi+18], xmm2	
-	movhps [edi+18+8], xmm2	
-	
+	movq [edi+18], xmm2
+	movhps [edi+18+8], xmm2
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@@ -791,23 +791,23 @@
 	pop edi
 	pop esi
 	ret
-	
-	
+
+
 %macro FILTER_VER 9
 	paddw  %1, %6
 	movdqa %7, %2
 	movdqa %8, %3
-	
-	
+
+
 	paddw %7, %5
 	paddw %8, %4
-	
-	psubw  %1, %7   
-	psraw   %1, 2	  
-	paddw  %1, %8   
-	psubw  %1, %7 
-	psraw   %1, 2	
-	paddw  %8, %1   
+
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %1, %8
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %8, %1
 	paddw  %8, [h264_mc_hc_32]
 	psraw   %8, 6
 	packuswb %8, %8
@@ -815,8 +815,8 @@
 %endmacro
 ;***********************************************************************
 ;void McHorVer22VerLastAlign_sse2(
-;											uint8_t *pTap, 
-;											int32_t iTapStride, 
+;											uint8_t *pTap,
+;											int32_t iTapStride,
 ;											uint8_t * pDst,
 ;											int32_t iDstStride,
 ;											int32_t iWidth,
@@ -828,15 +828,15 @@
 	push edi
 	push ebx
 	push ebp
-	
+
 	mov esi, [esp+20]
 	mov eax, [esp+24]
 	mov edi, [esp+28]
 	mov edx, [esp+32]
 	mov ebx, [esp+36]
-	mov ecx, [esp+40]	
-	shr ebx, 3	
-	
+	mov ecx, [esp+40]
+	shr ebx, 3
+
 .width_loop:
 	movdqa xmm0, [esi]
 	movdqa xmm1, [esi+eax]
@@ -846,12 +846,12 @@
 	lea esi, [esi+2*eax]
 	movdqa xmm4, [esi]
 	movdqa xmm5, [esi+eax]
-	
+
 	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	lea esi, [esi+2*eax]
 	movdqa xmm6, [esi]
-	
+
 	movdqa xmm0, xmm1
 	movdqa xmm1, xmm2
 	movdqa xmm2, xmm3
@@ -858,61 +858,61 @@
 	movdqa xmm3, xmm4
 	movdqa xmm4, xmm5
 	movdqa xmm5, xmm6
-	
+
 	add edi, edx
-	sub esi, eax		
-	
+	sub esi, eax
+
 .start:
 	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm6, [esi]
 	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm7, [esi+eax]
 	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm0, [esi]
 	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm1, [esi+eax]
 	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm2, [esi]
 	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm3, [esi+eax]
 	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm4, [esi]
 	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm5, [esi+eax]
 	jmp near .start
-	
+
 .x_loop_dec:
 	dec ebx
 	jz near .exit
@@ -922,9 +922,9 @@
 	add esi, 16
 	add edi, 8
 	jmp .width_loop
-	
-	
-	
+
+
+
 .exit:
 	pop ebp
 	pop ebx
@@ -934,8 +934,8 @@
 
 ;***********************************************************************
 ;void McHorVer22VerLastUnAlign_sse2(
-;											uint8_t *pTap, 
-;											int32_t iTapStride, 
+;											uint8_t *pTap,
+;											int32_t iTapStride,
 ;											uint8_t * pDst,
 ;											int32_t iDstStride,
 ;											int32_t iWidth,
@@ -947,15 +947,15 @@
 	push edi
 	push ebx
 	push ebp
-	
+
 	mov esi, [esp+20]
 	mov eax, [esp+24]
 	mov edi, [esp+28]
 	mov edx, [esp+32]
 	mov ebx, [esp+36]
-	mov ecx, [esp+40]	
-	shr ebx, 3	
-	
+	mov ecx, [esp+40]
+	shr ebx, 3
+
 .width_loop:
 	movdqu xmm0, [esi]
 	movdqu xmm1, [esi+eax]
@@ -965,12 +965,12 @@
 	lea esi, [esi+2*eax]
 	movdqu xmm4, [esi]
 	movdqu xmm5, [esi+eax]
-	
+
 	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	lea esi, [esi+2*eax]
 	movdqu xmm6, [esi]
-	
+
 	movdqa xmm0, xmm1
 	movdqa xmm1, xmm2
 	movdqa xmm2, xmm3
@@ -977,61 +977,61 @@
 	movdqa xmm3, xmm4
 	movdqa xmm4, xmm5
 	movdqa xmm5, xmm6
-	
+
 	add edi, edx
-	sub esi, eax		
-	
+	sub esi, eax
+
 .start:
 	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqu xmm6, [esi]
 	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqu xmm7, [esi+eax]
 	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqu xmm0, [esi]
 	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqu xmm1, [esi+eax]
 	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqu xmm2, [esi]
 	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqu xmm3, [esi+eax]
 	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqu xmm4, [esi]
 	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqu xmm5, [esi+eax]
 	jmp near .start
-	
+
 .x_loop_dec:
 	dec ebx
 	jz near .exit
@@ -1041,9 +1041,9 @@
 	add esi, 16
 	add edi, 8
 	jmp .width_loop
-	
-	
-	
+
+
+
 .exit:
 	pop ebp
 	pop ebx
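
All of the mc_luma.asm routines touched above evaluate the standard H.264 six-tap half-sample filter (1, -5, 20, 20, -5, 1). The multiplies are replaced by shifts and adds: with a = E+J, b = F+I and c = G+H, the code forms t = 4c - b and then a + t + 4t = a - 5b + 20c. h264_w0x10_1 holds eight copies of 16, the rounding term before the arithmetic shift by 5, and packuswb provides the clamp to [0, 255]. A scalar sketch of one output sample (illustrative only; the SIMD code produces 8 or 16 lanes at once, and pSrc here points at the leftmost tap, which is what the sub esi, 2 adjustment arranges):

#include <stdint.h>

static uint8_t McHorVer20Sample_c (const uint8_t *pSrc) {
  int a = pSrc[0] + pSrc[5];  /* taps  1 ...  1 */
  int b = pSrc[1] + pSrc[4];  /* taps -5 ... -5 */
  int c = pSrc[2] + pSrc[3];  /* taps 20 ... 20 */
  int v = (a - 5 * b + 20 * c + 16) >> 5;
  return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v)); /* packuswb clamp */
}

For the centre (half-half) position, McHorVer22* first runs this filter horizontally into a 16-bit tap buffer; the FILTER_VER macro then applies the vertical pass with the +32 constant h264_mc_hc_32, splitting the big shift into two psraw 2 stages plus a final psraw 6 so the intermediates fit in 16-bit lanes while still computing exactly (a - 5b + 20c + 512) >> 10.
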
--- a/codec/encoder/core/asm/memzero.asm
+++ b/codec/encoder/core/asm/memzero.asm
@@ -32,8 +32,8 @@
 ;*  memzero.asm
 ;*
 ;*  Abstract
-;*      
 ;*
+;*
 ;*  History
 ;*      9/16/2009 Created
 ;*
@@ -47,8 +47,8 @@
 ; Code
 ;***********************************************************************
 
-SECTION .text			
-		
+SECTION .text
+
 ALIGN 16
 ;***********************************************************************
 ;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
@@ -57,7 +57,7 @@
 WelsPrefetchZero_mmx:
 	mov  eax,[esp+4]
 	prefetchnta [eax]
-	ret 			
+	ret
 
 
 ALIGN 16
@@ -69,7 +69,7 @@
 		mov		eax,	[esp + 4]          ; dst
 		mov		ecx,	[esp + 8]
 		neg		ecx
-			
+
 		pxor	xmm0,		xmm0
 .memzeroa64_sse2_loops:
 		movdqa	[eax],		xmm0
@@ -77,12 +77,12 @@
 		movdqa	[eax+32],	xmm0
 		movdqa	[eax+48],	xmm0
 		add		eax, 0x40
-		
+
 		add ecx, 0x40
 		jnz near .memzeroa64_sse2_loops
-			
-		ret	
 
+		ret
+
 ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -92,7 +92,7 @@
 		mov		eax,	[esp + 4]          ; dst
 		mov		ecx,	[esp + 8]
 		neg		ecx
-			
+
 		pxor	mm0,		mm0
 .memzero64_mmx_loops:
 		movq	[eax],		mm0
@@ -102,16 +102,16 @@
 		movq	[eax+32],	mm0
 		movq	[eax+40],	mm0
 		movq	[eax+48],	mm0
-		movq	[eax+56],	mm0		
+		movq	[eax+56],	mm0
 		add		eax,		0x40
-		
+
 		add ecx, 0x40
 		jnz near .memzero64_mmx_loops
-			
-		WELSEMMS	
-		ret	
-	
-ALIGN 16		
+
+		WELSEMMS
+		ret
+
+ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
 ;***********************************************************************
@@ -119,17 +119,17 @@
 WelsSetMemZeroSize8_mmx:
 		mov		eax,	[esp + 4]		; dst
 		mov		ecx,	[esp + 8]		; size
-		neg		ecx			
+		neg		ecx
 		pxor	mm0,		mm0
-		
+
 .memzero8_mmx_loops:
 		movq	[eax],		mm0
 		add		eax,		0x08
-	
+
 		add		ecx,		0x08
 		jnz near .memzero8_mmx_loops
-		
-		WELSEMMS	
-		ret	
 
-							
+		WELSEMMS
+		ret
+
+
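
The memzero.asm helpers above are unrolled stores of a zeroed register in 64-byte (or, for WelsSetMemZeroSize8_mmx, 8-byte) strides with no tail handling, so the asm assumes a size that is a positive multiple of the stride. Note the negative-counter idiom: negating the size up front lets each iteration close with a single add ecx, 0x40 / jnz pair. A behavioural C sketch of the 64-byte variants (a hypothetical model that mirrors the counter idiom rather than the vector stores):

#include <stdint.h>
#include <string.h>

/* Assumes iSize > 0 and iSize % 64 == 0, as the asm does. */
static void SetMemZeroSize64_c (uint8_t *pDst, int32_t iSize) {
  int32_t i = -iSize;       /* neg ecx */
  do {
    memset (pDst, 0, 64);   /* four movdqa / eight movq stores */
    pDst += 64;
    i += 64;                /* add ecx, 0x40 */
  } while (i != 0);         /* jnz near .memzero*_loops */
}
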
--- a/codec/encoder/core/asm/quant.asm
+++ b/codec/encoder/core/asm/quant.asm
@@ -44,17 +44,17 @@
 
 BITS 32
 
-SECTION .text	
+SECTION .text
 ;************************************************
-;NEW_QUANT 
+;NEW_QUANT
 ;************************************************
 
 %macro SSE2_Quant8  5
 		MOVDQ	%1, %5
-		pxor	%2, %2							
-		pcmpgtw	%2, %1							
-		pxor	%1, %2							
-		psubw	%1, %2							
+		pxor	%2, %2
+		pcmpgtw	%2, %1
+		pxor	%1, %2
+		psubw	%1, %2
 		paddusw	%1, %3
 		pmulhuw	%1, %4
 		pxor	%1, %2
@@ -64,10 +64,10 @@
 
 %macro SSE2_QuantMax8  6
 		MOVDQ	%1, %5
-		pxor	%2, %2							
-		pcmpgtw	%2, %1							
-		pxor	%1, %2							
-		psubw	%1, %2								
+		pxor	%2, %2
+		pcmpgtw	%2, %1
+		pxor	%1, %2
+		psubw	%1, %2
 		paddusw	%1, %3
 		pmulhuw	%1, %4
 		pmaxsw	%6, %1
@@ -86,17 +86,17 @@
 WELS_EXTERN WelsQuant4x4_sse2
 align 16
 WelsQuant4x4_sse2:
-		mov		eax,  [ff]		
-		mov		ecx,  [mf]			
+		mov		eax,  [ff]
+		mov		ecx,  [mf]
 		MOVDQ	xmm2, [eax]
 		MOVDQ	xmm3, [ecx]
-		
+
 		mov		edx,  [pDct]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]	
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
 
 		ret
-	
+
 ;***********************************************************************
 ;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
 ;***********************************************************************
@@ -104,17 +104,17 @@
 align 16
 WelsQuant4x4Dc_sse2:
 		mov		ax,		[mf]
-		SSE2_Copy8Times xmm3, eax						
-		
+		SSE2_Copy8Times xmm3, eax
+
 		mov		cx, [ff]
-		SSE2_Copy8Times xmm2, ecx						
+		SSE2_Copy8Times xmm2, ecx
 
 		mov		edx,  [pDct]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
-				
-		ret		
-		
+
+		ret
+
 ;***********************************************************************
 ;	void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
 ;***********************************************************************
@@ -121,20 +121,20 @@
 WELS_EXTERN WelsQuantFour4x4_sse2
 align 16
 WelsQuantFour4x4_sse2:
-		mov		eax,  [ff]		
-		mov		ecx,  [mf]			
+		mov		eax,  [ff]
+		mov		ecx,  [mf]
 		MOVDQ	xmm2, [eax]
 		MOVDQ	xmm3, [ecx]
-		
-		mov		edx,  [pDct]	
+
+		mov		edx,  [pDct]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]	
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x20]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x30]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x40]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x50]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x60]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x70]	
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x70]
 
 		ret
 
@@ -144,17 +144,17 @@
 WELS_EXTERN WelsQuantFour4x4Max_sse2
 align 16
 WelsQuantFour4x4Max_sse2:
-		mov		eax,  [ff]		
-		mov		ecx,  [mf]			
+		mov		eax,  [ff]
+		mov		ecx,  [mf]
 		MOVDQ	xmm2, [eax]
 		MOVDQ	xmm3, [ecx]
-		
-		mov		edx,  [pDct]		
+
+		mov		edx,  [pDct]
 		pxor	xmm4, xmm4
 		pxor	xmm5, xmm5
 		pxor	xmm6, xmm6
 		pxor	xmm7, xmm7
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx	   ], xmm4		
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx	   ], xmm4
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10], xmm4
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x20], xmm5
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x30], xmm5
@@ -162,20 +162,20 @@
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x50], xmm6
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x60], xmm7
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x70], xmm7
-		
+
 		SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
-		pmaxsw  xmm0,  xmm4	
+		pmaxsw  xmm0,  xmm4
 		pmaxsw  xmm0,  xmm5
-		pmaxsw  xmm0,  xmm7			
+		pmaxsw  xmm0,  xmm7
 		movdqa	xmm1,  xmm0
 		punpckhqdq	xmm0, xmm1
 		pmaxsw	xmm0, xmm1
 
-		mov		edx,  [max]	
-		movq	[edx], xmm0	
-			
-		ret		
+		mov		edx,  [max]
+		movq	[edx], xmm0
 
+		ret
+
 %macro  MMX_Copy4Times 2
 		movd		%1, %2
 		punpcklwd	%1, %1
@@ -185,10 +185,10 @@
 SECTION .text
 
 %macro MMX_Quant4  4
-		pxor	%2, %2							
-		pcmpgtw	%2, %1							
-		pxor	%1, %2							
-		psubw	%1, %2							
+		pxor	%2, %2
+		pcmpgtw	%2, %1
+		pxor	%1, %2
+		psubw	%1, %2
 		paddusw	%1, %3
 		pmulhuw	%1, %4
 		pxor	%1, %2
@@ -211,13 +211,13 @@
 		movd		mm3,			[eax + 0x40]
 		movd		mm1,			[eax + 0x60]
 		punpcklwd	mm3,			mm1
-		
+
 		mov			cx,				0
 		mov			[eax],			cx
 		mov			[eax + 0x20],	cx
 		mov			[eax + 0x40],	cx
 		mov			[eax + 0x60],	cx
-		
+
 		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
 		movq		mm5,			mm3
 		paddw		mm3,			mm0
@@ -229,22 +229,22 @@
 		paddw		mm1,			mm3
 		psubw		mm3,			mm5
 		punpcklwd	mm1,			mm3
-		
+
 		;quant_2x2_dc
 		mov			ax,				[mf]
-		MMX_Copy4Times	mm3,		eax		
+		MMX_Copy4Times	mm3,		eax
 		mov			cx,				[ff]
 		MMX_Copy4Times	mm2,		ecx
 		MMX_Quant4		mm1,	mm0,	mm2,	mm3
-		
+
 		; store dct_2x2
-		mov			edx,			[dct2x2]	
+		mov			edx,			[dct2x2]
 		movq		[edx],			mm1
 		mov			ecx,			[iChromaDc]
 		movq		[ecx],			mm1
-		
+
 		; pNonZeroCount of dct_2x2
-		pcmpeqb		mm2,			mm2		; mm2 = FF 
+		pcmpeqb		mm2,			mm2		; mm2 = FF
 		pxor		mm3,			mm3
 		packsswb	mm1,			mm3
 		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
@@ -251,10 +251,10 @@
 		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
 		psadbw		mm1,			mm3		;
 		movd		eax,			mm1
-					
+
 		WELSEMMS
 		ret
-	
+
 ;***********************************************************************
 ;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff,  int16_t mf);
 ;***********************************************************************
@@ -269,7 +269,7 @@
 		movd		mm3,			[eax + 0x40]
 		movd		mm1,			[eax + 0x60]
 		punpcklwd	mm3,			mm1
-		
+
 		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
 		movq		mm5,			mm3
 		paddw		mm3,			mm0
@@ -281,16 +281,16 @@
 		paddw		mm1,			mm3
 		psubw		mm3,			mm5
 		punpcklwd	mm1,			mm3
-		
+
 		;quant_2x2_dc
 		mov			ax,				[mf]
-		MMX_Copy4Times	mm3,		eax		
+		MMX_Copy4Times	mm3,		eax
 		mov			cx,				[ff]
 		MMX_Copy4Times	mm2,		ecx
 		MMX_Quant4		mm1,	mm0,	mm2,	mm3
-		
+
 		; pNonZeroCount of dct_2x2
-		pcmpeqb		mm2,			mm2		; mm2 = FF 
+		pcmpeqb		mm2,			mm2		; mm2 = FF
 		pxor		mm3,			mm3
 		packsswb	mm1,			mm3
 		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
@@ -297,16 +297,16 @@
 		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
 		psadbw		mm1,			mm3		;
 		movd		eax,			mm1
-			
-		WELSEMMS		
-		ret	
-		
-		
-%macro SSE2_DeQuant8 3  
+
+		WELSEMMS
+		ret
+
+
+%macro SSE2_DeQuant8 3
     MOVDQ  %2, %1
     pmullw %2, %3
     MOVDQ  %1, %2
-%endmacro 
+%endmacro
 
 
 ALIGN  16
@@ -329,7 +329,7 @@
 ;***********************************************************************
 ;void WelsDequantFour4x4_sse2(int16_t *pDct, const uint16_t* mf);
 ;***********************************************************************
-    
+
 align 16
 
 WELS_EXTERN WelsDequantFour4x4_sse2
@@ -356,15 +356,15 @@
 WELS_EXTERN WelsDequantIHadamard4x4_sse2
 align 16
 WelsDequantIHadamard4x4_sse2:
-		mov			eax,			[esp + 4]				
+		mov			eax,			[esp + 4]
 		mov			cx,				[esp + 8]
-		
+
 		; WelsDequantLumaDc4x4
-		SSE2_Copy8Times	xmm1,		ecx		
+		SSE2_Copy8Times	xmm1,		ecx
 		;psrlw		xmm1,		2		; for the (>>2) in ihdm
 		MOVDQ		xmm0,		[eax]
 		MOVDQ		xmm2,		[eax+0x10]
-		pmullw		xmm0,		xmm1		
+		pmullw		xmm0,		xmm1
 		pmullw		xmm2,		xmm1
 
 		; ihdm_4x4
@@ -371,24 +371,23 @@
 		movdqa		xmm1,		xmm0
 		psrldq		xmm1,		8
 		movdqa		xmm3,		xmm2
-		psrldq		xmm3,		8		
-		
-		SSE2_SumSub		xmm0, xmm3,	xmm5					; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3	
-		SSE2_SumSub		xmm1, xmm2, xmm5					; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2														
+		psrldq		xmm3,		8
+
+		SSE2_SumSub		xmm0, xmm3,	xmm5					; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
+		SSE2_SumSub		xmm1, xmm2, xmm5					; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
 		SSE2_SumSub		xmm3, xmm2, xmm5					; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
 		SSE2_SumSub		xmm0, xmm1, xmm5               		; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
 
-		SSE2_TransTwo4x4W	xmm2, xmm1, xmm3, xmm0, xmm4		
-		SSE2_SumSub		xmm2, xmm4,	xmm5		
-		SSE2_SumSub		xmm1, xmm0, xmm5																		
-		SSE2_SumSub		xmm4, xmm0, xmm5							
-		SSE2_SumSub		xmm2, xmm1, xmm5 
+		SSE2_TransTwo4x4W	xmm2, xmm1, xmm3, xmm0, xmm4
+		SSE2_SumSub		xmm2, xmm4,	xmm5
+		SSE2_SumSub		xmm1, xmm0, xmm5
+		SSE2_SumSub		xmm4, xmm0, xmm5
+		SSE2_SumSub		xmm2, xmm1, xmm5
 		SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
-		
+
 		punpcklqdq	xmm0,		xmm1
 		MOVDQ		[eax],		xmm0
-		
+
 		punpcklqdq	xmm2,		xmm3
-		MOVDQ		[eax+16],	xmm2			
+		MOVDQ		[eax+16],	xmm2
 		ret
-	
\ No newline at end of file
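
SSE2_Quant8 and MMX_Quant4 above implement sign-magnitude quantization on 8 (or 4) coefficients at once: pcmpgtw builds a 0/-1 sign mask, the pxor/psubw pair takes |x|, paddusw adds the rounding offset ff with unsigned saturation, pmulhuw multiplies by the scale mf keeping only the high 16 bits (an implicit >> 16), and a second pxor/psubw restores the sign. SSE2_DeQuant8 is the inverse direction, a plain pmullw by the per-coefficient mf table. A one-coefficient C model (hypothetical names; assumes arithmetic right shift of signed values, which holds on the x86 targets here):

#include <stdint.h>

static int16_t Quant_c (int16_t x, uint16_t ff, uint16_t mf) {
  int16_t  iMask = (int16_t) (x >> 15);               /* 0 if x >= 0, -1 if x < 0 (pcmpgtw) */
  uint16_t uiAbs = (uint16_t) ((x ^ iMask) - iMask);  /* |x|             (pxor, psubw)      */
  uint32_t uiSum = (uint32_t) uiAbs + ff;             /* paddusw saturates at 0xFFFF        */
  if (uiSum > 0xFFFF) uiSum = 0xFFFF;
  int16_t  iQ = (int16_t) ((uiSum * mf) >> 16);       /* pmulhuw: high 16 bits              */
  return (int16_t) ((iQ ^ iMask) - iMask);            /* restore sign    (pxor, psubw)      */
}

WelsQuantFour4x4Max_sse2 additionally folds the quantized magnitudes through pmaxsw and writes the per-4x4-block maxima to the max output, presumably so the encoder can detect all-zero blocks cheaply; WelsHadamardQuant2x2_mmx first applies a 2x2 Hadamard to the four chroma DC coefficients and returns the number of nonzero results via the pcmpeqb/psubsb/psadbw sequence.
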
--- a/codec/encoder/core/asm/satd_sad.asm
+++ b/codec/encoder/core/asm/satd_sad.asm
@@ -37,7 +37,7 @@
 ;*      WelsSampleSatd16x8_sse2
 ;*      WelsSampleSatd8x16_sse2
 ;*      WelsSampleSatd16x16_sse2
-;*      
+;*
 ;*      WelsSampleSad16x8_sse2
 ;*      WelsSampleSad16x16_sse2
 ;*
@@ -99,12 +99,12 @@
 
 %macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
    SSE2_SumSub %1, %2, %5
-   SSE2_SumSub %3, %4, %5 
-   SSE2_SumSub %2, %4, %5 
-   SSE2_SumSub %1, %3, %5 
-%endmacro 
+   SSE2_SumSub %3, %4, %5
+   SSE2_SumSub %2, %4, %5
+   SSE2_SumSub %1, %3, %5
+%endmacro
 
-%macro SSE2_SumAbs4 7  
+%macro SSE2_SumAbs4 7
 	WELS_AbsW %1, %3
 	WELS_AbsW %2, %3
 	WELS_AbsW %4, %6
@@ -113,13 +113,13 @@
 	paddusw       %4, %5
 	paddusw       %7, %1
 	paddusw       %7, %4
-%endmacro 
+%endmacro
 
 %macro  SSE2_SumWHorizon 3
 	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
 	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04 
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26 
+	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
 	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
 	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
 	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
@@ -132,12 +132,12 @@
 	lea                 ecx, [ecx+2*edx]
 	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[eax],[ecx]
 	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[eax+ebx],[ecx+edx]
-	
+
 	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
 	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5 
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
 	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-	
+
 	lea					eax,    [eax+2*ebx]
     lea					ecx,    [ecx+2*edx]
 	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[eax],[ecx]
@@ -146,11 +146,11 @@
 	lea                 ecx, [ecx+2*edx]
 	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[eax],[ecx]
 	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[eax+ebx],[ecx+edx]
-	
+
 	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
 	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5 
-	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6	
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
+	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
 %endmacro
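
SSE2_GetSatd8x8 above computes the sum of absolute transformed differences of an 8x8 block: SSE2_LoadDiff8P loads and subtracts the two blocks row by row, SSE2_HDMTwo4x4 runs the 4x4 Hadamard butterflies in one direction, SSE2_TransTwo4x4W transposes, a second SSE2_HDMTwo4x4 runs them in the other direction, and SSE2_SumAbs4 accumulates the absolute values into xmm6. The callers divide by 2 at the end (psrlw xmm6, 1 or shr eax, 1), the conventional SATD normalization. A compact scalar model of the 4x4 case (illustrative; any sign or row permutation of the Hadamard matrix yields the same sum of absolute values):

#include <stdint.h>
#include <stdlib.h>

static int32_t Satd4x4_c (const uint8_t *pA, int32_t iStrideA,
                          const uint8_t *pB, int32_t iStrideB) {
  int d[4][4], t[4];
  for (int i = 0; i < 4; i++)            /* residual block */
    for (int j = 0; j < 4; j++)
      d[i][j] = pA[i * iStrideA + j] - pB[i * iStrideB + j];
  for (int i = 0; i < 4; i++) {          /* horizontal Hadamard butterflies */
    t[0] = d[i][0] + d[i][3]; t[3] = d[i][0] - d[i][3];
    t[1] = d[i][1] + d[i][2]; t[2] = d[i][1] - d[i][2];
    d[i][0] = t[0] + t[1]; d[i][1] = t[0] - t[1];
    d[i][2] = t[3] + t[2]; d[i][3] = t[3] - t[2];
  }
  int32_t iSum = 0;
  for (int j = 0; j < 4; j++) {          /* vertical butterflies, then |.| */
    t[0] = d[0][j] + d[3][j]; t[3] = d[0][j] - d[3][j];
    t[1] = d[1][j] + d[2][j]; t[2] = d[1][j] - d[2][j];
    iSum += abs (t[0] + t[1]) + abs (t[0] - t[1])
          + abs (t[3] + t[2]) + abs (t[3] - t[2]);
  }
  return iSum / 2;
}
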
 
 ;***********************************************************************
@@ -165,8 +165,8 @@
 	mov       eax,  [esp+8]
 	mov       ebx,  [esp+12]
 	mov       ecx,  [esp+16]
-	mov       edx,  [esp+20]    
-	
+	mov       edx,  [esp+20]
+
     movd      xmm0, [eax]
     movd      xmm1, [eax+ebx]
     lea       eax , [eax+2*ebx]
@@ -174,7 +174,7 @@
     movd      xmm3, [eax+ebx]
     punpckldq xmm0, xmm2
     punpckldq xmm1, xmm3
-   
+
     movd      xmm4, [ecx]
     movd      xmm5, [ecx+edx]
     lea       ecx , [ecx+2*edx]
@@ -188,7 +188,7 @@
     punpcklbw xmm1, xmm6
     punpcklbw xmm4, xmm6
     punpcklbw xmm5, xmm6
-    
+
     psubw     xmm0, xmm4
     psubw     xmm1, xmm5
 
@@ -196,7 +196,7 @@
     paddw     xmm0, xmm1
     psubw     xmm2, xmm1
     SSE2_XSawp qdq, xmm0, xmm2, xmm3
-    
+
     movdqa     xmm4, xmm0
     paddw      xmm0, xmm3
     psubw      xmm4, xmm3
@@ -204,7 +204,7 @@
     movdqa         xmm2, xmm0
     punpcklwd      xmm0, xmm4
     punpckhwd      xmm4, xmm2
-    
+
 	SSE2_XSawp     dq,  xmm0, xmm4, xmm3
 	SSE2_XSawp     qdq, xmm0, xmm3, xmm5
 
@@ -211,16 +211,16 @@
     movdqa         xmm7, xmm0
     paddw          xmm0, xmm5
     psubw          xmm7, xmm5
-    
+
 	SSE2_XSawp     qdq,  xmm0, xmm7, xmm1
 
     movdqa         xmm2, xmm0
     paddw          xmm0, xmm1
     psubw          xmm2, xmm1
-    
-    WELS_AbsW  xmm0, xmm3   
+
+    WELS_AbsW  xmm0, xmm3
     paddusw        xmm6, xmm0
-	WELS_AbsW  xmm2, xmm4   
+	WELS_AbsW  xmm2, xmm4
     paddusw        xmm6, xmm2
     SSE2_SumWHorizon1  xmm6, xmm4
 	movd           eax,  xmm6
@@ -228,7 +228,7 @@
     shr            eax,  1
 	pop            ebx
 	ret
- 
+
  ;***********************************************************************
  ;
 ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
@@ -241,16 +241,16 @@
 	 mov    eax,    [esp+8]
 	 mov    ebx,    [esp+12]
 	 mov    ecx,    [esp+16]
-	 mov    edx,    [esp+20]    
+	 mov    edx,    [esp+20]
 	 pxor   xmm6,   xmm6
-     pxor   xmm7,   xmm7     
-     SSE2_GetSatd8x8	 
+     pxor   xmm7,   xmm7
+     SSE2_GetSatd8x8
      psrlw   xmm6,  1
 	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
 	 movd    eax,   xmm6
 	 pop     ebx
 	 ret
- 
+
  ;***********************************************************************
  ;
 ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
@@ -263,15 +263,15 @@
 	 mov    eax,    [esp+8]
 	 mov    ebx,    [esp+12]
 	 mov    ecx,    [esp+16]
-	 mov    edx,    [esp+20]    
+	 mov    edx,    [esp+20]
 	 pxor   xmm6,   xmm6
-     pxor   xmm7,   xmm7  
-        
-	 SSE2_GetSatd8x8	 
+     pxor   xmm7,   xmm7
+
+	 SSE2_GetSatd8x8
      lea    eax,    [eax+2*ebx]
-     lea    ecx,    [ecx+2*edx]     
-	 SSE2_GetSatd8x8	
-	  
+     lea    ecx,    [ecx+2*edx]
+	 SSE2_GetSatd8x8
+
 	 psrlw   xmm6,  1
 	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
 	 movd    eax,   xmm6
@@ -290,15 +290,15 @@
 	mov    eax,    [esp+8]
 	mov    ebx,    [esp+12]
 	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]    
+	mov    edx,    [esp+20]
 	pxor   xmm6,   xmm6
     pxor   xmm7,   xmm7
-    
+
 	SSE2_GetSatd8x8
 	mov    eax,    [esp+8]
     mov    ecx,    [esp+16]
     add    eax,    8
-    add    ecx,    8    
+    add    ecx,    8
 	SSE2_GetSatd8x8
 
 	psrlw   xmm6,  1
@@ -319,25 +319,25 @@
 	mov    eax,    [esp+8]
 	mov    ebx,    [esp+12]
 	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]    
+	mov    edx,    [esp+20]
 	pxor   xmm6,   xmm6
     pxor   xmm7,   xmm7
-    
-	SSE2_GetSatd8x8		
+
+	SSE2_GetSatd8x8
 	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]	
+	lea    ecx,    [ecx+2*edx]
 	SSE2_GetSatd8x8
-	
+
 	mov    eax,    [esp+8]
 	mov    ecx,    [esp+16]
 	add    eax,    8
 	add    ecx,    8
-	
-	SSE2_GetSatd8x8	
+
+	SSE2_GetSatd8x8
 	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]	
+	lea    ecx,    [ecx+2*edx]
 	SSE2_GetSatd8x8
-	
+
  ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
     psrlw   xmm6,  1
 	SSE2_SumWHorizon   xmm6,xmm4,xmm7
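
The "necessarily even" remark above holds because the outputs of the final Hadamard butterfly stage come in (a + b, a - b) pairs, and for any integers a and b

	|a + b| + |a - b| = 2 * max(|a|, |b|),

which is always even. A column sum of absolute transformed differences is therefore even, so halving with psrlw xmm6, 1 before SSE2_SumWHorizon loses no information while keeping the 16-bit unsigned accumulator further from saturation.
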
@@ -353,18 +353,18 @@
 
 ;***********************************************************************
 ;
-;Pixel_satd_intra_sse2 BEGIN 
+;Pixel_satd_intra_sse2 BEGIN
 ;
 ;***********************************************************************
 
-%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 
+%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
 	pmaddubsw    %1, xmm5
 	movdqa       %2, %1
 	pmaddwd      %1, xmm7
 	pmaddwd      %2, xmm6
 	movdqa       %3, %1
-	punpckldq    %1, %2 
-	punpckhdq    %2, %3 
+	punpckldq    %1, %2
+	punpckhdq    %2, %3
 	movdqa       %3, %1
 	punpcklqdq   %1, %2
 	punpckhqdq   %3, %2
@@ -373,14 +373,14 @@
 	packssdw     %1, %3
 	psllw        %1, 2
 %endmacro
-%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2  
+%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
 	pmaddubsw    %1, xmm5
 	movdqa       %2, %1
 	pmaddwd      %1, xmm7
 	pmaddwd      %2, xmm6
 	movdqa       %3, %1
-	punpckldq    %1, %2 
-	punpckhdq    %2, %3 
+	punpckldq    %1, %2
+	punpckhdq    %2, %3
 	movdqa       %3, %1
 	punpcklqdq   %1, %2
 	punpckhqdq   %3, %2
@@ -387,7 +387,7 @@
 ;    paddd        xmm4, %1 ;for dc
 ;	 paddd        xmm4, %3 ;for dc
 	movdqa       %4, %1
-	punpcklqdq   %4, %3 
+	punpcklqdq   %4, %3
 	packssdw     %1, %3
 	psllw        %1, 2
 %endmacro
@@ -415,25 +415,25 @@
 	pinsrw      xmm0,   word[esi+%2+8], 4
 	psubsw      xmm0,   xmm7
 	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0 
+	paddw       xmm4,   xmm0
 	pxor        xmm0,   xmm0
 	pinsrw      xmm0,   word[esi+%2+2],  0
 	pinsrw      xmm0,   word[esi+%2+10], 4
 	psubsw      xmm0,   xmm1
 	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0 
+	paddw       xmm4,   xmm0
 	pxor        xmm0,   xmm0
 	pinsrw      xmm0,   word[esi+%2+4],  0
 	pinsrw      xmm0,   word[esi+%2+12], 4
 	psubsw      xmm0,   xmm3
 	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0 
+	paddw       xmm4,   xmm0
 	pxor        xmm0,   xmm0
 	pinsrw      xmm0,   word[esi+%2+6],  0
 	pinsrw      xmm0,   word[esi+%2+14], 4
 	psubsw      xmm0,   xmm2
 	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0 
+	paddw       xmm4,   xmm0
 %endmacro
 %macro SSE41_GetX38x4SatdH  3
 	movq        xmm0,   [esi+%3+8*%1]
@@ -455,7 +455,7 @@
 	psubsw      xmm0,   xmm7
 	pabsw       xmm0,   xmm0
 	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2 
+	paddw       xmm6,   xmm2
 %endmacro
 %macro SSE41_ChromaGetX38x4SatdDC 1
 	shl         %1,     4
@@ -463,13 +463,13 @@
 	psubsw      xmm0,   xmm7
 	pabsw       xmm0,   xmm0
 	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2 
+	paddw       xmm6,   xmm2
 %endmacro
 %macro SSE41_I16x16GetX38x4Satd 2
 	SSE41_GetX38x4SatdDec
 	SSE41_GetX38x4SatdV   %1, %2
 	SSE41_GetX38x4SatdH   %1, %2, 32
-	SSE41_I16X16GetX38x4SatdDC 
+	SSE41_I16X16GetX38x4SatdDC
 %endmacro
 %macro SSE41_ChromaGetX38x4Satd 2
 	SSE41_GetX38x4SatdDec
@@ -478,11 +478,11 @@
 	SSE41_ChromaGetX38x4SatdDC %1
 %endmacro
 %macro SSE41_HSum8W 3
-	pmaddwd     %1, %2 
-	movhlps     %3, %1 
-	paddd       %1, %3 
-	pshuflw     %3, %1,0Eh 
-	paddd       %1, %3 
+	pmaddwd     %1, %2
+	movhlps     %3, %1
+	paddd       %1, %3
+	pshuflw     %3, %1,0Eh
+	paddd       %1, %3
 %endmacro
 
 WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
@@ -493,7 +493,7 @@
 	mov    ecx,    [esp+16]
 	mov    edx,    [esp+20]
 	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]    
+	mov    ebx,    [esp+28]
 	mov    esi,    [esp+40] ;temp_satd
 	pxor        xmm4,   xmm4
 	movdqa      xmm5,   [HSumSubDB1]
@@ -507,29 +507,29 @@
 	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
 	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
 	movdqa      [esi],  xmm0 ;V
-	movdqa      [esi+16], xmm1 
+	movdqa      [esi+16], xmm1
 	add         ecx,    edx
 	pinsrb      xmm0,   byte[ecx-1], 0
 	pinsrb      xmm0,   byte[ecx+edx-1], 1
-	lea         ecx,    [ecx+2*edx]  
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     2
 	pinsrb      xmm0,   byte[ecx+edx-1], 3
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     4
 	pinsrb      xmm0,   byte[ecx+edx-1], 5
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     6
 	pinsrb      xmm0,   byte[ecx+edx-1], 7
-	lea         ecx,    [ecx+2*edx]  
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     8
 	pinsrb      xmm0,   byte[ecx+edx-1], 9
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     10
 	pinsrb      xmm0,   byte[ecx+edx-1], 11
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     12
 	pinsrb      xmm0,   byte[ecx+edx-1], 13
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     14
 	pinsrb      xmm0,   byte[ecx+edx-1], 15
 	movhlps		xmm1,   xmm0
@@ -549,7 +549,7 @@
 	pxor        xmm6,   xmm6 ;DC
 	mov         ecx,    0
 	mov         edi,    0
-.loop16x16_get_satd:    
+.loop16x16_get_satd:
 .loopStart1:
 	SSE41_I16x16GetX38x4Satd ecx, edi
 	inc          ecx
@@ -562,8 +562,8 @@
 	mov         ecx, 0
 	add         edi, 16
 	jmp         .loop16x16_get_satd
- .loop16x16_get_satd_end:   
-	MMX_DW_1_2REG    xmm0, xmm1 
+ .loop16x16_get_satd_end:
+	MMX_DW_1_2REG    xmm0, xmm1
 	psrlw       xmm4, 1 ;/2
 	psrlw       xmm5, 1 ;/2
 	psrlw       xmm6, 1 ;/2
@@ -570,7 +570,7 @@
 	SSE41_HSum8W     xmm4, xmm0, xmm1
 	SSE41_HSum8W     xmm5, xmm0, xmm1
 	SSE41_HSum8W     xmm6, xmm0, xmm1
-	
+
 	; comparing order: DC H V
 	movd      ebx, xmm6 ;DC
 	movd      edi, xmm5 ;H
@@ -577,33 +577,33 @@
 	movd      ecx, xmm4 ;V
 	mov      edx, [esp+36]
 	shl       edx, 1
-	add       edi, edx 
-	add       ebx, edx 
+	add       edi, edx
+	add       ebx, edx
 	mov       edx, [esp+32]
 	cmp       ebx, edi
 	jge near   not_dc_16x16
 	cmp        ebx, ecx
 	jge near   not_dc_h_16x16
-	
+
 	; for DC mode
 	mov       dword[edx], 2;I16_PRED_DC
-	mov       eax, ebx 
+	mov       eax, ebx
 	jmp near return_satd_intra_16x16_x3
 not_dc_16x16:
-	; for H mode 
+	; for H mode
 	cmp       edi, ecx
 	jge near   not_dc_h_16x16
 	mov       dword[edx], 1;I16_PRED_H
-	mov       eax, edi 
+	mov       eax, edi
 	jmp near return_satd_intra_16x16_x3
 not_dc_h_16x16:
 	; for V mode
 	mov       dword[edx], 0;I16_PRED_V
 	mov       eax, ecx
-return_satd_intra_16x16_x3: 
+return_satd_intra_16x16_x3:
 	WELSEMMS
-	pop         edi 
-	pop         esi 
+	pop         edi
+	pop         esi
 	pop         ebx
 ret
 
@@ -619,13 +619,13 @@
 	add         ecx,    edx
 	pinsrb      xmm0,   byte[ecx-1], 0
 	pinsrb      xmm0,   byte[ecx+edx-1], 1
-	lea         ecx,    [ecx+2*edx]  
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     2
 	pinsrb      xmm0,   byte[ecx+edx-1], 3
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     4
 	pinsrb      xmm0,   byte[ecx+edx-1], 5
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     6
 	pinsrb      xmm0,   byte[ecx+edx-1], 7
 	punpcklqdq  xmm0,   xmm0
@@ -634,10 +634,10 @@
 ;(sum+2)>>2
 	movdqa      xmm6,   [PDQ2]
 	movdqa      xmm5,   xmm4
-	punpckhqdq  xmm5,   xmm1    
+	punpckhqdq  xmm5,   xmm1
 	paddd       xmm5,   xmm6
 	psrld       xmm5,   2
-;(sum1+sum2+4)>>3   
+;(sum1+sum2+4)>>3
 	paddd       xmm6,   xmm6
 	paddd       xmm4,   xmm1
 	paddd       xmm4,   xmm6
@@ -644,8 +644,8 @@
 	psrld       xmm4,   3
 ;satd *16
 	pslld       xmm5,   4
-	pslld       xmm4,   4    
-;temp satd    
+	pslld       xmm4,   4
+;temp satd
 	movdqa      xmm6,   xmm4
 	punpcklqdq  xmm4,   xmm5
 	psllq       xmm4,   32
@@ -655,12 +655,12 @@
 	psllq       xmm5,   32
 	psrlq       xmm5,   32
 	movdqa      [esi+48], xmm5
-	
+
 	pxor        xmm4,   xmm4 ;V
 	pxor        xmm5,   xmm5 ;H
 	pxor        xmm6,   xmm6 ;DC
 	mov         ecx,    0
-loop_chroma_satdx3_cb_cr:    
+loop_chroma_satdx3_cb_cr:
 	SSE41_ChromaGetX38x4Satd ecx, 0
 	inc             ecx
 	cmp             ecx, 2
@@ -668,13 +668,13 @@
 %endmacro
 
 %macro SSEReg2MMX 3
-	movdq2q     %2, %1 
-	movhlps     %1, %1 
-	movdq2q     %3, %1 
+	movdq2q     %2, %1
+	movhlps     %1, %1
+	movdq2q     %3, %1
 %endmacro
 %macro MMXReg2SSE 4
-	movq2dq     %1, %3 
-	movq2dq     %2, %4 
+	movq2dq     %1, %3
+	movq2dq     %2, %4
 	punpcklqdq  %1, %2
 %endmacro
 ;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
@@ -687,10 +687,10 @@
 	mov    ecx,    [esp+16]
 	mov    edx,    [esp+20]
 	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]    
+	mov    ebx,    [esp+28]
 	mov    esi,    [esp+40] ;temp_satd
 	xor    edi,    edi
-loop_chroma_satdx3: 
+loop_chroma_satdx3:
 	SSE41_ChromaGetX38x8Satd
 	cmp             edi, 1
 	je              loop_chroma_satdx3end
@@ -701,16 +701,16 @@
 	mov         ecx,  [esp+44]
 	mov         eax,  [esp+48]
 	jmp         loop_chroma_satdx3
-loop_chroma_satdx3end:    
+loop_chroma_satdx3end:
 	MMXReg2SSE  xmm0, xmm3, mm0, mm1
 	MMXReg2SSE  xmm1, xmm3, mm2, mm3
 	MMXReg2SSE  xmm2, xmm3, mm5, mm6
-	
+
 	paddw       xmm4, xmm0
 	paddw       xmm5, xmm1
 	paddw       xmm6, xmm2
-	
-	MMX_DW_1_2REG    xmm0, xmm1 
+
+	MMX_DW_1_2REG    xmm0, xmm1
 	psrlw       xmm4, 1 ;/2
 	psrlw       xmm5, 1 ;/2
 	psrlw       xmm6, 1 ;/2
@@ -730,57 +730,57 @@
 	jge near   not_dc_8x8
 	cmp        ebx, ecx
 	jge near   not_dc_h_8x8
-	
+
 	; for DC mode
 	mov       dword[edx], 0;I8_PRED_DC
-	mov       eax, ebx 
+	mov       eax, ebx
 	jmp near return_satd_intra_8x8_x3
 not_dc_8x8:
-	; for H mode 
+	; for H mode
 	cmp       edi, ecx
 	jge near   not_dc_h_8x8
 	mov       dword[edx], 1;I8_PRED_H
-	mov       eax, edi 
+	mov       eax, edi
 	jmp near return_satd_intra_8x8_x3
 not_dc_h_8x8:
 	; for V mode
 	mov       dword[edx], 2;I8_PRED_V
 	mov       eax, ecx
-return_satd_intra_8x8_x3: 
+return_satd_intra_8x8_x3:
 	WELSEMMS
-	pop         edi 
-	pop         esi 
+	pop         edi
+	pop         esi
 	pop         ebx
 ret
 
-	
+
 ;***********************************************************************
 ;
-;Pixel_satd_intra_sse2 END 
+;Pixel_satd_intra_sse2 END
 ;
 ;***********************************************************************
 %macro SSSE3_Get16BSadHVDC 2
-  movd        xmm6,%1 
-  pshufb      xmm6,xmm1 
+  movd        xmm6,%1
+  pshufb      xmm6,xmm1
   movdqa      %1,  xmm6
-  movdqa      xmm0,%2 
-  psadbw      xmm0,xmm7 
-  paddw       xmm4,xmm0 
   movdqa      xmm0,%2
-  psadbw      xmm0,xmm5 
-  paddw       xmm2,xmm0 
+  psadbw      xmm0,xmm7
+  paddw       xmm4,xmm0
+  movdqa      xmm0,%2
+  psadbw      xmm0,xmm5
+  paddw       xmm2,xmm0
   psadbw      xmm6,%2
-  paddw       xmm3,xmm6 
+  paddw       xmm3,xmm6
 %endmacro
 %macro WelsAddDCValue 4
     movzx   %2, byte %1
-    mov    %3, %2 
+    mov    %3, %2
     add     %4, %2
-%endmacro   
+%endmacro
 
 ;***********************************************************************
 ;
-;Pixel_sad_intra_ssse3 BEGIN 
+;Pixel_sad_intra_ssse3 BEGIN
 ;
 ;***********************************************************************
 WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
@@ -792,14 +792,14 @@
 	mov    edx,    [esp+20]
 	mov    edi,    [esp+40] ;temp_sad
 	sub    ecx,    edx
-    movdqa      xmm5,[ecx] 
+    movdqa      xmm5,[ecx]
     pxor        xmm0,xmm0
-    psadbw      xmm0,xmm5 
-    movhlps     xmm1,xmm0 
-    paddw       xmm0,xmm1 
+    psadbw      xmm0,xmm5
+    movhlps     xmm1,xmm0
+    paddw       xmm0,xmm1
     movd        eax,xmm0
-     
-    add         ecx,edx 
+
+    add         ecx,edx
     lea         ebx, [edx+2*edx]
     WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
     WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
@@ -824,45 +824,45 @@
     WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
     WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
     sub        edi, 192
-    add         eax,10h 
-    shr         eax,5 
-    movd        xmm7,eax 
+    add         eax,10h
+    shr         eax,5
+    movd        xmm7,eax
     pxor        xmm1,xmm1
     pshufb      xmm7,xmm1
-    pxor        xmm4,xmm4 
-    pxor        xmm3,xmm3 
-    pxor        xmm2,xmm2 
-;sad begin  
+    pxor        xmm4,xmm4
+    pxor        xmm3,xmm3
+    pxor        xmm2,xmm2
+;sad begin
 	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]    
+	mov    ebx,    [esp+28]
     lea         esi, [ebx+2*ebx]
     SSSE3_Get16BSadHVDC [edi], [eax]
     SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
     SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
     SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64  
+    add         edi, 64
     lea         eax, [eax+4*ebx]
     SSSE3_Get16BSadHVDC [edi], [eax]
     SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
     SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
     SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64  
+    add         edi, 64
     lea         eax, [eax+4*ebx]
     SSSE3_Get16BSadHVDC [edi], [eax]
     SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
     SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
     SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64  
+    add         edi, 64
     lea         eax, [eax+4*ebx]
     SSSE3_Get16BSadHVDC [edi], [eax]
     SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
     SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
     SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    
-    pslldq      xmm3,4 
-    por         xmm3,xmm2 
-    movhlps     xmm1,xmm3 
-    paddw       xmm3,xmm1 
+
+    pslldq      xmm3,4
+    por         xmm3,xmm2
+    movhlps     xmm1,xmm3
+    paddw       xmm3,xmm1
     movhlps     xmm0,xmm4
     paddw       xmm4,xmm0
 ; comparing order: DC H V
@@ -872,8 +872,8 @@
 	movd        esi, xmm3 ;H
 	mov         eax, [esp+36] ;lamda
 	shl         eax, 1
-	add         esi, eax 
-	add         ebx, eax 
+	add         esi, eax
+	add         ebx, eax
 	mov         edx, [esp+32]
 	cmp         ebx, esi
 	jge near   not_dc_16x16_sad
@@ -881,7 +881,7 @@
 	jge near   not_dc_h_16x16_sad
 	; for DC mode
 	mov       dword[edx], 2;I16_PRED_DC
-	mov       eax, ebx 
+	mov       eax, ebx
     sub        edi, 192
 %assign x 0
 %rep 16
@@ -890,11 +890,11 @@
 %endrep
 	jmp near return_sad_intra_16x16_x3
 not_dc_16x16_sad:
-	; for H mode 
+	; for H mode
 	cmp       esi, ecx
 	jge near   not_dc_h_16x16_sad
 	mov       dword[edx], 1;I16_PRED_H
-	mov       eax, esi 
+	mov       eax, esi
 	jmp near return_sad_intra_16x16_x3
 not_dc_h_16x16_sad:
 	; for V mode
@@ -914,12 +914,12 @@
 
 ;***********************************************************************
 ;
-;Pixel_sad_intra_ssse3 END 
+;Pixel_sad_intra_ssse3 END
 ;
 ;***********************************************************************
 ;***********************************************************************
 ;
-;Pixel_satd_wxh_sse41 BEGIN 
+;Pixel_satd_wxh_sse41 BEGIN
 ;
 ;***********************************************************************
 
@@ -934,9 +934,9 @@
 	movq             xmm2, [ecx]
 	punpcklqdq       xmm2, xmm2
 	pmaddubsw        xmm2, xmm7
-	movq             xmm3, [ecx+edx]	
-	punpcklqdq       xmm3, xmm3	
-	pmaddubsw        xmm3, xmm7	
+	movq             xmm3, [ecx+edx]
+	punpcklqdq       xmm3, xmm3
+	pmaddubsw        xmm3, xmm7
 	psubsw           xmm0, xmm2
 	psubsw           xmm1, xmm3
 	movq             xmm2, [eax+2*ebx]
@@ -948,12 +948,12 @@
 	movq             xmm4, [ecx+2*edx]
 	punpcklqdq       xmm4, xmm4
 	pmaddubsw        xmm4, xmm7
-	movq             xmm5, [ecx+edi]	
-	punpcklqdq       xmm5, xmm5	
+	movq             xmm5, [ecx+edi]
+	punpcklqdq       xmm5, xmm5
 	pmaddubsw        xmm5, xmm7
 	psubsw           xmm2, xmm4
 	psubsw           xmm3, xmm5
-	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4	
+	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4
 	pabsw            xmm0, xmm0
 	pabsw            xmm2, xmm2
 	pabsw            xmm1, xmm1
@@ -970,18 +970,18 @@
 	pslld            xmm2, 16
 	psrld            xmm4, 16
 	por              xmm2, xmm4
-	pmaxuw           xmm0, xmm2	
+	pmaxuw           xmm0, xmm2
 	paddw            xmm6, xmm0
 %endmacro
 
 %macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
-	MMX_DW_1_2REG    %3, %4 
-	pmaddwd     %2, %3 
-	movhlps     %4, %2 
-	paddd       %2, %4 
-	pshuflw     %4, %2,0Eh 
-	paddd       %2, %4 
-	movd		%1, %2 
+	MMX_DW_1_2REG    %3, %4
+	pmaddwd     %2, %3
+	movhlps     %4, %2
+	paddd       %2, %4
+	pshuflw     %4, %2,0Eh
+	paddd       %2, %4
+	movd		%1, %2
 %endmacro
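
SSSE3_SumWHorizon reduces eight word sums to one scalar: pmaddwd against an all-ones vector (built by MMX_DW_1_2REG) collapses word pairs into dwords, then movhlps/pshuflw fold the four dwords down to one. A hedged intrinsics equivalent (SSE2-level; byte shifts replace the shuffle-based folds, which is equivalent here):

#include <emmintrin.h>
#include <stdint.h>

/* Horizontal sum of eight signed 16-bit lanes, mirroring the macro above. */
static int32_t SumWords(__m128i v)
{
    __m128i d = _mm_madd_epi16(v, _mm_set1_epi16(1)); /* pmaddwd vs ones */
    d = _mm_add_epi32(d, _mm_srli_si128(d, 8));       /* fold high qword */
    d = _mm_add_epi32(d, _mm_srli_si128(d, 4));       /* fold high dword */
    return _mm_cvtsi128_si32(d);                      /* movd            */
}
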
 ;***********************************************************************
 ;
@@ -990,53 +990,53 @@
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd4x4_sse41
 WelsSampleSatd4x4_sse41:
-	push        ebx  
-	mov         eax,[esp+8] 
-	mov         ebx,[esp+12] 
-	mov         ecx,[esp+16] 
-	mov         edx,[esp+20] 
-	movdqa      xmm4,[HSwapSumSubDB1] 
-	movd        xmm2,[ecx] 
-	movd        xmm5,[ecx+edx] 
-	shufps      xmm2,xmm5,0 
-	movd        xmm3,[ecx+edx*2] 
+	push        ebx
+	mov         eax,[esp+8]
+	mov         ebx,[esp+12]
+	mov         ecx,[esp+16]
+	mov         edx,[esp+20]
+	movdqa      xmm4,[HSwapSumSubDB1]
+	movd        xmm2,[ecx]
+	movd        xmm5,[ecx+edx]
+	shufps      xmm2,xmm5,0
+	movd        xmm3,[ecx+edx*2]
 	lea         ecx, [edx*2+ecx]
-	movd        xmm5,[ecx+edx] 
-	shufps      xmm3,xmm5,0 
-	movd        xmm0,[eax] 
-	movd        xmm5,[eax+ebx] 
-	shufps      xmm0,xmm5,0 
-	movd        xmm1,[eax+ebx*2] 
+	movd        xmm5,[ecx+edx]
+	shufps      xmm3,xmm5,0
+	movd        xmm0,[eax]
+	movd        xmm5,[eax+ebx]
+	shufps      xmm0,xmm5,0
+	movd        xmm1,[eax+ebx*2]
 	lea         eax, [ebx*2+eax]
-	movd        xmm5,[eax+ebx] 
-	shufps      xmm1,xmm5,0 
-	pmaddubsw   xmm0,xmm4 
-	pmaddubsw   xmm1,xmm4 
-	pmaddubsw   xmm2,xmm4 
-	pmaddubsw   xmm3,xmm4 
-	psubw       xmm0,xmm2 
-	psubw       xmm1,xmm3 
-	movdqa      xmm2,xmm0 
-	paddw       xmm0,xmm1 
-	psubw       xmm1,xmm2 
-	movdqa      xmm2,xmm0 
-	punpcklqdq  xmm0,xmm1 
-	punpckhqdq  xmm2,xmm1 
-	movdqa      xmm1,xmm0 
-	paddw       xmm0,xmm2 
-	psubw       xmm2,xmm1 
-	movdqa      xmm1,xmm0 
-	pblendw     xmm0,xmm2,0AAh 
-	pslld       xmm2,16 
-	psrld       xmm1,16 
-	por         xmm2,xmm1 
-	pabsw       xmm0,xmm0 
-	pabsw       xmm2,xmm2 
-	pmaxsw      xmm0,xmm2 
+	movd        xmm5,[eax+ebx]
+	shufps      xmm1,xmm5,0
+	pmaddubsw   xmm0,xmm4
+	pmaddubsw   xmm1,xmm4
+	pmaddubsw   xmm2,xmm4
+	pmaddubsw   xmm3,xmm4
+	psubw       xmm0,xmm2
+	psubw       xmm1,xmm3
+	movdqa      xmm2,xmm0
+	paddw       xmm0,xmm1
+	psubw       xmm1,xmm2
+	movdqa      xmm2,xmm0
+	punpcklqdq  xmm0,xmm1
+	punpckhqdq  xmm2,xmm1
+	movdqa      xmm1,xmm0
+	paddw       xmm0,xmm2
+	psubw       xmm2,xmm1
+	movdqa      xmm1,xmm0
+	pblendw     xmm0,xmm2,0AAh
+	pslld       xmm2,16
+	psrld       xmm1,16
+	por         xmm2,xmm1
+	pabsw       xmm0,xmm0
+	pabsw       xmm2,xmm2
+	pmaxsw      xmm0,xmm2
 	SSSE3_SumWHorizon eax, xmm0, xmm5, xmm7
-	pop         ebx  
-	ret 
- 
+	pop         ebx
+	ret
+
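
WelsSampleSatd4x4_sse41 computes the 4x4 SATD: the pmaddubsw loads against HSwapSumSubDB1 fuse the first sum/difference butterfly with the byte-to-word conversion, the punpck/pblendw sequence completes the 4-point Hadamard in both directions, and the absolute values are summed via SSSE3_SumWHorizon. A scalar sketch; note that final scaling conventions differ between implementations, so the closing >>1 is an assumption, not a claim about this routine:

#include <stdint.h>
#include <stdlib.h>

/* Sum of absolute Hadamard-transformed differences over one 4x4 block. */
static int32_t Satd4x4(const uint8_t *p1, int s1, const uint8_t *p2, int s2)
{
    int d[16], m[16];
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            d[4 * y + x] = p1[y * s1 + x] - p2[y * s2 + x];

    for (int y = 0; y < 4; y++) {          /* horizontal 4-point Hadamard */
        int *r = d + 4 * y;
        int a = r[0] + r[3], b = r[1] + r[2];
        int c = r[0] - r[3], e = r[1] - r[2];
        m[4 * y + 0] = a + b; m[4 * y + 1] = a - b;
        m[4 * y + 2] = c + e; m[4 * y + 3] = c - e;
    }
    int32_t sum = 0;
    for (int x = 0; x < 4; x++) {          /* vertical pass plus |.| */
        int a = m[x] + m[12 + x], b = m[4 + x] + m[8 + x];
        int c = m[x] - m[12 + x], e = m[4 + x] - m[8 + x];
        sum += abs(a + b) + abs(a - b) + abs(c + e) + abs(c - e);
    }
    return sum >> 1;                       /* common scaling; an assumption */
}
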
 ;***********************************************************************
 ;
 ;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
@@ -1051,10 +1051,10 @@
 	mov    eax,    [esp+16]
 	mov    ebx,    [esp+20]
 	mov    ecx,    [esp+24]
-	mov    edx,    [esp+28]    
+	mov    edx,    [esp+28]
 	movdqa      xmm7, [HSumSubDB1]
-	lea         esi,  [ebx+ebx*2] 
-	lea         edi,  [edx+edx*2] 
+	lea         esi,  [ebx+ebx*2]
+	lea         edi,  [edx+edx*2]
 	pxor		xmm6, xmm6
 	SSE41_GetSatd8x4
 	lea			eax,	[eax+4*ebx]
@@ -1065,7 +1065,7 @@
 	pop 		esi
 	pop 		ebx
 	ret
- 
+
 ;***********************************************************************
 ;
 ;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
@@ -1078,17 +1078,17 @@
 	push   esi
 	push   edi
 	push   ebp
-%define pushsize   16	
+%define pushsize   16
 	mov    eax,    [esp+pushsize+4]
 	mov    ebx,    [esp+pushsize+8]
 	mov    ecx,    [esp+pushsize+12]
-	mov    edx,    [esp+pushsize+16]    
+	mov    edx,    [esp+pushsize+16]
 	movdqa      xmm7, [HSumSubDB1]
-	lea         esi,  [ebx+ebx*2] 
-	lea         edi,  [edx+edx*2] 
+	lea         esi,  [ebx+ebx*2]
+	lea         edi,  [edx+edx*2]
 	pxor        xmm6, xmm6
 	mov         ebp,    0
-loop_get_satd_8x16:	
+loop_get_satd_8x16:
 	SSE41_GetSatd8x4
 	lea			eax,  [eax+4*ebx]
 	lea			ecx,  [ecx+4*edx]
@@ -1116,10 +1116,10 @@
 	mov    eax,    [esp+16]
 	mov    ebx,    [esp+20]
 	mov    ecx,    [esp+24]
-	mov    edx,    [esp+28]    
+	mov    edx,    [esp+28]
 	movdqa      xmm7, [HSumSubDB1]
-	lea         esi,  [ebx+ebx*2] 
-	lea         edi,  [edx+edx*2] 
+	lea         esi,  [ebx+ebx*2]
+	lea         edi,  [edx+edx*2]
 	pxor		xmm6,   xmm6
 	SSE41_GetSatd8x4
 	lea			eax,  [eax+4*ebx]
@@ -1144,7 +1144,7 @@
 ;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
 ;
 ;***********************************************************************
-   
+
 WELS_EXTERN WelsSampleSatd16x16_sse41
 align 16
 WelsSampleSatd16x16_sse41:
@@ -1152,17 +1152,17 @@
 	push   esi
 	push   edi
 	push   ebp
-	%define pushsize   16	
+	%define pushsize   16
 	mov    eax,    [esp+pushsize+4]
 	mov    ebx,    [esp+pushsize+8]
 	mov    ecx,    [esp+pushsize+12]
-	mov    edx,    [esp+pushsize+16]    
+	mov    edx,    [esp+pushsize+16]
 	movdqa      xmm7, [HSumSubDB1]
-	lea         esi,  [ebx+ebx*2] 
-	lea         edi,  [edx+edx*2] 
+	lea         esi,  [ebx+ebx*2]
+	lea         edi,  [edx+edx*2]
 	pxor		xmm6,   xmm6
 	mov         ebp,    0
-loop_get_satd_16x16_left:	
+loop_get_satd_16x16_left:
 	SSE41_GetSatd8x4
 	lea			eax,  [eax+4*ebx]
 	lea			ecx,  [ecx+4*edx]
@@ -1206,8 +1206,8 @@
 	lea    ecx,    [ecx+2*edx]
 	movdqu xmm1,   [ecx]
 	MOVDQ  xmm2,   [eax];[eax] must aligned 16
-	psadbw xmm1,   xmm2 
-	paddw  xmm0,   xmm1	
+	psadbw xmm1,   xmm2
+	paddw  xmm0,   xmm1
 	movdqu xmm1,   [ecx+edx]
 	MOVDQ  xmm2,   [eax+ebx]
 	psadbw xmm1,   xmm2
@@ -1218,7 +1218,7 @@
 %macro SSE2_GetSad4x16 0
 	movdqu xmm0,   [ecx]
 	MOVDQ  xmm2,   [eax]
-	psadbw xmm0,   xmm2 
+	psadbw xmm0,   xmm2
 	paddw  xmm7,   xmm0
 	movdqu xmm1,   [ecx+edx]
 	MOVDQ  xmm2,   [eax+ebx]
@@ -1226,8 +1226,8 @@
 	paddw  xmm7,   xmm1
 	movdqu xmm1,   [ecx+2*edx]
 	MOVDQ  xmm2,   [eax+2*ebx];[eax] must aligned 16
-	psadbw xmm1,   xmm2 
-	paddw  xmm7,   xmm1	
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1
 	movdqu xmm1,   [ecx+edi]
 	MOVDQ  xmm2,   [eax+esi]
 	psadbw xmm1,   xmm2
@@ -1265,17 +1265,17 @@
 WelsSampleSad16x16_sse2:
 	push ebx
 	push edi
-	push esi	
-		
+	push esi
+
 	%define _STACK_SIZE		12
-	
+
 	mov eax, [esp+_STACK_SIZE+4 ]
 	mov	ebx, [esp+_STACK_SIZE+8 ]
 	lea esi, [3*ebx]
 	mov ecx, [esp+_STACK_SIZE+12]
-	mov edx, [esp+_STACK_SIZE+16]	
-	lea edi, [3*edx]	
-	
+	mov edx, [esp+_STACK_SIZE+16]
+	lea edi, [3*edx]
+
 	pxor   xmm7,   xmm7
 	SSE2_GetSad4x16
 	lea   eax,    [eax+4*ebx]
@@ -1290,14 +1290,14 @@
 	movhlps xmm0, xmm7
 	paddw xmm0, xmm7
 	movd eax, xmm0
-	
-	%undef _STACK_SIZE	
-	
+
+	%undef _STACK_SIZE
+
 	pop esi
 	pop edi
 	pop ebx
 	ret
-   
+
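
WelsSampleSad16x16_sse2 is a straight psadbw accumulation: each SSE2_GetSad4x16 call folds four 16-byte rows into two 64-bit partial sums in xmm7, and the final movhlps/paddw collapses them into eax. The scalar reference is simply:

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for the psadbw-based 16x16 SAD above. */
static int32_t Sad16x16(const uint8_t *p1, int s1, const uint8_t *p2, int s2)
{
    int32_t sad = 0;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            sad += abs(p1[y * s1 + x] - p2[y * s2 + x]);
    return sad;
}
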
 ;***********************************************************************
 ;
 ;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
@@ -1312,10 +1312,10 @@
 	mov    eax,    [esp+8]
 	mov    ebx,    [esp+12]
 	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]    
+	mov    edx,    [esp+20]
 	movdqu xmm0,   [ecx]
 	MOVDQ  xmm2,   [eax]
-	psadbw xmm0,   xmm2 
+	psadbw xmm0,   xmm2
 	movdqu xmm1,   [ecx+edx]
 	MOVDQ  xmm2,   [eax+ebx]
 	psadbw xmm1,   xmm2
@@ -1339,19 +1339,19 @@
 	mov    eax,    [esp+8]
 	mov    ebx,    [esp+12]
 	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]    
+	mov    edx,    [esp+20]
     pxor   xmm6,   xmm6
-	
+
 	SSE2_GetSad8x4
     lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
-    SSE2_GetSad8x4    
+    SSE2_GetSad8x4
     lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	SSE2_GetSad8x4
     lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
-    SSE2_GetSad8x4    
+    SSE2_GetSad8x4
 
     movhlps    xmm0, xmm6
 	paddw      xmm0, xmm6
@@ -1375,15 +1375,15 @@
 	push   edi
 	mov    eax,    [esp+12]
 	mov    ebx,    [esp+16]
-    
+
     pxor   xmm7,   xmm7
-    
+
     mov    edi,    ecx
     and    edi,    0x07
-    sub    ecx,    edi   
+    sub    ecx,    edi
     mov    edx,    8
     sub    edx,    edi
-    
+
     shl    edi,    3
     shl    edx,    3
     movd   xmm5,   edi
@@ -1391,10 +1391,10 @@
 	mov    edi,    8
 	add    edi,    ecx
     mov    edx,    [esp+24]
-    
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -1402,17 +1402,17 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -1420,7 +1420,7 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
 
@@ -1427,10 +1427,10 @@
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -1438,17 +1438,17 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -1456,10 +1456,10 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
     movhlps    xmm0, xmm7
 	paddw      xmm0, xmm7
 	movd       eax,  xmm0
@@ -1469,12 +1469,12 @@
     push   ebx
     mov    eax,    [esp+8]
 	mov    ebx,    [esp+12]
-	mov    edx,    [esp+20]    
+	mov    edx,    [esp+20]
 	pxor   xmm6,   xmm6
 	SSE2_GetSad8x4
     lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
-    SSE2_GetSad8x4    
+    SSE2_GetSad8x4
     movhlps    xmm0, xmm6
 	paddw      xmm0, xmm6
 	movd       eax,  xmm0
@@ -1485,7 +1485,7 @@
 
 ;***********************************************************************
 ;
-;Pixel_sad_wxh_sse2 END 
+;Pixel_sad_wxh_sse2 END
 ;
 ;***********************************************************************
 
@@ -1492,7 +1492,7 @@
 
 ;***********************************************************************
 ;
-;Pixel_sad_4_wxh_sse2 BEGIN 
+;Pixel_sad_4_wxh_sse2 BEGIN
 ;
 ;***********************************************************************
 
@@ -1525,20 +1525,20 @@
 	movdqu xmm3,   [ecx]
 	psadbw xmm3,   xmm0
 	paddw  xmm4,   xmm3
-	
+
 	movdqa xmm1,   [eax+ebx]
 	movdqu xmm3,   [ecx+edx]
 	psadbw xmm3,   xmm1
 	paddw  xmm4,   xmm3
-	
+
 	movdqu xmm2,   [ecx+edx-1]
 	psadbw xmm2,   xmm0
 	paddw  xmm6,   xmm2
-	
+
 	movdqu xmm3,   [ecx+edx+1]
 	psadbw xmm3,   xmm0
 	paddw  xmm7,   xmm3
-	
+
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	movdqa xmm2,   [eax]
@@ -1599,30 +1599,30 @@
 	movdqu xmm3,   [ecx]
 	psadbw xmm2,   xmm3
 	paddw xmm5,   xmm2
-	
+
 	movdqu xmm2,   [ecx-1]
 	psadbw xmm2,   xmm0
 	paddw xmm6,   xmm2
-	
+
 	movdqu xmm3,   [ecx+1]
 	psadbw xmm3,   xmm0
 	paddw xmm7,   xmm3
-	
+
 	movdqu xmm3,   [ecx+edx]
 	psadbw xmm0,   xmm3
 	paddw xmm5,   xmm0
-	
+
 	mov        ecx,  [esp+24]
 	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0 
+	paddw      xmm4, xmm0
 	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0 
+	paddw      xmm5, xmm0
 	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0 
+	paddw      xmm6, xmm0
 	movhlps    xmm0, xmm7
 	paddw      xmm7, xmm0
 	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7 
+	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [ecx],xmm4
 	pop  ebx
@@ -1646,20 +1646,20 @@
 	movdqu xmm3,   [edi]
 	psadbw xmm3,   xmm0
 	paddw xmm4,   xmm3
-	
+
 	movdqa xmm1,   [eax+ebx]
 	movdqu xmm3,   [edi+edx]
 	psadbw xmm3,   xmm1
 	paddw xmm4,   xmm3
-	
+
 	movdqu xmm2,   [edi+edx-1]
 	psadbw xmm2,   xmm0
 	paddw xmm6,   xmm2
-	
+
 	movdqu xmm3,   [edi+edx+1]
 	psadbw xmm3,   xmm0
 	paddw xmm7,   xmm3
-	
+
 	lea    eax,    [eax+2*ebx]
 	lea    edi,    [edi+2*edx]
 	movdqa xmm2,   [eax]
@@ -1688,36 +1688,36 @@
 	movdqu xmm3,   [edi]
 	psadbw xmm0,   xmm3
 	paddw xmm5,   xmm0
-	
+
 	movdqu xmm0,   [edi-1]
 	psadbw xmm0,   xmm1
 	paddw xmm6,   xmm0
-	
+
 	movdqu xmm3,   [edi+1]
 	psadbw xmm3,   xmm1
 	paddw xmm7,   xmm3
-	
+
 	movdqu xmm3,   [edi+edx]
 	psadbw xmm1,   xmm3
 	paddw xmm5,   xmm1
-	
+
 	mov        edi,  [esp+28]
 	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0 
+	paddw      xmm4, xmm0
 	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0 
+	paddw      xmm5, xmm0
 	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0 
+	paddw      xmm6, xmm0
 	movhlps    xmm0, xmm7
 	paddw      xmm7, xmm0
 	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7 
+	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [edi],xmm4
 	pop  edi
 	pop  ebx
 	ret
-	
+
 WELS_EXTERN WelsSampleSadFour8x16_sse2
 WelsSampleSadFour8x16_sse2:
 	push ebx
@@ -1737,10 +1737,10 @@
 	movhps xmm3,   [edi+edx]
 	psadbw xmm3,   xmm0
 	paddw  xmm4,   xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
@@ -1749,191 +1749,191 @@
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	mov        edi,  [esp+28]
 	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0 
+	paddw      xmm4, xmm0
 	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0 
+	paddw      xmm5, xmm0
 	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0 
+	paddw      xmm6, xmm0
 	movhlps    xmm0, xmm7
 	paddw      xmm7, xmm0
 	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7 
+	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [edi],xmm4
 	pop  edi
 	pop  ebx
 	ret
-	
-	
+
+
 WELS_EXTERN WelsSampleSadFour8x8_sse2
 WelsSampleSadFour8x8_sse2:
 	push ebx
@@ -1953,10 +1953,10 @@
 	movhps xmm3,   [edi+edx]
 	psadbw xmm3,   xmm0
 	paddw  xmm4,   xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
@@ -1965,99 +1965,99 @@
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
-	
+
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	mov        edi,  [esp+28]
 	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0 
+	paddw      xmm4, xmm0
 	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0 
+	paddw      xmm5, xmm0
 	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0 
+	paddw      xmm6, xmm0
 	movhlps    xmm0, xmm7
 	paddw      xmm7, xmm0
 	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7 
+	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [edi],xmm4
 	pop  edi
 	pop  ebx
 	ret
-	
+
 WELS_EXTERN WelsSampleSadFour4x4_sse2
 WelsSampleSadFour4x4_sse2:
 	push ebx
@@ -2080,23 +2080,23 @@
 	punpckldq  xmm1, xmm2
 	movd       xmm2, [edi+edx-1]
 	movd       xmm3, [edi+edx+1]
-	
+
 	lea        edi,  [edi+2*edx]
-	
+
 	movd       xmm4, [edi]
 	movd       xmm5, [edi-1]
 	punpckldq  xmm2, xmm5
 	movd       xmm5, [edi+1]
 	punpckldq  xmm3, xmm5
-	
+
 	movd       xmm5, [edi+edx]
 	punpckldq  xmm4, xmm5
-	
+
 	punpcklqdq xmm1, xmm4 ;-L
-	
+
 	movd       xmm5, [edi+edx-1]
 	movd       xmm6, [edi+edx+1]
-	
+
 	lea        edi,  [edi+2*edx]
 	movd       xmm7, [edi-1]
 	punpckldq  xmm5, xmm7
@@ -2107,12 +2107,12 @@
 	movd       xmm6, [edi]
 	movd       xmm7, [edi+edx]
 	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6 ;+L 
+	punpcklqdq xmm4, xmm6 ;+L
 	psadbw     xmm1, xmm0
 	psadbw     xmm2, xmm0
 	psadbw     xmm3, xmm0
 	psadbw     xmm4, xmm0
-	
+
 	movhlps    xmm0, xmm1
 	paddw      xmm1, xmm0
 	movhlps    xmm0, xmm2
@@ -2123,13 +2123,13 @@
 	paddw      xmm4, xmm0
 	mov        edi,  [esp+28]
 	punpckldq  xmm1, xmm4
-	punpckldq  xmm2, xmm3 
+	punpckldq  xmm2, xmm3
 	punpcklqdq xmm1, xmm2
 	movdqa     [edi],xmm1
 	pop  edi
 	pop  ebx
 	ret
-	
+
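
The WelsSampleSadFour* routines above compute, in one pass over the current block, the SADs against four candidate positions around a search centre; judging by the [ref±stride] and [ref±1] loads and the final punpckldq/punpcklqdq packing, the output order is up, down, left, right. A scalar sketch under that assumption (names illustrative):

#include <stdint.h>
#include <stdlib.h>

static int32_t SadWxH(const uint8_t *p1, int s1,
                      const uint8_t *p2, int s2, int w, int h)
{
    int32_t sad = 0;
    for (int y = 0; y < h; y++)
        for (int x = 0; x < w; x++)
            sad += abs(p1[y * s1 + x] - p2[y * s2 + x]);
    return sad;
}

/* Four SADs around the search centre "ref", written in the packed order
 * the assembly appears to use (an assumption from the load offsets). */
static void SadFour(const uint8_t *cur, int cs, const uint8_t *ref, int rs,
                    int w, int h, int32_t psad[4])
{
    psad[0] = SadWxH(cur, cs, ref - rs, rs, w, h); /* up    */
    psad[1] = SadWxH(cur, cs, ref + rs, rs, w, h); /* down  */
    psad[2] = SadWxH(cur, cs, ref - 1,  rs, w, h); /* left  */
    psad[3] = SadWxH(cur, cs, ref + 1,  rs, w, h); /* right */
}
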
 ;***********************************************************************
 ;
 ;Pixel_sad_4_wxh_sse2 END
@@ -2150,40 +2150,40 @@
 %define pix2address  esp+pushsize+12
 %define pix2stride   esp+pushsize+16
 
-    mov		  eax, [pix1address]    
-    mov		  ebx, [pix1stride ]    
-    mov		  ecx, [pix2address]    
-    mov		  edx, [pix2stride ]    
+    mov		  eax, [pix1address]
+    mov		  ebx, [pix1stride ]
+    mov		  ecx, [pix2address]
+    mov		  edx, [pix2stride ]
 
 	movd	  mm0, [eax]
 	movd	  mm1, [eax+ebx]
 	punpckldq mm0, mm1
-	
+
 	movd      mm3, [ecx]
 	movd      mm4, [ecx+edx]
 	punpckldq mm3, mm4
 	psadbw    mm0, mm3
-	
+
 	lea       eax, [eax+2*ebx]
 	lea       ecx, [ecx+2*edx]
-	
+
 	movd      mm1, [eax]
 	movd      mm2, [eax+ebx]
 	punpckldq mm1, mm2
-	
+
 	movd      mm3, [ecx]
 	movd      mm4, [ecx+edx]
 	punpckldq mm3, mm4
 	psadbw    mm1, mm3
 	paddw     mm0, mm1
-	
+
     movd      eax, mm0
 
 	WELSEMMS
     pop ebx
-%undef pushsize     
-%undef pix1address	
-%undef pix1stride   
-%undef pix2address  
-%undef pix2stride   
+%undef pushsize
+%undef pix1address
+%undef pix1stride
+%undef pix2address
+%undef pix2stride
     ret
\ No newline at end of file
--- a/codec/encoder/core/asm/score.asm
+++ b/codec/encoder/core/asm/score.asm
@@ -45,7 +45,7 @@
 bits 32
 
 ;***********************************************************************
-; Macros 
+; Macros
 ;***********************************************************************
 
 ;***********************************************************************
@@ -59,7 +59,7 @@
 sse2_1: dw 1, 1, 1, 1, 1, 1, 1, 1
 align 16
 sse2_b1: db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 
+i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 align 16
 sse2_plane_inc_minus: dw -7, -6, -5, -4, -3, -2, -1, 0
 align 16
@@ -139,7 +139,7 @@
     db  4, 8, 5, 8, 8,12, 1, 4, 4, 8
     db  4, 7, 7,11, 4, 8, 7,11, 8,11
     db 11,15, 1, 4, 3, 7, 4, 7, 7,11
-    db  3, 7, 6,10, 7,10,10,14, 4, 7 
+    db  3, 7, 6,10, 7,10,10,14, 4, 7
     db  7,11, 7,10,10,14, 7,11,10,14
     db 11,14,14,18, 0, 4, 3, 7, 3, 6
     db  6,10, 3, 7, 6,10, 7,10,10,14
@@ -191,7 +191,7 @@
 	movdqa     [eax],xmm0
 	movdqa     [eax+16], xmm1
 	ret
-	
+
 ;***********************************************************************
 ;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
 ;***********************************************************************
@@ -206,7 +206,7 @@
 	pinsrw		xmm0, eax, 7			; xmm0[7]	=	[8]
 	pinsrw		xmm1, ecx, 0			; xmm1[0]	=	[7]
 	pshufb		xmm1, [pb_scanacdc_maskb]
-	pshufb		xmm0, [pb_scanacdc_maska]	
+	pshufb		xmm0, [pb_scanacdc_maska]
 
 	mov        eax,  [esp+4]
 	movdqa     [eax],xmm0
@@ -224,7 +224,7 @@
 	movdqa     xmm2, xmm0
 	punpcklqdq xmm0, xmm1
 	punpckhqdq xmm2, xmm1
-	
+
 	movdqa     xmm3, xmm0
 	punpckldq  xmm0, xmm2
 	punpckhdq  xmm3, xmm2
@@ -236,10 +236,10 @@
 	pextrw     edx,  xmm3, 0
 	pinsrw     xmm3, eax,  0
 	pinsrw     xmm0, edx,  3
-	
+
 	pshufhw    xmm1, xmm0, 0x93
 	pshuflw    xmm2, xmm3, 0x39
-    
+
     movdqa     xmm3, xmm2
     psrldq     xmm1, 2
     pslldq     xmm3, 14
@@ -255,13 +255,13 @@
 ;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
 ;***********************************************************************
 ALIGN 16
-WELS_EXTERN WelsCalculateSingleCtr4x4_sse2 
+WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
 WelsCalculateSingleCtr4x4_sse2:
 	push      ebx
 	mov       eax,  [esp+8]
 	movdqa    xmm0, [eax]
 	movdqa    xmm1, [eax+16]
-	
+
 	packsswb  xmm0, xmm1
 
     pxor      xmm3, xmm3
@@ -317,7 +317,7 @@
 	and       edx,  0xff
 	shr       ecx,  8
 ;	and       ecx,  0xff	; not needed since the high 16 bits are already 0
-	xor       eax,  eax	
+	xor       eax,  eax
 	add       al,  [nozero_count_table+ecx]
 	add       al,  [nozero_count_table+edx]
 	ret
--- a/codec/encoder/core/asm/vaa.asm
+++ b/codec/encoder/core/asm/vaa.asm
@@ -38,7 +38,7 @@
 ;*      04/14/2010	Created
 ;*		06/07/2010	Added AnalysisVaaInfoIntra_sse2(ssse3)
 ;*		06/10/2010	Tune rc_sad_frame_sse2 and got about 40% improvement
-;*		08/11/2010	Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2 
+;*		08/11/2010	Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
 ;*
 ;*************************************************************************/
 %include "asm_inc.asm"
@@ -167,7 +167,7 @@
 	mov ebp, esp
 	and ebp, 0fh
 	sub esp, ebp
-	sub esp, 32	
+	sub esp, 32
 	%define PUSH_SIZE	52	; 20 + 32
 
 	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
@@ -179,31 +179,31 @@
 	add edx, ecx		; iLineSize x 3 [edx]
 	mov eax, ebx
 	sal eax, $1			; iLineSize x 4 [eax]
-	
+
 	pxor xmm7, xmm7
-	
+
 	; loops
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0	
+	movq [esp], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+8], xmm0	
+	movq [esp+8], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0	
+	movq [esp+16], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
 	movq [esp+24], xmm0
-		
+
 	movdqa xmm0, [esp]		; block 0~7
 	movdqa xmm1, [esp+16]	; block 8~15
 	movdqa xmm2, xmm0
 	paddw xmm0, xmm1
 	SUM_WORD_8x2_SSE2 xmm0, xmm3
-	
+
 	pmullw xmm1, xmm1
 	pmullw xmm2, xmm2
 	movdqa xmm3, xmm1
@@ -219,7 +219,7 @@
 	paddd xmm1, xmm2
 	pshufd xmm2, xmm1, 0B1h
 	paddd xmm1, xmm2
-	
+
 	movd ebx, xmm0
 	and ebx, 0ffffh		; effective low word truncated
 	mov ecx, ebx
@@ -227,7 +227,7 @@
 	sar ebx, $4
 	movd eax, xmm1
 	sub eax, ebx
-	
+
 	%undef PUSH_SIZE
 	add esp, 32
 	add esp, ebp
@@ -253,7 +253,7 @@
 	mov ebp, esp
 	and ebp, 0fh
 	sub esp, ebp
-	sub esp, 32	
+	sub esp, 32
 	%define PUSH_SIZE	52	; 20 + 32
 
 	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
@@ -265,25 +265,25 @@
 	add edx, ecx		; iLineSize x 3 [edx]
 	mov eax, ebx
 	sal eax, $1			; iLineSize x 4 [eax]
-	
+
 	pxor xmm7, xmm7
-	
+
 	; loops
 	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0	
+	movq [esp], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
-	movq [esp+8], xmm1	
+	movq [esp+8], xmm1
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0	
+	movq [esp+16], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
 	movq [esp+24], xmm1
-		
+
 	movdqa xmm0, [esp]		; block 0~7
 	movdqa xmm1, [esp+16]	; block 8~15
 	movdqa xmm2, xmm0
@@ -305,7 +305,7 @@
 	paddd xmm1, xmm2
 	pshufd xmm2, xmm1, 0B1h
 	paddd xmm1, xmm2
-	
+
 	movd ebx, xmm0
 	and ebx, 0ffffh		; effective low word truncated
 	mov ecx, ebx
@@ -313,7 +313,7 @@
 	sar ebx, $4
 	movd eax, xmm1
 	sub eax, ebx
-	
+
 	%undef PUSH_SIZE
 	add esp, 32
 	add esp, ebp
@@ -323,7 +323,7 @@
 	pop edx
 	pop ebx
 	ret
-	
+
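
Both AnalysisVaaInfoIntra variants end the same way: with S the sum of the 16 sub-block averages gathered on the stack and Q the sum of their squares, the returned value is Q - S*S/16, i.e. 16 times the variance of those averages (the closing imul / sar $4 / sub sequence). A scalar sketch, assuming v[] holds the 16 values written by the VAA_AVG_BLOCK macros:

#include <stdint.h>

/* Activity measure: Q - S*S/16 over the 16 sub-block values (assumption
 * about their meaning; the reduction itself matches the assembly). */
static int32_t VaaActivity(const uint16_t v[16])
{
    int32_t s = 0, q = 0;
    for (int i = 0; i < 16; i++) {
        s += v[i];
        q += (int32_t)v[i] * v[i];
    }
    return q - ((s * s) >> 4);
}
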
 WELS_EXTERN MdInterAnalysisVaaInfo_sse41
 ;***********************************************************************
 ;	uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
@@ -331,11 +331,11 @@
 ALIGN 16
 MdInterAnalysisVaaInfo_sse41:
 	mov eax, [esp+4]
-	movdqa xmm0, [eax]	; load 4 sad_8x8	
+	movdqa xmm0, [eax]	; load 4 sad_8x8
 	pshufd xmm1, xmm0, 01Bh
 	paddd xmm1, xmm0
 	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2	
+	paddd xmm1, xmm2
 	psrad xmm1, 02h		; iAverageSad
 	movdqa xmm2, xmm1
 	psrad xmm2, 06h
@@ -342,7 +342,7 @@
 	movdqa xmm3, xmm0	; iSadBlock
 	psrad xmm3, 06h
 	psubd xmm3, xmm2
-	pmulld xmm3, xmm3	; [comment]: pmulld from SSE4.1 instruction sets	
+	pmulld xmm3, xmm3	; [comment]: pmulld from SSE4.1 instruction sets
 	pshufd xmm4, xmm3, 01Bh
 	paddd xmm4, xmm3
 	pshufd xmm3, xmm4, 0B1h
@@ -354,7 +354,7 @@
 	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
 	movmskps eax, xmm0
 	ret
-.threshold_exit:	
+.threshold_exit:
 	mov eax, 15
 	ret
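
MdInterAnalysisVaaInfo_sse41 averages the four 8x8 SADs, measures how much they deviate from that average (the pmulld line squares the per-block deviations), and either bails out with 15 or returns a 4-bit movmskps mask of the blocks whose SAD exceeds the average. The threshold comparison itself sits outside the shown hunks, so the branch sense and threshold parameter below are assumptions:

#include <stdint.h>

static uint8_t MdInterAnalysisVaaInfo(const int32_t sad8x8[4],
                                      int32_t threshold /* assumption */)
{
    int32_t avg = (sad8x8[0] + sad8x8[1] + sad8x8[2] + sad8x8[3]) >> 2;
    int32_t dev = 0;
    for (int i = 0; i < 4; i++) {
        int32_t d = (sad8x8[i] >> 6) - (avg >> 6);
        dev += d * d;                 /* pmulld path in the SSE4.1 code */
    }
    if (dev <= threshold)
        return 15;                    /* .threshold_exit */
    uint8_t mask = 0;
    for (int i = 0; i < 4; i++)
        if (sad8x8[i] > avg)
            mask |= (uint8_t)(1u << i); /* movmskps of pcmpgtd result */
    return mask;
}
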
 
@@ -365,11 +365,11 @@
 ALIGN 16
 MdInterAnalysisVaaInfo_sse2:
 	mov eax, [esp+4]
-	movdqa xmm0, [eax]	; load 4 sad_8x8	
+	movdqa xmm0, [eax]	; load 4 sad_8x8
 	pshufd xmm1, xmm0, 01Bh
 	paddd xmm1, xmm0
 	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2	
+	paddd xmm1, xmm2
 	psrad xmm1, 02h		; iAverageSad
 	movdqa xmm2, xmm1
 	psrad xmm2, 06h
@@ -376,9 +376,9 @@
 	movdqa xmm3, xmm0	; iSadBlock
 	psrad xmm3, 06h
 	psubd xmm3, xmm2
-	
+
 	; to replace pmulld functionality as below
-	movdqa xmm2, xmm3	
+	movdqa xmm2, xmm3
 	pmuludq xmm2, xmm3
 	pshufd xmm4, xmm3, 0B1h
 	pmuludq xmm4, xmm4
@@ -385,8 +385,8 @@
 	movdqa xmm5, xmm2
 	punpckldq xmm5, xmm4
 	punpckhdq xmm2, xmm4
-	punpcklqdq xmm5, xmm2	
-	
+	punpcklqdq xmm5, xmm2
+
 	pshufd xmm4, xmm5, 01Bh
 	paddd xmm4, xmm5
 	pshufd xmm5, xmm4, 0B1h
@@ -398,6 +398,6 @@
 	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
 	movmskps eax, xmm0
 	ret
-.threshold_exit:	
+.threshold_exit:
 	mov eax, 15
 	ret
--- a/codec/encoder/plus/res/welsenc.rc
+++ b/codec/encoder/plus/res/welsenc.rc
@@ -27,18 +27,18 @@
 // TEXTINCLUDE
 //
 
-1 TEXTINCLUDE 
+1 TEXTINCLUDE
 BEGIN
     "resource.h\0"
 END
 
-2 TEXTINCLUDE 
+2 TEXTINCLUDE
 BEGIN
     "#include ""windows.h""\r\n"
     "\0"
 END
 
-3 TEXTINCLUDE 
+3 TEXTINCLUDE
 BEGIN
     "\r\n"
     "\0"
--- a/processing/build/linux/makefile
+++ b/processing/build/linux/makefile
@@ -1,94 +1,94 @@
-NASM = 1
-NAME      = libwelsvp
-
-OUTDIR    = ../../../bin/linux
-BINDIR    = ../../bin
-OBJDIR    = ../../obj  
-SRCDIRS   = ../../src/asm \
-            ../../src/common \
-            ../../src/adaptivequantization \
-            ../../src/backgounddetection \
-            ../../src/denoise \
-            ../../src/downsample \
-            ../../src/scenechangedetection \
-            ../../src/vaacalc \
-            ../../src/complexityanalysis 
-SRCDIRS  += ../../src/imagerotate
-
-
-TARGETLIB =  $(BINDIR)/$(NAME).so
-
-CC        = $(shell which gcc)
-AS        = $(shell which nasm)
-GCC       = gcc -m32
-
-CPPFLAGS  = -Wall -g -O3
-ifeq ($(NASM), 1)
-CPPFLAGS += -DX86_ASM
-endif
-ASMFLAGS  = -f elf -DNOPREFIX  -I ../../src/asm/
-LDFLAGS   = -lstdc++ -ldl
-          
-SRCEXTS  = .cpp
-ifeq ($(NASM), 1)
-SRCEXTS += .asm
-endif
-HDREXTS  = .h
-SOURCES  = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS))))
-HEADERS  = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(HDREXTS))))
-SRC_CPP  = $(filter %.cpp,$(SOURCES))
-SRC_ASM  = $(filter %.asm,$(SOURCES))
-OBJS     = $(addsuffix .o, $(basename $(SOURCES)))
-DEPS     = $(OBJS:.o=.d)
-
-DEP_OPT  = $(shell if `$(CC) --version | grep "GCC" >/dev/null`; then \
-                  echo "-MM -MP"; else echo "-M"; fi )
-DEPEND_cpp.d  = $(subst -g ,,$(CC) $(DEP_OPT) $(CPPFLAGS))
-DEPEND_asm.d  = $(subst -g ,,$(AS) $(DEP_OPT) $(ASMFLAGS))
-COMPILE.cpp   = $(GCC) $(CPPFLAGS) -c
-COMPILE.asm   = $(AS)  $(ASMFLAGS)
-LINK          = $(GCC) $(LDFLAGS)
-
-.PHONY: all objs tags ctags clean distclean
-
-.SUFFIXES:
-
-all: $(TARGETLIB)
-	
-%.d:%.cpp
-	@echo -n $(dir $<) > $@
-	@$(DEPEND_cpp.d) $< >> $@
-	
-%.d:%.asm
-	@echo -n $(dir $<) > $@
-	@$(DEPEND_asm.d) $< >> $@
-
-objs:$(OBJS)
-
-%.o:%.cpp
-	$(COMPILE.cpp) $< -o $@
-	
-%.o:%.asm
-	$(COMPILE.asm) $< -o $@	
-
-tags: $(HEADERS) $(SOURCES)
-	etags $(HEADERS) $(SOURCES)
-
-ctags: $(HEADERS) $(SOURCES)
-	ctags $(HEADERS) $(SOURCES)
-
-$(TARGETLIB):$(OBJS)
-	@if test ! -d $(BINDIR) ; then mkdir -p $(BINDIR) ; fi
-	$(LINK) $(OBJS) -shared -Wl,-Bsymbolic -o $@
-	@echo produce the lib to $(TARGETLIB).
-	@if test ! -d $(OUTDIR) ; then mkdir -p $(OUTDIR) ; fi
-	@cp -f $(TARGETLIB) $(OUTDIR)
-	@cp -f $(TARGETLIB) ../../../testbin
-	@echo copy the lib to $(OUTDIR).
-
-clean:
-	rm -f $(OBJS) $(TARGETLIB)
-
-distclean: clean
-	rm -f $(DEPS) TAGS
-
+NASM = 1
+NAME      = libwelsvp
+
+OUTDIR    = ../../../bin/linux
+BINDIR    = ../../bin
+OBJDIR    = ../../obj
+SRCDIRS   = ../../src/asm \
+            ../../src/common \
+            ../../src/adaptivequantization \
+            ../../src/backgounddetection \
+            ../../src/denoise \
+            ../../src/downsample \
+            ../../src/scenechangedetection \
+            ../../src/vaacalc \
+            ../../src/complexityanalysis
+SRCDIRS  += ../../src/imagerotate
+
+
+TARGETLIB =  $(BINDIR)/$(NAME).so
+
+CC        = $(shell which gcc)
+AS        = $(shell which nasm)
+GCC       = gcc -m32
+
+CPPFLAGS  = -Wall -g -O3
+ifeq ($(NASM), 1)
+CPPFLAGS += -DX86_ASM
+endif
+ASMFLAGS  = -f elf -DNOPREFIX  -I ../../src/asm/
+LDFLAGS   = -lstdc++ -ldl
+
+SRCEXTS  = .cpp
+ifeq ($(NASM), 1)
+SRCEXTS += .asm
+endif
+HDREXTS  = .h
+SOURCES  = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS))))
+HEADERS  = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(HDREXTS))))
+SRC_CPP  = $(filter %.cpp,$(SOURCES))
+SRC_ASM  = $(filter %.asm,$(SOURCES))
+OBJS     = $(addsuffix .o, $(basename $(SOURCES)))
+DEPS     = $(OBJS:.o=.d)
+
+DEP_OPT  = $(shell if `$(CC) --version | grep "GCC" >/dev/null`; then \
+                  echo "-MM -MP"; else echo "-M"; fi )
+DEPEND_cpp.d  = $(subst -g ,,$(CC) $(DEP_OPT) $(CPPFLAGS))
+DEPEND_asm.d  = $(subst -g ,,$(AS) $(DEP_OPT) $(ASMFLAGS))
+COMPILE.cpp   = $(GCC) $(CPPFLAGS) -c
+COMPILE.asm   = $(AS)  $(ASMFLAGS)
+LINK          = $(GCC) $(LDFLAGS)
+
+.PHONY: all objs tags ctags clean distclean
+
+.SUFFIXES:
+
+all: $(TARGETLIB)
+
+%.d:%.cpp
+	@echo -n $(dir $<) > $@
+	@$(DEPEND_cpp.d) $< >> $@
+
+%.d:%.asm
+	@echo -n $(dir $<) > $@
+	@$(DEPEND_asm.d) $< >> $@
+
+objs:$(OBJS)
+
+%.o:%.cpp
+	$(COMPILE.cpp) $< -o $@
+
+%.o:%.asm
+	$(COMPILE.asm) $< -o $@
+
+tags: $(HEADERS) $(SOURCES)
+	etags $(HEADERS) $(SOURCES)
+
+ctags: $(HEADERS) $(SOURCES)
+	ctags $(HEADERS) $(SOURCES)
+
+$(TARGETLIB):$(OBJS)
+	@if test ! -d $(BINDIR) ; then mkdir -p $(BINDIR) ; fi
+	$(LINK) $(OBJS) -shared -Wl,-Bsymbolic -o $@
+	@echo produce the lib to $(TARGETLIB).
+	@if test ! -d $(OUTDIR) ; then mkdir -p $(OUTDIR) ; fi
+	@cp -f $(TARGETLIB) $(OUTDIR)
+	@cp -f $(TARGETLIB) ../../../testbin
+	@echo copy the lib to $(OUTDIR).
+
+clean:
+	rm -f $(OBJS) $(TARGETLIB)
+
+distclean: clean
+	rm -f $(DEPS) TAGS
+
--- a/processing/src/asm/asm_inc.asm
+++ b/processing/src/asm/asm_inc.asm
@@ -43,7 +43,7 @@
 ; Options, for DEBUG
 ;***********************************************************************
 
-%if 1 
+%if 1
 	%define MOVDQ movdqa
 %else
 	%define MOVDQ movdqu
@@ -58,7 +58,7 @@
 BITS 32
 
 ;***********************************************************************
-; Macros 
+; Macros
 ;***********************************************************************
 
 %macro WELS_EXTERN 1
@@ -74,7 +74,7 @@
 	pxor        %2, %2
     psubw       %2, %1
     pmaxsw      %1, %2
-%endmacro 	
+%endmacro
 
 %macro MMX_XSwap  4
     movq		%4, %2
@@ -105,7 +105,7 @@
     SSE2_XSawp qdq, %5, %2, %3
 %endmacro
 
-;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4 
+;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4
 %macro SSE2_TransTwo4x4W 5
     SSE2_XSawp wd,  %1, %2, %5
     SSE2_XSawp wd,  %3, %4, %2
@@ -125,26 +125,26 @@
 	movdqa	%6, %9
 	movdqa	%9, %4
 	SSE2_XSawp bw,  %7, %6, %4
-	
-	SSE2_XSawp wd,  %1, %3, %6	
+
+	SSE2_XSawp wd,  %1, %3, %6
 	SSE2_XSawp wd,  %8, %2, %3
 	SSE2_XSawp wd,  %5, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %3	
+	movdqa	%9, %3
 	SSE2_XSawp wd,  %7, %4, %3
-	
-	SSE2_XSawp dq,  %1, %5, %4	
+
+	SSE2_XSawp dq,  %1, %5, %4
 	SSE2_XSawp dq,  %6, %2, %5
 	SSE2_XSawp dq,  %8, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %5		
+	movdqa	%9, %5
 	SSE2_XSawp dq,  %7, %3, %5
-	
+
 	SSE2_XSawp qdq,  %1, %8, %3
 	SSE2_XSawp qdq,  %4, %2, %8
 	SSE2_XSawp qdq,  %6, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %1		
+	movdqa	%9, %1
 	SSE2_XSawp qdq,  %7, %5, %1
 	movdqa	%5, %9
 %endmacro
@@ -170,9 +170,9 @@
 %macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
 	mov %3h, %3l
 	movd %1, e%3x		; i.e. %1 = eax (=b0)
-	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0	
-	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0	
-%endmacro  
+	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
+	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
 
 ;copy a dw into a xmm for 8 times
 %macro  SSE2_Copy8Times 2
--- a/processing/src/asm/cpuid.asm
+++ b/processing/src/asm/cpuid.asm
@@ -84,12 +84,12 @@
 ;   void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
 ;****************************************************************************************************
 WelsCPUId:
-	push	ebx	
+	push	ebx
 	push	edi
-	
+
 	mov     eax, [esp+12]	; operating index
     cpuid					; cpuid
-	
+
 	; processing various information return
 	mov     edi, [esp+16]
     mov     [edi], eax
@@ -100,10 +100,10 @@
     mov     edi, [esp+28]
     mov     [edi], edx
 
-	pop		edi	
+	pop		edi
     pop     ebx
 	ret
-	
+
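
WelsCPUId is a thin cdecl wrapper around CPUID that preserves ebx/edi and stores the four result registers through the caller's pointers. A hedged C equivalent using GCC inline assembly (the real build uses the NASM version; 32-bit PIC builds would need extra care with ebx):

#include <stdint.h>

/* Run CPUID for leaf "idx" and report eax/ebx/ecx/edx to the caller. */
static void WelsCPUId_c(int32_t idx, int32_t *a, int32_t *b,
                        int32_t *c, int32_t *d)
{
    __asm__ volatile("cpuid"
                     : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
                     : "a"(idx));
}
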
 WELS_EXTERN WelsCPUSupportAVX
 ; need call after cpuid=1 and eax, ecx flag got then
 ALIGN 16
@@ -139,7 +139,7 @@
 WelsCPUSupportFMA:
 	mov eax, [esp+4]
 	mov ecx, [esp+8]
-	
+
 	; refer to detection of FMA addressed in INTEL AVX manual document
 	and ecx, 018001000H
 	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
@@ -153,7 +153,7 @@
 	mov eax, 1
 	ret
 fma_not_supported:
-	mov eax, 0	
+	mov eax, 0
 	ret
 
 WELS_EXTERN WelsEmms
--- a/processing/src/asm/denoisefilter.asm
+++ b/processing/src/asm/denoisefilter.asm
@@ -1,263 +1,263 @@
-;*!
-;* \copy
-;*     Copyright (c)  2010-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  predenoise.asm
-;*
-;*  Abstract
-;*      denoise for SVC2.1
-;*  History
-;*      4/13/2010 Created
-;*      7/30/2010 Modified
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Constant
-;***********************************************************************
-SECTION .rodata align=16
-
-sse2_32 times 8 dw 32
-sse2_20 times 8 dw 20
-
-
-BITS 32
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-	
-%macro	WEIGHT_LINE	9
-		movq		%2,	%9
-		punpcklbw	%2,	%7
-		movdqa		%8,	%2
-		
-		movdqa		%1,	%6
-		psubusb		%1,	%8
-		psubusb		%8,	%6
-		por			%8,	%1		; ABS(curPixel - centerPixel);
-		
-		movdqa		%1,	%3
-		psubusb		%1,	%8
-
-		pmullw		%1,	%1
-		psrlw		%1,	5
-		pmullw		%2,	%1		
-		paddusw		%4,	%1
-		paddusw		%5,	%2	
-%endmacro
-
-%macro	WEIGHT_LINE1_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-%endmacro
-
-%macro	WEIGHT_LINE2_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-%endmacro
-
-%macro	WEIGHT_LINE3_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		pmullw		%2,	[sse2_20]
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-%endmacro
-
-ALIGN 16
-WELS_EXTERN BilateralLumaFilter8_sse2
-;***********************************************************************
-;  BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
-;***********************************************************************
-;	1	2	3
-;	4	0	5
-;	6	7	8
-;	0:	the center point
-%define		pushsize	4
-%define		pixel		esp + pushsize + 4
-%define		stride		esp + pushsize + 8
-BilateralLumaFilter8_sse2:
-		push		ebx
-		
-		pxor		xmm7,	xmm7
-		mov			eax,	[pixel]
-		mov			ebx,	eax
-		movq		xmm6,	[eax]
-		punpcklbw	xmm6,	xmm7
-		movdqa		xmm3,	[sse2_32]
-		pxor		xmm4,	xmm4		; nTotWeight
-		pxor		xmm5,	xmm5		; nSum
-		
-		dec			eax
-		mov			ecx,	[stride]
-		
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 4
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 5
-		
-		sub			eax,	ecx
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 1
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 2
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 3
-		
-		lea			eax,	[eax + ecx * 2]
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 6
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 7
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 8
-		
-		pcmpeqw		xmm0,	xmm0
-		psrlw		xmm0,	15
-		psllw		xmm0,	8
-		psubusw		xmm0,	xmm4
-		pmullw		xmm0,	xmm6
-		paddusw		xmm5,	xmm0
-		psrlw		xmm5,	8
-		packuswb	xmm5,	xmm5
-		movq		[ebx],	xmm5		
-		
-		pop ebx
-		ret	
-
-WELS_EXTERN WaverageChromaFilter8_sse2
-;***********************************************************************
-; void		WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
-;***********************************************************************
-;5x5 filter:
-;1	1	2	1	1
-;1	2	4	2	1
-;2	4	20	4	2
-;1	2	4	2	1
-;1	1	2	1	1
-
-ALIGN 16
-WaverageChromaFilter8_sse2:
-		mov		edx,	[esp + 4]	; pixels
-		mov		ecx,	[esp + 8]	; stride
-		
-		mov		eax,	ecx
-		add		eax,	eax
-		sub		edx,	eax			; pixels - 2 * stride
-		sub		edx,	2
-			
-		pxor	xmm0,	xmm0	
-		pxor	xmm3,	xmm3
-	
-		movdqu		xmm1,	[edx]
-		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
-		
-		movdqu		xmm1,	[edx + ecx]
-		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0	
-		
-		add		edx,	eax	
-		movdqu		xmm1,	[edx]
-		WEIGHT_LINE3_UV	xmm1,	xmm2,	xmm3,	xmm0
-		
-		movdqu		xmm1,	[edx + ecx]
-		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0	
-		
-		movdqu		xmm1,	[edx + ecx * 2]
-		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0		
-	
-		psrlw		xmm3,		6
-		packuswb	xmm3,		xmm3
-		movq		[edx + 2],		xmm3			
-
-		ret	
\ No newline at end of file
+;*!
+;* \copy
+;*     Copyright (c)  2010-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  predenoise.asm
+;*
+;*  Abstract
+;*      denoise for SVC2.1
+;*  History
+;*      4/13/2010 Created
+;*      7/30/2010 Modified
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Constant
+;***********************************************************************
+SECTION .rodata align=16
+
+sse2_32 times 8 dw 32
+sse2_20 times 8 dw 20
+
+
+BITS 32
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+%macro	WEIGHT_LINE	9
+		movq		%2,	%9
+		punpcklbw	%2,	%7
+		movdqa		%8,	%2
+
+		movdqa		%1,	%6
+		psubusb		%1,	%8
+		psubusb		%8,	%6
+		por			%8,	%1		; ABS(curPixel - centerPixel);
+
+		movdqa		%1,	%3
+		psubusb		%1,	%8
+
+		pmullw		%1,	%1
+		psrlw		%1,	5
+		pmullw		%2,	%1
+		paddusw		%4,	%1
+		paddusw		%5,	%2
+%endmacro
+
+%macro	WEIGHT_LINE1_UV	4
+		movdqa		%2,	%1
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	1
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+%endmacro
+
+%macro	WEIGHT_LINE2_UV	4
+		movdqa		%2,	%1
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	1
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		psllw		%2,	2
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+%endmacro
+
+%macro	WEIGHT_LINE3_UV	4
+		movdqa		%2,	%1
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	1
+		punpcklbw	%2,	%4
+		psllw		%2,	2
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		pmullw		%2,	[sse2_20]
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		psllw		%2,	2
+		paddw		%3,	%2
+
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+%endmacro
+
+ALIGN 16
+WELS_EXTERN BilateralLumaFilter8_sse2
+;***********************************************************************
+;  BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+;	1	2	3
+;	4	0	5
+;	6	7	8
+;	0:	the center point
+%define		pushsize	4
+%define		pixel		esp + pushsize + 4
+%define		stride		esp + pushsize + 8
+BilateralLumaFilter8_sse2:
+		push		ebx
+
+		pxor		xmm7,	xmm7
+		mov			eax,	[pixel]
+		mov			ebx,	eax
+		movq		xmm6,	[eax]
+		punpcklbw	xmm6,	xmm7
+		movdqa		xmm3,	[sse2_32]
+		pxor		xmm4,	xmm4		; nTotWeight
+		pxor		xmm5,	xmm5		; nSum
+
+		dec			eax
+		mov			ecx,	[stride]
+
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 4
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 5
+
+		sub			eax,	ecx
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 1
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 2
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 3
+
+		lea			eax,	[eax + ecx * 2]
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 6
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 7
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 8
+
+		pcmpeqw		xmm0,	xmm0
+		psrlw		xmm0,	15
+		psllw		xmm0,	8
+		psubusw		xmm0,	xmm4
+		pmullw		xmm0,	xmm6
+		paddusw		xmm5,	xmm0
+		psrlw		xmm5,	8
+		packuswb	xmm5,	xmm5
+		movq		[ebx],	xmm5
+
+		pop ebx
+		ret
+
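
In BilateralLumaFilter8_sse2 above, each WEIGHT_LINE invocation gives one 3x3 neighbour a weight of ((32 - |p - centre|)^2) >> 5, clamped at zero via psubusb, and accumulates both the weight and the weighted pixel; the centre pixel then takes the remaining weight out of 256 before the >>8 normalisation. A scalar sketch of one output pixel, with illustrative names:

#include <stdint.h>
#include <stdlib.h>

/* One filtered pixel: c is the centre, nbr[8] the 3x3 neighbours. */
static uint8_t BilateralPixel(uint8_t c, const uint8_t nbr[8])
{
    uint32_t sum = 0, totw = 0;
    for (int i = 0; i < 8; i++) {
        int d = abs(nbr[i] - c);
        int w = d < 32 ? ((32 - d) * (32 - d)) >> 5 : 0; /* max 32 */
        sum  += (uint32_t)nbr[i] * w;
        totw += (uint32_t)w;
    }
    uint32_t wc = totw < 256 ? 256 - totw : 0; /* psubusw saturation */
    return (uint8_t)((sum + (uint32_t)c * wc) >> 8);
}
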
+WELS_EXTERN WaverageChromaFilter8_sse2
+;***********************************************************************
+; void		WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+;5x5 filter:
+;1	1	2	1	1
+;1	2	4	2	1
+;2	4	20	4	2
+;1	2	4	2	1
+;1	1	2	1	1
+
+ALIGN 16
+WaverageChromaFilter8_sse2:
+		mov		edx,	[esp + 4]	; pixels
+		mov		ecx,	[esp + 8]	; stride
+
+		mov		eax,	ecx
+		add		eax,	eax
+		sub		edx,	eax			; pixels - 2 * stride
+		sub		edx,	2
+
+		pxor	xmm0,	xmm0
+		pxor	xmm3,	xmm3
+
+		movdqu		xmm1,	[edx]
+		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		movdqu		xmm1,	[edx + ecx]
+		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		add		edx,	eax
+		movdqu		xmm1,	[edx]
+		WEIGHT_LINE3_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		movdqu		xmm1,	[edx + ecx]
+		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		movdqu		xmm1,	[edx + ecx * 2]
+		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		psrlw		xmm3,		6
+		packuswb	xmm3,		xmm3
+		movq		[edx + 2],		xmm3
+
+		ret
\ No newline at end of file
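
WaverageChromaFilter8_sse2 applies the 5x5 kernel listed above its entry point; the weights sum to 64, which is exactly what the closing psrlw xmm3, 6 divides out. A scalar sketch:

#include <stdint.h>

static const uint8_t kKernel[5][5] = {
    { 1, 1,  2, 1, 1 },
    { 1, 2,  4, 2, 1 },
    { 2, 4, 20, 4, 2 },
    { 1, 2,  4, 2, 1 },
    { 1, 1,  2, 1, 1 },               /* weights total 64 */
};

/* One filtered chroma pixel; p points at the centre of the 5x5 window. */
static uint8_t WaverageChromaPixel(const uint8_t *p, int stride)
{
    uint32_t sum = 0;
    for (int dy = -2; dy <= 2; dy++)
        for (int dx = -2; dx <= 2; dx++)
            sum += kKernel[dy + 2][dx + 2] * p[dy * stride + dx];
    return (uint8_t)(sum >> 6);       /* divide by the weight total */
}
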
--- a/processing/src/asm/downsample_bilinear.asm
+++ b/processing/src/asm/downsample_bilinear.asm
@@ -1,1225 +1,1225 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*	upsampling.asm
-;*
-;*  Abstract
-;*		SIMD for pixel domain down sampling
-;*
-;*  History
-;*		10/22/2009	Created
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-
-;***********************************************************************
-; Some constants
-;***********************************************************************
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-shufb_mask_low:
-	db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
-shufb_mask_high:
-	db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
-
-
-ALIGN 16
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_sse(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_sse:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1			; iSrcHeight >> 1	
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth saved in ebx
-	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
-.xloops:
-	; 1st part horizontal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movq mm0, [esi]			; 1st pSrc line
-	movq mm1, [esi+8]		; 1st pSrc line + 8
-	movq mm2, [esi+ecx]		; 2nd pSrc line
-	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
-
-	; to handle mm0, mm1, mm2, mm3
-	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm4, mm5		; d c D C b a B A
-	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
-
-	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm5, mm6		; h g H G f e F E
-	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
-
-	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm6, mm7		; l k L K j i J I
-	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
-
-	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm7, mm0 		; p o P O n m N M
-	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
-
-	; to handle mm4, mm5, mm6, mm7
-	movq mm0, mm4		; 
-	punpckldq mm0, mm5 	; H G F E D C B A
-	punpckhdq mm4, mm5 	; h g f e d c b a
-
-	movq mm1, mm6
-	punpckldq mm1, mm7 	; P O N M L K J I
-	punpckhdq mm6, mm7 	; p o n m l k j i
-
-	; avg within MB horizontal width (16 x 2 lines)
-	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, held pending until the 2nd horizontal part is done, then both are written out together
-
-	; 2nd part horizontal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movq mm1, [esi+16]		; 1st pSrc line + 16
-	movq mm2, [esi+24]		; 1st pSrc line + 24
-	movq mm3, [esi+ecx+16]	; 2nd pSrc line + 16
-	movq mm4, [esi+ecx+24]	; 2nd pSrc line + 24
-
-	; to handle mm1, mm2, mm3, mm4
-	pshufw mm5, mm1, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm6, mm5, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm5, mm6		; d c D C b a B A
-	pshufw mm5, mm5, 0d8h  	; d c b a D C B A ; 11011000 B: mm5
-
-	pshufw mm6, mm2, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm7, mm6, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm6, mm7		; h g H G f e F E
-	pshufw mm6, mm6, 0d8h  	; h g f e H G F E ; 11011000 B: mm6
-
-	pshufw mm7, mm3, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm1, mm7, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm7, mm1		; l k L K j i J I
-	pshufw mm7, mm7, 0d8h  	; l k j i L K J I ; 11011000 B: mm7
-
-	pshufw mm1, mm4, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm2, mm1, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm1, mm2 		; p o P O n m N M
-	pshufw mm1, mm1, 0d8h  	; p o n m P O N M ; 11011000 B: mm1
-
-	; to handle mm5, mm6, mm7, mm1
-	movq mm2, mm5
-	punpckldq mm2, mm6 	; H G F E D C B A
-	punpckhdq mm5, mm6 	; h g f e d c b a
-
-	movq mm3, mm7
-	punpckldq mm3, mm1 	; P O N M L K J I
-	punpckhdq mm7, mm1 	; p o n m l k j i
-
-	; avg within MB horizontal width (16 x 2 lines)
-	pavgb mm2, mm5		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm3, mm7		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm2, mm3		; (temp_row1+temp_row2+1)>>1, completes the 2nd horizontal part
-
-	movq [edi  ], mm0
-	movq [edi+8], mm2
-
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	WELSEMMS
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_sse:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1		; iSrcHeight >> 1	
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth saved in ebx
-	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
-.xloops:
-	; 1st part horizontal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movq mm0, [esi]			; 1st pSrc line
-	movq mm1, [esi+8]		; 1st pSrc line + 8
-	movq mm2, [esi+ecx]		; 2nd pSrc line
-	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
-
-	; to handle mm0, mm1, mm2, mm3
-	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm4, mm5		; d c D C b a B A
-	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
-
-	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm5, mm6		; h g H G f e F E
-	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
-
-	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm6, mm7		; l k L K j i J I
-	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
-
-	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm7, mm0 		; p o P O n m N M
-	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
-
-	; to handle mm4, mm5, mm6, mm7
-	movq mm0, mm4		; 
-	punpckldq mm0, mm5 	; H G F E D C B A
-	punpckhdq mm4, mm5 	; h g f e d c b a
-
-	movq mm1, mm6
-	punpckldq mm1, mm7 	; P O N M L K J I
-	punpckhdq mm6, mm7 	; p o n m l k j i
-
-	; avg within MB horizontal width (16 x 2 lines)
-	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, final average, written out below
-
-	movq [edi  ], mm0	
-
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	WELSEMMS
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx8_sse:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1		; iSrcHeight >> 1	
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth saved in ebx
-	sar eax, $2		; (iSrcWidth >> 1) / 4		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 8 bytes
-.xloops:
-	; 1st part horizontal loop: x8 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A
-	;2nd Line Src:	mm1: h H g G f F e E
-	;=> target:
-	;: H G F E D C B A
-	;: h g f e d c b a
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movq mm0, [esi]			; 1st pSrc line	
-	movq mm1, [esi+ecx]		; 2nd pSrc line	
-
-	; to handle mm0, mm1
-	pshufw mm2, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm3, mm2, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm2, mm3		; d c D C b a B A
-	pshufw mm2, mm2, 0d8h  	; d c b a D C B A ; 11011000 B: mm2
-
-	pshufw mm4, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm5, mm4, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm4, mm5		; h g H G f e F E
-	pshufw mm4, mm4, 0d8h  	; h g f e H G F E ; 11011000 B: mm4
-
-	; to handle mm2, mm4
-	movq mm0, mm2		; 
-	punpckldq mm0, mm4 	; H G F E D C B A
-	punpckhdq mm2, mm4 	; h g f e d c b a
-
-	; avg within MB horizontal width (8 x 2 lines)
-	pavgb mm0, mm2		; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
-	pshufw mm1, mm0, 04eh	; 01001110 B
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, final average, written out below
-
-	movd [edi],	mm0	
-
-	; next unit
-	lea esi, [esi+8]
-	lea edi, [edi+4]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	WELSEMMS
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-
-
-; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_ssse3(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_ssse3:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1			; iSrcHeight >> 1	
-
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth saved in ebx
-	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
-.xloops:
-	; 1st part horizontal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
-	;				xmm1: p P o O n N m M l L k K j J i I
-	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
-	;				xmm3: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: P O N M L K J I H G F E D C B A
-	;: p o n m l k j i h g f e d c b a
-	;: P ..                          A
-	;: p ..                          a
-	
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movdqa xmm0, [esi]			; 1st_src_line
-	movdqa xmm1, [esi+16]		; 1st_src_line + 16
-	movdqa xmm2, [esi+ecx]		; 2nd_src_line
-	movdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16	
-	
-	; packing & avg
-	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	; another implementation for xmm4 high bits
-;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm4
-
-	movdqa xmm5, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm1
-;	psrlw xmm5, 8	
-	pavgb xmm1, xmm5
-
-	movdqa xmm4, xmm2
-	pshufb xmm2, xmm7
-	pshufb xmm4, xmm6
-;	psubb xmm4, xmm2
-;	psrlw xmm4, 8	
-	pavgb xmm2, xmm4
-
-	movdqa xmm5, xmm3
-	pshufb xmm3, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm3
-;	psrlw xmm5, 8	
-	pavgb xmm3, xmm5
-	
-	packuswb xmm0, xmm1	
-	packuswb xmm2, xmm3	
-	pavgb xmm0, xmm2	
-
-	; write pDst
-	movdqa [edi], xmm0
-
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-	
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_ssse3:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1		; iSrcHeight >> 1	
-	movdqa xmm7, [shufb_mask_low]	; mask low	
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth saved in ebx
-	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
-.xloops:
-	; horizontal loop: x16 bytes by source
-	;               mem  hi<-       ->lo
-	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
-	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movdqa xmm0, [esi]			; 1st_src_line	
-	movdqa xmm1, [esi+ecx]		; 2nd_src_line	
-	
-	; packing & avg
-	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	; another implementation for xmm2 high bits
-;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a	
-	pavgb xmm0, xmm2
-
-	movdqa xmm3, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm3, xmm6
-;	psubb xmm3, xmm1
-;	psrlw xmm3, 8	
-	pavgb xmm1, xmm3
-
-	pavgb xmm0, xmm1	
-	packuswb xmm0, xmm1	
-
-	; write pDst
-	movq [edi], xmm0	
-
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-	
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_sse4(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_sse4:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1			; iSrcHeight >> 1	
-
-	movdqa xmm7, [shufb_mask_low]	; mask low	
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth saved in ebx
-	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
-.xloops:
-	; 1st part horizontal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
-	;				xmm1: p P o O n N m M l L k K j J i I
-	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
-	;				xmm3: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: P O N M L K J I H G F E D C B A
-	;: p o n m l k j i h g f e d c b a
-	;: P ..                          A
-	;: p ..                          a
-	
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movntdqa xmm0, [esi]			; 1st_src_line
-	movntdqa xmm1, [esi+16]		; 1st_src_line + 16
-	movntdqa xmm2, [esi+ecx]		; 2nd_src_line
-	movntdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16	
-	
-	; packing & avg
-	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm4
-
-	movdqa xmm5, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm1
-;	psrlw xmm5, 8
-	pavgb xmm1, xmm5
-
-	movdqa xmm4, xmm2
-	pshufb xmm2, xmm7
-	pshufb xmm4, xmm6
-;	psubb xmm4, xmm2
-;	psrlw xmm4, 8
-	pavgb xmm2, xmm4
-
-	movdqa xmm5, xmm3
-	pshufb xmm3, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm3
-;	psrlw xmm5, 8
-	pavgb xmm3, xmm5
-	
-	packuswb xmm0, xmm1	
-	packuswb xmm2, xmm3	
-	pavgb xmm0, xmm2	
-
-	; write pDst
-	movdqa [edi], xmm0
-
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-	
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_sse4:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1		; iSrcHeight >> 1	
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth saved in ebx
-	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
-.xloops:
-	; horizontal loop: x16 bytes by source
-	;               mem  hi<-       ->lo
-	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
-	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movntdqa xmm0, [esi]			; 1st_src_line	
-	movntdqa xmm1, [esi+ecx]		; 2nd_src_line	
-	
-	; packing & avg
-	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm2
-
-	movdqa xmm3, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm3, xmm6
-;	psubb xmm3, xmm1
-;	psrlw xmm3, 8
-	pavgb xmm1, xmm3
-
-	pavgb xmm0, xmm1	
-	packuswb xmm0, xmm1	
-
-	; write pDst
-	movq [edi], xmm0	
-
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-	
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-
-
-
-
-WELS_EXTERN	GeneralBilinearAccurateDownsampler_sse2
-;**************************************************************************************************************
-;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-;							unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
-;                           unsigned int uiScaleX, unsigned int uiScaleY );
-;{
-;**************************************************************************************************************
-
-ALIGN 16
-GeneralBilinearAccurateDownsampler_sse2:
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-%define		pushsize	16
-%define		localsize	28
-%define		pDstData		esp + pushsize + localsize + 4
-%define		dwDstStride		esp + pushsize + localsize + 8
-%define		dwDstWidth		esp + pushsize + localsize + 12
-%define		dwDstHeight		esp + pushsize + localsize + 16
-%define		pSrcData		esp + pushsize + localsize + 20
-%define		dwSrcStride		esp + pushsize + localsize + 24
-%define		dwSrcWidth		esp + pushsize + localsize + 28
-%define		dwSrcHeight		esp + pushsize + localsize + 32
-%define		scale			esp + 0
-%define		uiScaleX			esp + pushsize + localsize + 36
-%define		uiScaleY			esp + pushsize + localsize + 40
-%define		tmpHeight		esp + 12
-%define		yInverse		esp + 16
-%define		xInverse		esp + 20
-%define		dstStep			esp + 24
-	sub		esp,			localsize
-	
-	pxor	xmm0,	xmm0
-	mov		edx,	32767
-	mov		eax,	[uiScaleX]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm1,		eax						; uinc (uiScaleX & 32767)
-	movd	xmm2,		ebx						; -uinc
-	psllq	xmm1,		32
-	por		xmm1,		xmm2					; 0 0  uinc  -uinc   (dword)
-	pshufd	xmm7,		xmm1,	01000100b		; xmm7: uinc -uinc uinc -uinc
-	
-	mov		eax,	[uiScaleY]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm6,		eax						; vinc (uiScaleY & 32767)
-	movd	xmm2,		ebx						; -vinc
-	psllq	xmm6,		32
-	por		xmm6,		xmm2					; 0 0 vinc -vinc (dword)
-	pshufd	xmm6,		xmm6,	01010000b		; xmm6: vinc vinc -vinc -vinc
-	
-	mov		edx,		40003fffh
-	movd	xmm5,		edx
-	punpcklwd	xmm5,	xmm0					; 16384 16383
-	pshufd	xmm5,		xmm5,	01000100b		; xmm5: 16384 16383 16384 16383
-	
-
-DOWNSAMPLE:
-	
-	mov		eax,			[dwDstHeight]
-	mov		edi,			[pDstData]
-	mov		edx,			[dwDstStride]
-	mov		ecx,			[dwDstWidth]
-	sub		edx,			ecx
-	mov		[dstStep],	edx				; stride - width
-	dec		eax
-	mov		[tmpHeight],	eax
-	mov		eax,			16384
-	mov		[yInverse],		eax
-	
-	pshufd	xmm4,		xmm5,	01010000b	; initial v to 16384 16384 16383 16383
-	
-HEIGHT:	
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	mov		ebp,	esi
-	add		ebp,	[dwSrcStride]
-	
-	mov		eax,		16384
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	dec		ecx
-	
-	movdqa	xmm3,		xmm5			; initial u to 16384 16383 16384 16383
-	
-WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		15
-	
-	movd	xmm1,		[esi+eax]		; xxxxxxba
-	movd	xmm2,		[ebp+eax]		; xxxxxxdc
-	pxor	xmm0,		xmm0
-	punpcklwd	xmm1,	xmm2			; xxxxdcba
-	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
-	punpcklwd	xmm1,	xmm0			; 000d000c000b000a
-	
-	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
-	pmaddwd	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
-	movdqa	xmm0,	xmm2
-	pmuludq	xmm2,	xmm1
-	psrlq	xmm0,	32
-	psrlq	xmm1,	32
-	pmuludq	xmm0,	xmm1
-	paddq	xmm2,	xmm0
-	pshufd	xmm1,	xmm2,	00001110b
-	paddq	xmm2,	xmm1
-	psrlq	xmm2,	29
-	
-	movd	eax,	xmm2
-	inc		eax
-	shr		eax,	1
-	mov		[edi],	al
-	inc		edi
-	
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-	
-	paddw	xmm3,		xmm7			; inc u
-	psllw	xmm3,		1
-	psrlw	xmm3,		1
-	
-	loop	WIDTH
-
-WIDTH_END:
-	mov		eax,		[xInverse]
-	shr		eax,		15
-	mov		cl,			[esi+eax]
-	mov		[edi],		cl
-	inc		edi
-	
-	mov		eax,		[uiScaleY]
-	add		[yInverse],	eax
-	add		edi,		[dstStep]
-	
-	paddw	xmm4,	xmm6				; inc v
-	psllw	xmm4,	1
-	psrlw	xmm4,	1
-	
-	dec		dword [tmpHeight]
-	jg		HEIGHT
-
-
-LAST_ROW:	
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	
-	mov		eax,		16384
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	
-LAST_ROW_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		15
-	
-	mov		al,			[esi+eax]
-	mov		[edi],	al
-	inc		edi
-	
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-	
-	loop	LAST_ROW_WIDTH
-
-LAST_ROW_END:
-
-	add		esp,			localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		pushsize
-%undef		localsize
-%undef		pSrcData
-%undef		dwSrcWidth
-%undef		dwSrcHeight
-%undef		dwSrcStride
-%undef		pDstData
-%undef		dwDstWidth
-%undef		dwDstHeight
-%undef		dwDstStride
-%undef		scale
-%undef		uiScaleX
-%undef		uiScaleY
-%undef		tmpHeight
-%undef		yInverse
-%undef		xInverse
-%undef		dstStep
-	ret
-	
-	
-	
-	
-WELS_EXTERN	GeneralBilinearFastDownsampler_sse2
-;**************************************************************************************************************
-;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-;				unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
-;               unsigned int uiScaleX, unsigned int uiScaleY );
-;{
-;**************************************************************************************************************
-
-ALIGN 16
-GeneralBilinearFastDownsampler_sse2:
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-%define		pushsize	16
-%define		localsize	28
-%define		pDstData		esp + pushsize + localsize + 4
-%define		dwDstStride		esp + pushsize + localsize + 8
-%define		dwDstWidth		esp + pushsize + localsize + 12
-%define		dwDstHeight		esp + pushsize + localsize + 16
-%define		pSrcData		esp + pushsize + localsize + 20
-%define		dwSrcStride		esp + pushsize + localsize + 24
-%define		dwSrcWidth		esp + pushsize + localsize + 28
-%define		dwSrcHeight		esp + pushsize + localsize + 32
-%define		scale			esp + 0
-%define		uiScaleX			esp + pushsize + localsize + 36
-%define		uiScaleY			esp + pushsize + localsize + 40
-%define		tmpHeight		esp + 12
-%define		yInverse		esp + 16
-%define		xInverse		esp + 20
-%define		dstStep			esp + 24
-	sub		esp,			localsize
-	
-	pxor	xmm0,	xmm0
-	mov		edx,	65535
-	mov		eax,	[uiScaleX]
-	and		eax,	edx
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	65535
-	movd	xmm1,		eax						; uinc(uiScaleX mod 65536)
-	movd	xmm2,		ebx						; -uinc
-	psllq	xmm1,		32
-	por		xmm1,		xmm2					; 0 uinc 0 -uinc
-	pshuflw	xmm7,		xmm1,	10001000b		; xmm7: uinc -uinc uinc -uinc
-	
-	mov		eax,	[uiScaleY]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm6,		eax						; vinc (uiScaleY & 32767)
-	movd	xmm2,		ebx						; -vinc
-	psllq	xmm6,		32
-	por		xmm6,		xmm2					; 0 vinc 0 -vinc
-	pshuflw	xmm6,		xmm6,	10100000b		; xmm6: vinc vinc -vinc -vinc
-	
-	mov		edx,		80007fffh				; 32768 32767
-	movd	xmm5,		edx					
-	pshuflw	xmm5,		xmm5,		01000100b	; 32768 32767 32768 32767
-	mov		ebx,		16384
-	
-
-FAST_DOWNSAMPLE:
-	
-	mov		eax,			[dwDstHeight]
-	mov		edi,			[pDstData]
-	mov		edx,			[dwDstStride]
-	mov		ecx,			[dwDstWidth]
-	sub		edx,			ecx
-	mov		[dstStep],	edx				; stride - width
-	dec		eax
-	mov		[tmpHeight],	eax
-	mov		eax,		16384
-	mov		[yInverse],		eax
-	
-	pshuflw	xmm4,		xmm5,	01010000b
-	psrlw	xmm4,		1				; initial v to 16384 16384 16383 16383
-	
-FAST_HEIGHT:	
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	mov		ebp,	esi
-	add		ebp,	[dwSrcStride]
-	
-	mov		eax,		32768
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	dec		ecx
-	
-	movdqa	xmm3,		xmm5			; initial u to 32768 32767 32768 32767
-	
-FAST_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		16
-	
-	movd	xmm1,		[esi+eax]		; xxxxxxba
-	movd	xmm2,		[ebp+eax]		; xxxxxxdc
-	punpcklwd	xmm1,	xmm2			; xxxxdcba
-	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
-	
-	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
-	pmulhuw	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
-	pmaddwd		xmm2,	xmm1
-	pshufd	xmm1,	xmm2,	00000001b
-	paddd	xmm2,	xmm1
-	movd	xmm1,	ebx
-	paddd	xmm2,	xmm1
-	psrld	xmm2,	15
-	
-	packuswb	xmm2,	xmm0
-	movd	eax,	xmm2
-	mov		[edi],	al
-	inc		edi
-	
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-	
-	paddw	xmm3,		xmm7			; inc u
-	
-	loop	FAST_WIDTH
-
-FAST_WIDTH_END:
-	mov		eax,		[xInverse]
-	shr		eax,		16
-	mov		cl,			[esi+eax]
-	mov		[edi],		cl
-	inc		edi
-	
-	mov		eax,		[uiScaleY]
-	add		[yInverse],	eax
-	add		edi,		[dstStep]
-	
-	paddw	xmm4,	xmm6				; inc v
-	psllw	xmm4,	1
-	psrlw	xmm4,	1
-	
-	dec		dword [tmpHeight]
-	jg		FAST_HEIGHT
-
-
-FAST_LAST_ROW:	
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	
-	mov		eax,		32768
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	
-FAST_LAST_ROW_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		16
-	
-	mov		al,			[esi+eax]
-	mov		[edi],	al
-	inc		edi
-	
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-	
-	loop	FAST_LAST_ROW_WIDTH
-
-FAST_LAST_ROW_END:
-
-	add		esp,			localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		pushsize
-%undef		localsize
-%undef		pSrcData
-%undef		dwSrcWidth
-%undef		dwSrcHeight
-%undef		dwSrcStride
-%undef		pDstData
-%undef		dwDstWidth
-%undef		dwDstHeight
-%undef		dwDstStride
-%undef		scale
-%undef		uiScaleX
-%undef		uiScaleY
-%undef		tmpHeight
-%undef		yInverse
-%undef		xInverse
-%undef		dstStep
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	downsample_bilinear.asm
+;*
+;*  Abstract
+;*		SIMD for pixel domain down sampling
+;*
+;*  History
+;*		10/22/2009	Created
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+
+;***********************************************************************
+; Some constants
+;***********************************************************************
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+ALIGN 16
+shufb_mask_low:
+	db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
+shufb_mask_high:
+	db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
+
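The two shufb masks above drive the ssse3/sse4 paths below: pshufb zeroes any destination byte whose mask entry has the high bit (80h) set, so shufb_mask_low spreads the even-indexed source bytes into the low byte of each 16-bit lane, and shufb_mask_high does the same for the odd-indexed bytes. A minimal intrinsics sketch of that split (mask constants transcribed from the table above; the helper name is illustrative, not part of the library):

    #include <tmmintrin.h>  /* SSSE3: _mm_shuffle_epi8 */

    /* Split 16 interleaved bytes into even/odd pixels, each zero-extended
     * into a 16-bit lane -- the effect of pshufb with the masks above. */
    static void split_even_odd(__m128i src, __m128i *even, __m128i *odd) {
        const __m128i mask_low  = _mm_set_epi8(-128, 14, -128, 12, -128, 10, -128, 8,
                                               -128,  6, -128,  4, -128,  2, -128, 0);
        const __m128i mask_high = _mm_set_epi8(-128, 15, -128, 13, -128, 11, -128, 9,
                                               -128,  7, -128,  5, -128,  3, -128, 1);
        *even = _mm_shuffle_epi8(src, mask_low);   /* even-offset bytes, high bytes zero */
        *odd  = _mm_shuffle_epi8(src, mask_high);  /* odd-offset bytes, high bytes zero */
    }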
+
+ALIGN 16
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx32_sse(	unsigned char* pDst, const int iDstStride,
+;					unsigned char* pSrc, const int iSrcStride,
+;					const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_sse:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1			; iSrcHeight >> 1
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1			; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth saved in ebx
+	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
+	neg ebx				; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 32 bytes
+.xloops:
+	; 1st part horizontal loop: x16 bytes
+	;               mem  hi<-       ->lo
+	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
+	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movq mm0, [esi]			; 1st pSrc line
+	movq mm1, [esi+8]		; 1st pSrc line + 8
+	movq mm2, [esi+ecx]		; 2nd pSrc line
+	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
+
+	; to handle mm0, mm1, mm2, mm3
+	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm4, mm5		; d c D C b a B A
+	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
+
+	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm5, mm6		; h g H G f e F E
+	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
+
+	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
+	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
+	punpcklbw mm6, mm7		; l k L K j i J I
+	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
+
+	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
+	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
+	punpcklbw mm7, mm0 		; p o P O n m N M
+	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
+
+	; to handle mm4, mm5, mm6, mm7
+	movq mm0, mm4		;
+	punpckldq mm0, mm5 	; H G F E D C B A
+	punpckhdq mm4, mm5 	; h g f e d c b a
+
+	movq mm1, mm6
+	punpckldq mm1, mm7 	; P O N M L K J I
+	punpckhdq mm6, mm7 	; p o n m l k j i
+
+	; avg within MB horizontal width (16 x 2 lines)
+	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, held pending until the 2nd horizontal part is done, then both are written out together
+
+	; 2nd part horizontal loop: x16 bytes
+	;               mem  hi<-       ->lo
+	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
+	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movq mm1, [esi+16]		; 1st pSrc line + 16
+	movq mm2, [esi+24]		; 1st pSrc line + 24
+	movq mm3, [esi+ecx+16]	; 2nd pSrc line + 16
+	movq mm4, [esi+ecx+24]	; 2nd pSrc line + 24
+
+	; to handle mm1, mm2, mm3, mm4
+	pshufw mm5, mm1, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm6, mm5, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm5, mm6		; d c D C b a B A
+	pshufw mm5, mm5, 0d8h  	; d c b a D C B A ; 11011000 B: mm5
+
+	pshufw mm6, mm2, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm7, mm6, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm6, mm7		; h g H G f e F E
+	pshufw mm6, mm6, 0d8h  	; h g f e H G F E ; 11011000 B: mm6
+
+	pshufw mm7, mm3, 0d8h	; l L j J k K i I ; 11011000 B
+	pshufw mm1, mm7, 04eh	; k K i I l L j J ; 01001110 B
+	punpcklbw mm7, mm1		; l k L K j i J I
+	pshufw mm7, mm7, 0d8h  	; l k j i L K J I ; 11011000 B: mm7
+
+	pshufw mm1, mm4, 0d8h	; p P n N o O m M ; 11011000 B
+	pshufw mm2, mm1, 04eh	; o O m M p P n N ; 01001110 B
+	punpcklbw mm1, mm2 		; p o P O n m N M
+	pshufw mm1, mm1, 0d8h  	; p o n m P O N M ; 11011000 B: mm1
+
+	; to handle mm5, mm6, mm7, mm1
+	movq mm2, mm5
+	punpckldq mm2, mm6 	; H G F E D C B A
+	punpckhdq mm5, mm6 	; h g f e d c b a
+
+	movq mm3, mm7
+	punpckldq mm3, mm1 	; P O N M L K J I
+	punpckhdq mm7, mm1 	; p o n m l k j i
+
+	; avg within MB horizontal width (16 x 2 lines)
+	pavgb mm2, mm5		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+	pavgb mm3, mm7		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm2, mm3		; (temp_row1+temp_row2+1)>>1, completes the 2nd horizontal part
+
+	movq [edi  ], mm0
+	movq [edi+8], mm2
+
+	; next SMB
+	lea esi, [esi+32]
+	lea edi, [edi+16]
+
+	dec eax
+	jg near .xloops
+
+	; next line
+	lea esi, [esi+2*ecx]	; next end of lines
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+
+	WELSEMMS
+	pop ebp
+	pop	edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
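
Functionally, all of the DyadicBilinearDownsampler* variants in this file compute the same result: one destination pixel per 2x2 block of source pixels. A plain-C reference sketch of that result (name and signature illustrative, not the shipped code; the MMX path above reaches it via cascaded pavgb, whose stage-by-stage rounding can differ from the single rounded sum by one):

    #include <stdint.h>

    /* Each output pixel is the rounded average of a 2x2 source block. */
    static void DyadicDownsampleRef(uint8_t *pDst, int iDstStride,
                                    const uint8_t *pSrc, int iSrcStride,
                                    int iSrcWidth, int iSrcHeight) {
        for (int y = 0; y < iSrcHeight / 2; y++) {
            const uint8_t *s0 = pSrc + 2 * y * iSrcStride;
            const uint8_t *s1 = s0 + iSrcStride;
            uint8_t *d = pDst + y * iDstStride;
            for (int x = 0; x < iSrcWidth / 2; x++)
                d[x] = (uint8_t)((s0[2 * x] + s0[2 * x + 1] +
                                  s1[2 * x] + s1[2 * x + 1] + 2) >> 2);
        }
    }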
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride,
+;					  const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_sse:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1		; iSrcHeight >> 1
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1		; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth saved in ebx
+	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
+	neg ebx			; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 16 bytes
+.xloops:
+	; 1st part horizontal loop: x16 bytes
+	;               mem  hi<-       ->lo
+	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
+	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movq mm0, [esi]			; 1st pSrc line
+	movq mm1, [esi+8]		; 1st pSrc line + 8
+	movq mm2, [esi+ecx]		; 2nd pSrc line
+	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
+
+	; to handle mm0, mm1, mm2, mm3
+	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm4, mm5		; d c D C b a B A
+	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
+
+	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm5, mm6		; h g H G f e F E
+	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
+
+	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
+	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
+	punpcklbw mm6, mm7		; l k L K j i J I
+	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
+
+	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
+	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
+	punpcklbw mm7, mm0 		; p o P O n m N M
+	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
+
+	; to handle mm4, mm5, mm6, mm7
+	movq mm0, mm4		;
+	punpckldq mm0, mm5 	; H G F E D C B A
+	punpckhdq mm4, mm5 	; h g f e d c b a
+
+	movq mm1, mm6
+	punpckldq mm1, mm7 	; P O N M L K J I
+	punpckhdq mm6, mm7 	; p o n m l k j i
+
+	; avg within MB horizontal width (16 x 2 lines)
+	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, final average, written out below
+
+	movq [edi  ], mm0
+
+	; next SMB
+	lea esi, [esi+16]
+	lea edi, [edi+8]
+
+	dec eax
+	jg near .xloops
+
+	; next line
+	lea esi, [esi+2*ecx]	; next end of lines
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+
+	WELSEMMS
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
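
The MMX paths lean on two pshufw immediates: 0d8h (11011000b) reorders the four words as 0,2,1,3 and 04eh (01001110b) as 2,3,0,1, i.e. a dword swap. A scalar model of pshufw, handy for checking the byte-flow comments above (the function is illustrative only):

    #include <stdint.h>

    /* Model of `pshufw dst, src, imm8`: result word i is source word
     * ((imm8 >> 2*i) & 3); e.g. imm8 = 0xd8 selects words 0,2,1,3. */
    static uint64_t pshufw_model(uint64_t src, unsigned imm8) {
        uint64_t r = 0;
        for (int i = 0; i < 4; i++) {
            unsigned sel = (imm8 >> (2 * i)) & 3;  /* source word index */
            r |= ((src >> (16 * sel)) & 0xffffULL) << (16 * i);
        }
        return r;
    }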
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride,
+;					  const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx8_sse:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1		; iSrcHeight >> 1
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1		; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth saved in ebx
+	sar eax, $2		; (iSrcWidth >> 1) / 4		; loop count = num_of_mb
+	neg ebx			; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 8 bytes
+.xloops:
+	; 1st part horizontal loop: x8 bytes
+	;               mem  hi<-       ->lo
+	;1st Line Src:	mm0: d D c C b B a A
+	;2nd Line Src:	mm1: h H g G f F e E
+	;=> target:
+	;: H G F E D C B A
+	;: h g f e d c b a
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movq mm0, [esi]			; 1st pSrc line
+	movq mm1, [esi+ecx]		; 2nd pSrc line
+
+	; to handle mm0, mm1
+	pshufw mm2, mm0, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm3, mm2, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm2, mm3		; d c D C b a B A
+	pshufw mm2, mm2, 0d8h  	; d c b a D C B A ; 11011000 B: mm2
+
+	pshufw mm4, mm1, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm5, mm4, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm4, mm5		; h g H G f e F E
+	pshufw mm4, mm4, 0d8h  	; h g f e H G F E ; 11011000 B: mm4
+
+	; to handle mm2, mm4
+	movq mm0, mm2		;
+	punpckldq mm0, mm4 	; H G F E D C B A
+	punpckhdq mm2, mm4 	; h g f e d c b a
+
+	; avg within MB horizontal width (8 x 2 lines)
+	pavgb mm0, mm2		; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
+	pshufw mm1, mm0, 04eh	; 01001110 B
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, final average, written out below
+
+	movd [edi],	mm0
+
+	; next unit
+	lea esi, [esi+8]
+	lea edi, [edi+4]
+
+	dec eax
+	jg near .xloops
+
+	; next line
+	lea esi, [esi+2*ecx]	; next end of lines
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+
+	WELSEMMS
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
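
In the x8 tail above, the first pavgb leaves the horizontal averages of line 1 in the low dword of mm0 and those of line 2 in the high dword; pshufw with 04eh then swaps the two dwords so the final pavgb forms the vertical average, of which only the low four bytes are stored. Using the pshufw_model sketch above:

    #include <assert.h>

    /* e.g., inside a test function: 04eh swaps the two dword halves */
    assert(pshufw_model(0x1122334455667788ULL, 0x4e) == 0x5566778811223344ULL);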
+
+
+
+; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx32_ssse3(	unsigned char* pDst, const int iDstStride,
+;					unsigned char* pSrc, const int iSrcStride,
+;					const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_ssse3:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1			; iSrcHeight >> 1
+
+	movdqa xmm7, [shufb_mask_low]	; mask low
+	movdqa xmm6, [shufb_mask_high]	; mask high
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1			; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth saved in ebx
+	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
+	neg ebx				; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 32 bytes
+.xloops:
+	; 1st part horizontal loop: x16 bytes
+	;               mem  hi<-       ->lo
+	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
+	;				xmm1: p P o O n N m M l L k K j J i I
+	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
+	;				xmm3: p P o O n N m M l L k K j J i I
+	;=> target:
+	;: P O N M L K J I H G F E D C B A
+	;: p o n m l k j i h g f e d c b a
+	;: P ..                          A
+	;: p ..                          a
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movdqa xmm0, [esi]			; 1st_src_line
+	movdqa xmm1, [esi+16]		; 1st_src_line + 16
+	movdqa xmm2, [esi+ecx]		; 2nd_src_line
+	movdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16
+
+	; packing & avg
+	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	; another implementation for xmm4 high bits
+;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm4
+
+	movdqa xmm5, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm1
+;	psrlw xmm5, 8
+	pavgb xmm1, xmm5
+
+	movdqa xmm4, xmm2
+	pshufb xmm2, xmm7
+	pshufb xmm4, xmm6
+;	psubb xmm4, xmm2
+;	psrlw xmm4, 8
+	pavgb xmm2, xmm4
+
+	movdqa xmm5, xmm3
+	pshufb xmm3, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm3
+;	psrlw xmm5, 8
+	pavgb xmm3, xmm5
+
+	packuswb xmm0, xmm1
+	packuswb xmm2, xmm3
+	pavgb xmm0, xmm2
+
+	; write pDst
+	movdqa [edi], xmm0
+
+	; next SMB
+	lea esi, [esi+32]
+	lea edi, [edi+16]
+
+	dec eax
+	jg near .xloops
+
+	; next line
+	lea esi, [esi+2*ecx]	; next end of lines
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+
+	pop ebp
+	pop	edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
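
A hedged intrinsics transcription of one 32-byte iteration of the SSSE3 loop above (helper name illustrative; mask registers set up as in the split_even_odd sketch earlier; both lines must be 16-byte aligned, matching movdqa):

    #include <stdint.h>
    #include <tmmintrin.h>

    static __m128i Downsample32Step(const uint8_t *l0, const uint8_t *l1,
                                    __m128i mlo, __m128i mhi) {
        __m128i a0 = _mm_load_si128((const __m128i *)l0);
        __m128i a1 = _mm_load_si128((const __m128i *)(l0 + 16));
        __m128i b0 = _mm_load_si128((const __m128i *)l1);
        __m128i b1 = _mm_load_si128((const __m128i *)(l1 + 16));
        /* horizontal average: even vs odd bytes of each line, still in word lanes */
        a0 = _mm_avg_epu8(_mm_shuffle_epi8(a0, mlo), _mm_shuffle_epi8(a0, mhi));
        a1 = _mm_avg_epu8(_mm_shuffle_epi8(a1, mlo), _mm_shuffle_epi8(a1, mhi));
        b0 = _mm_avg_epu8(_mm_shuffle_epi8(b0, mlo), _mm_shuffle_epi8(b0, mhi));
        b1 = _mm_avg_epu8(_mm_shuffle_epi8(b1, mlo), _mm_shuffle_epi8(b1, mhi));
        /* repack to bytes, then the vertical average of the two lines */
        return _mm_avg_epu8(_mm_packus_epi16(a0, a1), _mm_packus_epi16(b0, b1));
    }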
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride,
+;					  const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_ssse3:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1		; iSrcHeight >> 1
+	movdqa xmm7, [shufb_mask_low]	; mask low
+	movdqa xmm6, [shufb_mask_high]	; mask high
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1		; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth saved in ebx
+	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
+	neg ebx			; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 16 bytes
+.xloops:
+	; horizontal loop: x16 bytes by source
+	;               mem  hi<-       ->lo
+	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
+	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movdqa xmm0, [esi]			; 1st_src_line
+	movdqa xmm1, [esi+ecx]		; 2nd_src_line
+
+	; packing & avg
+	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	; another implementation for xmm2 high bits
+;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm2
+
+	movdqa xmm3, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm3, xmm6
+;	psubb xmm3, xmm1
+;	psrlw xmm3, 8
+	pavgb xmm1, xmm3
+
+	pavgb xmm0, xmm1
+	packuswb xmm0, xmm1
+
+	; write pDst
+	movq [edi], xmm0
+
+	; next SMB
+	lea esi, [esi+16]
+	lea edi, [edi+8]
+
+	dec eax
+	jg near .xloops
+
+	; next line
+	lea esi, [esi+2*ecx]	; next end of lines
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx32_sse4(	unsigned char* pDst, const int iDstStride,
+;					unsigned char* pSrc, const int iSrcStride,
+;					const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_sse4:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1			; iSrcHeight >> 1
+
+	movdqa xmm7, [shufb_mask_low]	; mask low
+	movdqa xmm6, [shufb_mask_high]	; mask high
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1			; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth saved in ebx
+	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
+	neg ebx				; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 32 bytes
+.xloops:
+	; 1st part horizontal loop: x16 bytes
+	;               mem  hi<-       ->lo
+	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
+	;				xmm1: p P o O n N m M l L k K j J i I
+	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
+	;				xmm3: p P o O n N m M l L k K j J i I
+	;=> target:
+	;: P O N M L K J I H G F E D C B A
+	;: p o n m l k j i h g f e d c b a
+	;: P ..                          A
+	;: p ..                          a
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movntdqa xmm0, [esi]			; 1st_src_line
+	movntdqa xmm1, [esi+16]		; 1st_src_line + 16
+	movntdqa xmm2, [esi+ecx]		; 2nd_src_line
+	movntdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16
+
+	; packing & avg
+	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm4
+
+	movdqa xmm5, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm1
+;	psrlw xmm5, 8
+	pavgb xmm1, xmm5
+
+	movdqa xmm4, xmm2
+	pshufb xmm2, xmm7
+	pshufb xmm4, xmm6
+;	psubb xmm4, xmm2
+;	psrlw xmm4, 8
+	pavgb xmm2, xmm4
+
+	movdqa xmm5, xmm3
+	pshufb xmm3, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm3
+;	psrlw xmm5, 8
+	pavgb xmm3, xmm5
+
+	packuswb xmm0, xmm1
+	packuswb xmm2, xmm3
+	pavgb xmm0, xmm2
+
+	; write pDst
+	movdqa [edi], xmm0
+
+	; next SMB
+	lea esi, [esi+32]
+	lea edi, [edi+16]
+
+	dec eax
+	jg near .xloops
+
+	; next line
+	lea esi, [esi+2*ecx]	; next end of lines
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+
+	pop ebp
+	pop	edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
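
The only functional change in the sse4 variants relative to ssse3 is the load: movntdqa (SSE4.1) is an aligned 16-byte load with a non-temporal hint, which mainly pays off when pSrc lives in write-combining memory; from ordinary write-back memory it behaves like a plain movdqa. The matching intrinsic, for reference (the cast is needed because the intrinsic historically takes a non-const pointer):

    #include <stdint.h>
    #include <smmintrin.h>  /* SSE4.1 */

    static __m128i StreamLoad(const uint8_t *p) {    /* p must be 16-byte aligned */
        return _mm_stream_load_si128((__m128i *)p);  /* movntdqa */
    }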
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride,
+;					  const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_sse4:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1		; iSrcHeight >> 1
+	movdqa xmm7, [shufb_mask_low]	; mask low
+	movdqa xmm6, [shufb_mask_high]	; mask high
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1		; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth saved in ebx
+	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
+	neg ebx			; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 16 bytes
+.xloops:
+	; horizontal loop: x16 bytes by source
+	;               mem  hi<-       ->lo
+	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
+	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movntdqa xmm0, [esi]			; 1st_src_line
+	movntdqa xmm1, [esi+ecx]		; 2nd_src_line
+
+	; packing & avg
+	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm2
+
+	movdqa xmm3, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm3, xmm6
+;	psubb xmm3, xmm1
+;	psrlw xmm3, 8
+	pavgb xmm1, xmm3
+
+	pavgb xmm0, xmm1
+	packuswb xmm0, xmm1
+
+	; write pDst
+	movq [edi], xmm0
+
+	; next SMB
+	lea esi, [esi+16]
+	lea edi, [edi+8]
+
+	dec eax
+	jg near .xloops
+
+	; next line
+	lea esi, [esi+2*ecx]	; next end of lines
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+
+
+
+
+WELS_EXTERN	GeneralBilinearAccurateDownsampler_sse2
+;**************************************************************************************************************
+;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+;							unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+;                           unsigned int uiScaleX, unsigned int uiScaleY );
+;{
+;**************************************************************************************************************
+
+ALIGN 16
+GeneralBilinearAccurateDownsampler_sse2:
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+%define		pushsize	16
+%define		localsize	28
+%define		pDstData		esp + pushsize + localsize + 4
+%define		dwDstStride		esp + pushsize + localsize + 8
+%define		dwDstWidth		esp + pushsize + localsize + 12
+%define		dwDstHeight		esp + pushsize + localsize + 16
+%define		pSrcData		esp + pushsize + localsize + 20
+%define		dwSrcStride		esp + pushsize + localsize + 24
+%define		dwSrcWidth		esp + pushsize + localsize + 28
+%define		dwSrcHeight		esp + pushsize + localsize + 32
+%define		scale			esp + 0
+%define		uiScaleX			esp + pushsize + localsize + 36
+%define		uiScaleY			esp + pushsize + localsize + 40
+%define		tmpHeight		esp + 12
+%define		yInverse		esp + 16
+%define		xInverse		esp + 20
+%define		dstStep			esp + 24
+	sub		esp,			localsize
+
+	pxor	xmm0,	xmm0
+	mov		edx,	32767
+	mov		eax,	[uiScaleX]
+	and		eax,	32767
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	32767
+	movd	xmm1,		eax						; uinc(uiScaleX mod 32767)
+	movd	xmm2,		ebx						; -uinc
+	psllq	xmm1,		32
+	por		xmm1,		xmm2					; 0 0  uinc  -uinc   (dword)
+	pshufd	xmm7,		xmm1,	01000100b		; xmm7: uinc -uinc uinc -uinc
+
+	mov		eax,	[uiScaleY]
+	and		eax,	32767
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	32767
+	movd	xmm6,		eax						; vinc(uiScaleY mod 32767)
+	movd	xmm2,		ebx						; -vinc
+	psllq	xmm6,		32
+	por		xmm6,		xmm2					; 0 0 vinc -vinc (dword)
+	pshufd	xmm6,		xmm6,	01010000b		; xmm6: vinc vinc -vinc -vinc
+
+	mov		edx,		40003fffh
+	movd	xmm5,		edx
+	punpcklwd	xmm5,	xmm0					; 16384 16383
+	pshufd	xmm5,		xmm5,	01000100b		; xmm5: 16384 16383 16384 16383
+
+
+DOWNSAMPLE:
+
+	mov		eax,			[dwDstHeight]
+	mov		edi,			[pDstData]
+	mov		edx,			[dwDstStride]
+	mov		ecx,			[dwDstWidth]
+	sub		edx,			ecx
+	mov		[dstStep],	edx				; stride - width
+	dec		eax
+	mov		[tmpHeight],	eax
+	mov		eax,			16384
+	mov		[yInverse],		eax
+
+	pshufd	xmm4,		xmm5,	01010000b	; initial v to 16384 16384 16383 16383
+
+HEIGHT:
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15
+	mul		dword [dwSrcStride]
+	add		esi,	eax					; get current row address
+	mov		ebp,	esi
+	add		ebp,	[dwSrcStride]
+
+	mov		eax,		16384
+	mov		[xInverse],		eax
+	mov		ecx,			[dwDstWidth]
+	dec		ecx
+
+	movdqa	xmm3,		xmm5			; initial u to 16384 16383 16384 16383
+
+WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		15
+
+	movd	xmm1,		[esi+eax]		; xxxxxxba
+	movd	xmm2,		[ebp+eax]		; xxxxxxdc
+	pxor	xmm0,		xmm0
+	punpcklwd	xmm1,	xmm2			; xxxxdcba
+	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
+	punpcklwd	xmm1,	xmm0			; 000d000c000b000a
+
+	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
+	pmaddwd	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
+	movdqa	xmm0,	xmm2
+	pmuludq	xmm2,	xmm1
+	psrlq	xmm0,	32
+	psrlq	xmm1,	32
+	pmuludq	xmm0,	xmm1
+	paddq	xmm2,	xmm0
+	pshufd	xmm1,	xmm2,	00001110b
+	paddq	xmm2,	xmm1
+	psrlq	xmm2,	29
+
+	movd	eax,	xmm2
+	inc		eax
+	shr		eax,	1
+	mov		[edi],	al
+	inc		edi
+
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax
+
+	paddw	xmm3,		xmm7			; inc u
+	psllw	xmm3,		1
+	psrlw	xmm3,		1
+
+	loop	WIDTH
+
+WIDTH_END:
+	mov		eax,		[xInverse]
+	shr		eax,		15
+	mov		cl,			[esi+eax]
+	mov		[edi],		cl
+	inc		edi
+
+	mov		eax,		[uiScaleY]
+	add		[yInverse],	eax
+	add		edi,		[dstStep]
+
+	paddw	xmm4,	xmm6				; inc v
+	psllw	xmm4,	1
+	psrlw	xmm4,	1
+
+	dec		dword [tmpHeight]
+	jg		HEIGHT
+
+
+LAST_ROW:
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15
+	mul		dword [dwSrcStride]
+	add		esi,	eax					; get current row address
+
+	mov		eax,		16384
+	mov		[xInverse],		eax
+	mov		ecx,			[dwDstWidth]
+
+LAST_ROW_WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		15
+
+	mov		al,			[esi+eax]
+	mov		[edi],	al
+	inc		edi
+
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax
+
+	loop	LAST_ROW_WIDTH
+
+LAST_ROW_END:
+
+	add		esp,			localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		pushsize
+%undef		localsize
+%undef		pSrcData
+%undef		dwSrcWidth
+%undef		dwSrcHeight
+%undef		dwSrcStride
+%undef		pDstData
+%undef		dwDstWidth
+%undef		dwDstHeight
+%undef		dwDstStride
+%undef		scale
+%undef		uiScaleX
+%undef		uiScaleY
+%undef		tmpHeight
+%undef		yInverse
+%undef		xInverse
+%undef		dstStep
+	ret
+
+
+
+
+WELS_EXTERN	GeneralBilinearFastDownsampler_sse2
+;**************************************************************************************************************
+;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+;				unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+;               unsigned int uiScaleX, unsigned int uiScaleY );
+;{
+;**************************************************************************************************************
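+; same bilinear blend as GeneralBilinearAccurateDownsampler_sse2, but the
+; u/v weight product is taken with pmulhuw (high 16 bits only) and rounded
+; once at the end (+16384, then >> 15), trading a little precision for a
+; shorter dependency chain.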
+
+ALIGN 16
+GeneralBilinearFastDownsampler_sse2:
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+%define		pushsize	16
+%define		localsize	28
+%define		pDstData		esp + pushsize + localsize + 4
+%define		dwDstStride		esp + pushsize + localsize + 8
+%define		dwDstWidth		esp + pushsize + localsize + 12
+%define		dwDstHeight		esp + pushsize + localsize + 16
+%define		pSrcData		esp + pushsize + localsize + 20
+%define		dwSrcStride		esp + pushsize + localsize + 24
+%define		dwSrcWidth		esp + pushsize + localsize + 28
+%define		dwSrcHeight		esp + pushsize + localsize + 32
+%define		scale			esp + 0
+%define		uiScaleX			esp + pushsize + localsize + 36
+%define		uiScaleY			esp + pushsize + localsize + 40
+%define		tmpHeight		esp + 12
+%define		yInverse		esp + 16
+%define		xInverse		esp + 20
+%define		dstStep			esp + 24
+	sub		esp,			localsize
+
+	pxor	xmm0,	xmm0
+	mov		edx,	65535
+	mov		eax,	[uiScaleX]
+	and		eax,	edx
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	65535
+	movd	xmm1,		eax						; uinc(uiScaleX mod 65536)
+	movd	xmm2,		ebx						; -uinc
+	psllq	xmm1,		32
+	por		xmm1,		xmm2					; 0 uinc 0 -uinc
+	pshuflw	xmm7,		xmm1,	10001000b		; xmm7: uinc -uinc uinc -uinc
+
+	mov		eax,	[uiScaleY]
+	and		eax,	32767
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	32767
+	movd	xmm6,		eax						; vinc(uiScaleY mod 32767)
+	movd	xmm2,		ebx						; -vinc
+	psllq	xmm6,		32
+	por		xmm6,		xmm2					; 0 vinc 0 -vinc
+	pshuflw	xmm6,		xmm6,	10100000b		; xmm6: vinc vinc -vinc -vinc
+
+	mov		edx,		80007fffh				; 32768 32767
+	movd	xmm5,		edx
+	pshuflw	xmm5,		xmm5,		01000100b	; 32768 32767 32768 32767
+	mov		ebx,		16384
+
+
+FAST_DOWNSAMPLE:
+
+	mov		eax,			[dwDstHeight]
+	mov		edi,			[pDstData]
+	mov		edx,			[dwDstStride]
+	mov		ecx,			[dwDstWidth]
+	sub		edx,			ecx
+	mov		[dstStep],	edx				; stride - width
+	dec		eax
+	mov		[tmpHeight],	eax
+	mov		eax,		16384
+	mov		[yInverse],		eax
+
+	pshuflw	xmm4,		xmm5,	01010000b
+	psrlw	xmm4,		1				; initial v to 16384 16384 16383 16383
+
+FAST_HEIGHT:
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15
+	mul		dword [dwSrcStride]
+	add		esi,	eax					; get current row address
+	mov		ebp,	esi
+	add		ebp,	[dwSrcStride]
+
+	mov		eax,		32768
+	mov		[xInverse],		eax
+	mov		ecx,			[dwDstWidth]
+	dec		ecx
+
+	movdqa	xmm3,		xmm5			; initial u to 32768 32767 32768 32767
+
+FAST_WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		16
+
+	movd	xmm1,		[esi+eax]		; xxxxxxba
+	movd	xmm2,		[ebp+eax]		; xxxxxxdc
+	punpcklwd	xmm1,	xmm2			; xxxxdcba
+	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
+
+	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
+	pmulhuw	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
+	pmaddwd		xmm2,	xmm1
+	pshufd	xmm1,	xmm2,	00000001b
+	paddd	xmm2,	xmm1
+	movd	xmm1,	ebx
+	paddd	xmm2,	xmm1
+	psrld	xmm2,	15
+
+	packuswb	xmm2,	xmm0
+	movd	eax,	xmm2
+	mov		[edi],	al
+	inc		edi
+
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax
+
+	paddw	xmm3,		xmm7			; inc u
+
+	loop	FAST_WIDTH
+
+FAST_WIDTH_END:
+	mov		eax,		[xInverse]
+	shr		eax,		16
+	mov		cl,			[esi+eax]
+	mov		[edi],		cl
+	inc		edi
+
+	mov		eax,		[uiScaleY]
+	add		[yInverse],	eax
+	add		edi,		[dstStep]
+
+	paddw	xmm4,	xmm6				; inc v
+	psllw	xmm4,	1
+	psrlw	xmm4,	1
+
+	dec		dword [tmpHeight]
+	jg		FAST_HEIGHT
+
+
+FAST_LAST_ROW:
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15
+	mul		dword [dwSrcStride]
+	add		esi,	eax					; get current row address
+
+	mov		eax,		32768
+	mov		[xInverse],		eax
+	mov		ecx,			[dwDstWidth]
+
+FAST_LAST_ROW_WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		16
+
+	mov		al,			[esi+eax]
+	mov		[edi],	al
+	inc		edi
+
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax
+
+	loop	FAST_LAST_ROW_WIDTH
+
+FAST_LAST_ROW_END:
+
+	add		esp,			localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		pushsize
+%undef		localsize
+%undef		pSrcData
+%undef		dwSrcWidth
+%undef		dwSrcHeight
+%undef		dwSrcStride
+%undef		pDstData
+%undef		dwDstWidth
+%undef		dwDstHeight
+%undef		dwDstStride
+%undef		scale
+%undef		uiScaleX
+%undef		uiScaleY
+%undef		tmpHeight
+%undef		yInverse
+%undef		xInverse
+%undef		dstStep
 	ret
\ No newline at end of file
--- a/processing/src/asm/intra_pred.asm
+++ b/processing/src/asm/intra_pred.asm
@@ -1,145 +1,145 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  intra_pred.asm
-;*
-;*  Abstract
-;*      sse2 function for intra predict operations
-;*
-;*  History
-;*      18/09/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "../../src/asm/asm_inc.asm"
-
-BITS 32
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata data
-%else
-SECTION .rodata align=16
-%endif
-
-
-align 16
-mmx_01bytes:		times 16	db 1
-
-;***********************************************************************
-; macros
-;***********************************************************************
-%macro  COPY_16_TIMES 2
-		movdqa		%2,	[%1-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
-%endmacro
-
-%macro  COPY_16_TIMESS 3
-		movdqa		%2,	[%1+%3-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
-%endmacro
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-;***********************************************************************
-; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-
-%macro SSE2_PRED_H_16X16_TWO_LINE 1
-    lea     eax,	[eax+ecx*2]
-    
-    COPY_16_TIMES eax,	xmm0
-    movdqa  [edx+%1],	xmm0
-    COPY_16_TIMESS eax,	xmm0,	ecx
-    movdqa  [edx+%1+0x10],	xmm0
-%endmacro
-
-WELS_EXTERN WelsI16x16LumaPredH_sse2
-WelsI16x16LumaPredH_sse2:
-    mov     edx, [esp+4]    ; pred
-    mov     eax, [esp+8]	; pRef
-    mov     ecx, [esp+12]   ; stride
-    
-    COPY_16_TIMES eax,	xmm0
-    movdqa  [edx],		xmm0
-    COPY_16_TIMESS eax,	xmm0,	ecx
-    movdqa  [edx+0x10],	xmm0
-    
-	SSE2_PRED_H_16X16_TWO_LINE   0x20 
-	SSE2_PRED_H_16X16_TWO_LINE   0x40
-	SSE2_PRED_H_16X16_TWO_LINE   0x60
-	SSE2_PRED_H_16X16_TWO_LINE   0x80
-	SSE2_PRED_H_16X16_TWO_LINE   0xa0
-	SSE2_PRED_H_16X16_TWO_LINE   0xc0
-	SSE2_PRED_H_16X16_TWO_LINE   0xe0
-   
-    ret
-    
-;***********************************************************************
-; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredV_sse2
-WelsI16x16LumaPredV_sse2:
-    mov     edx, [esp+4]    ; pred
-    mov     eax, [esp+8]	; pRef
-    mov     ecx, [esp+12]   ; stride
-    
-    sub     eax, ecx
-    movdqa  xmm0, [eax]
-    
-    movdqa  [edx], xmm0
-    movdqa  [edx+10h], xmm0
-    movdqa  [edx+20h], xmm0
-    movdqa  [edx+30h], xmm0
-    movdqa  [edx+40h], xmm0
-    movdqa  [edx+50h], xmm0
-    movdqa  [edx+60h], xmm0
-    movdqa  [edx+70h], xmm0
-    movdqa  [edx+80h], xmm0
-    movdqa  [edx+90h], xmm0
-    movdqa  [edx+160], xmm0 
-	movdqa  [edx+176], xmm0
-    movdqa  [edx+192], xmm0
-    movdqa  [edx+208], xmm0
-    movdqa  [edx+224], xmm0
-    movdqa  [edx+240], xmm0
-    
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  intra_pred.asm
+;*
+;*  Abstract
+;*      sse2 function for intra predict operations
+;*
+;*  History
+;*      18/09/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "../../src/asm/asm_inc.asm"
+
+BITS 32
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata data
+%else
+SECTION .rodata align=16
+%endif
+
+
+align 16
+mmx_01bytes:		times 16	db 1
+
+;***********************************************************************
+; macros
+;***********************************************************************
+%macro  COPY_16_TIMES 2
+		movdqa		%2,	[%1-16]
+		psrldq		%2,	15
+		pmuludq		%2,	[mmx_01bytes]
+		pshufd		%2,	%2, 0
+%endmacro
+
+%macro  COPY_16_TIMESS 3
+		movdqa		%2,	[%1+%3-16]
+		psrldq		%2,	15
+		pmuludq		%2,	[mmx_01bytes]
+		pshufd		%2,	%2, 0
+%endmacro
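+
+; both macros broadcast the byte at [address-1] into all 16 lanes: psrldq 15
+; isolates that byte in lane 0, pmuludq by 01010101h replicates it through
+; the low dword, and pshufd 0 splats that dword across the register.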
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+;***********************************************************************
+; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+
+%macro SSE2_PRED_H_16X16_TWO_LINE 1
+    lea     eax,	[eax+ecx*2]
+
+    COPY_16_TIMES eax,	xmm0
+    movdqa  [edx+%1],	xmm0
+    COPY_16_TIMESS eax,	xmm0,	ecx
+    movdqa  [edx+%1+0x10],	xmm0
+%endmacro
+
+WELS_EXTERN WelsI16x16LumaPredH_sse2
+WelsI16x16LumaPredH_sse2:
+    mov     edx, [esp+4]    ; pred
+    mov     eax, [esp+8]	; pRef
+    mov     ecx, [esp+12]   ; stride
+
+    COPY_16_TIMES eax,	xmm0
+    movdqa  [edx],		xmm0
+    COPY_16_TIMESS eax,	xmm0,	ecx
+    movdqa  [edx+0x10],	xmm0
+
+	SSE2_PRED_H_16X16_TWO_LINE   0x20
+	SSE2_PRED_H_16X16_TWO_LINE   0x40
+	SSE2_PRED_H_16X16_TWO_LINE   0x60
+	SSE2_PRED_H_16X16_TWO_LINE   0x80
+	SSE2_PRED_H_16X16_TWO_LINE   0xa0
+	SSE2_PRED_H_16X16_TWO_LINE   0xc0
+	SSE2_PRED_H_16X16_TWO_LINE   0xe0
+
+    ret
+
+;***********************************************************************
+; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredV_sse2
+WelsI16x16LumaPredV_sse2:
+    mov     edx, [esp+4]    ; pred
+    mov     eax, [esp+8]	; pRef
+    mov     ecx, [esp+12]   ; stride
+
+    sub     eax, ecx
+    movdqa  xmm0, [eax]
+
+    movdqa  [edx], xmm0
+    movdqa  [edx+10h], xmm0
+    movdqa  [edx+20h], xmm0
+    movdqa  [edx+30h], xmm0
+    movdqa  [edx+40h], xmm0
+    movdqa  [edx+50h], xmm0
+    movdqa  [edx+60h], xmm0
+    movdqa  [edx+70h], xmm0
+    movdqa  [edx+80h], xmm0
+    movdqa  [edx+90h], xmm0
+    movdqa  [edx+0a0h], xmm0
+    movdqa  [edx+0b0h], xmm0
+    movdqa  [edx+0c0h], xmm0
+    movdqa  [edx+0d0h], xmm0
+    movdqa  [edx+0e0h], xmm0
+    movdqa  [edx+0f0h], xmm0
+
     ret
\ No newline at end of file
--- a/processing/src/asm/sad.asm
+++ b/processing/src/asm/sad.asm
@@ -1,79 +1,79 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  pixel_sse2.asm
-;*
-;*  Abstract
-;*      WelsSampleSad8x8_sse21
-;*
-;*  History
-;*      8/5/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-%macro SAD_8x4 0
-	movq   xmm0,   [eax]
-	movq   xmm1,   [eax+ebx]
-	lea    eax,    [eax+2*ebx]
-	movhps xmm0,   [eax]
-	movhps xmm1,   [eax+ebx]
-
-	movq   xmm2,   [ecx]
-	movq   xmm3,   [ecx+edx]
-	lea    ecx,    [ecx+2*edx]
-	movhps xmm2,   [ecx]
-	movhps xmm3,   [ecx+edx]
-	psadbw xmm0,   xmm2
-	psadbw xmm1,   xmm3
-	paddw  xmm6,   xmm0
-	paddw  xmm6,   xmm1
-%endmacro
-
-
-  
-%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
-and    %1,  0x1f|(%3>>1)
-cmp    %1,  (32-%2)|(%3>>1)
-%endmacro
-
-
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  pixel_sse2.asm
+;*
+;*  Abstract
+;*      WelsSampleSad8x8_sse21
+;*
+;*  History
+;*      8/5/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+%macro SAD_8x4 0
+	movq   xmm0,   [eax]
+	movq   xmm1,   [eax+ebx]
+	lea    eax,    [eax+2*ebx]
+	movhps xmm0,   [eax]
+	movhps xmm1,   [eax+ebx]
+
+	movq   xmm2,   [ecx]
+	movq   xmm3,   [ecx+edx]
+	lea    ecx,    [ecx+2*edx]
+	movhps xmm2,   [ecx]
+	movhps xmm3,   [ecx+edx]
+	psadbw xmm0,   xmm2
+	psadbw xmm1,   xmm3
+	paddw  xmm6,   xmm0
+	paddw  xmm6,   xmm1
+%endmacro
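+
+; SAD_8x4 loads four 8-byte rows from [eax] (stride ebx) and [ecx] (stride
+; edx), two rows per register via movq/movhps, and accumulates the psadbw
+; results into xmm6.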
+
+
+
+%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
+and    %1,  0x1f|(%3>>1)
+cmp    %1,  (32-%2)|(%3>>1)
+%endmacro
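+
+; sets flags for the caller to test whether a %2-byte unaligned access at %1
+; appears to straddle a %3-byte cacheline (the %3>>1 bit is folded into both
+; operands so the compare also encodes the cacheline size).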
+
+
 %macro SSE2_GetSad8x4 0
 	movq   xmm0,   [eax]
 	movq   xmm1,   [eax+ebx]
@@ -90,12 +90,12 @@
 	psadbw xmm1,   xmm3
 	paddw  xmm6,   xmm0
 	paddw  xmm6,   xmm1
-%endmacro
+%endmacro
 
 
-;***********************************************************************
-; Code
-;***********************************************************************
+;***********************************************************************
+; Code
+;***********************************************************************
 SECTION .text
 
 WELS_EXTERN WelsSampleSad8x8_sse21
@@ -108,15 +108,15 @@
 	push   edi
 	mov    eax,    [esp+12]
 	mov    ebx,    [esp+16]
-    
+
     pxor   xmm7,   xmm7
-    
+
     mov    edi,    ecx
     and    edi,    0x07
-    sub    ecx,    edi   
+    sub    ecx,    edi
     mov    edx,    8
     sub    edx,    edi
-    
+
     shl    edi,    3
     shl    edx,    3
     movd   xmm5,   edi
@@ -124,10 +124,10 @@
 	mov    edi,    8
 	add    edi,    ecx
     mov    edx,    [esp+24]
-    
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -135,17 +135,17 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -153,7 +153,7 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
 
@@ -160,10 +160,10 @@
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -171,17 +171,17 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -189,10 +189,10 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
     movhlps    xmm0, xmm7
 	paddw      xmm0, xmm7
 	movd       eax,  xmm0
@@ -202,12 +202,12 @@
     push   ebx
     mov    eax,    [esp+8]
 	mov    ebx,    [esp+12]
-	mov    edx,    [esp+20]    
+	mov    edx,    [esp+20]
 	pxor   xmm6,   xmm6
 	SSE2_GetSad8x4
     lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
-    SSE2_GetSad8x4    
+    SSE2_GetSad8x4
     movhlps    xmm0, xmm6
 	paddw      xmm0, xmm6
 	movd       eax,  xmm0
--- a/processing/src/asm/vaa.asm
+++ b/processing/src/asm/vaa.asm
@@ -1,1589 +1,1589 @@
-;*!
-;* \copy
-;*     Copyright (c)  2010-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*	vaa.asm
-;*
-;*	Abstract
-;*      sse2 for pVaa routines
-;*
-;*  History
-;*      04/14/2010	Created
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;%macro SUM_SSE2	4	; dst, pSrc, zero, pack1_8x2
-;	movdqa %1, %2
-;	punpcklbw %1, %3
-;	punpckhbw %2, %3
-;	paddw %1, %2
-;	pmaddwd %1, %4
-;	pshufd %2, %1, 04Eh	; 01001110 B
-;	paddd %1, %2
-;	pshufd %2, %1, 0B1h	; 10110001 B
-;	paddd %1, %2
-;%endmacro	; END OF SUM_SSE2
-
-; by comparison, this outperforms the phaddw (SSSE3) approach
-%macro SUM_WORD_8x2_SSE2	2	; dst(pSrc), tmp
-	; @sum_8x2 begin
-	pshufd %2, %1, 04Eh	; 01001110 B
-	paddw %1, %2
-	pshuflw %2, %1, 04Eh	; 01001110 B
-	paddw %1, %2
-	pshuflw %2, %1, 0B1h	; 10110001 B
-	paddw %1, %2
-	; end of @sum_8x2
-%endmacro	; END of SUM_WORD_8x2_SSE2
-
-%macro SUM_SQR_SSE2	3	; dst, pSrc, zero
-	movdqa %1, %2
-	punpcklbw %1, %3
-	punpckhbw %2, %3
-	pmaddwd %1, %1
-	pmaddwd %2, %2
-	paddd %1, %2
-	pshufd %2, %1, 04Eh	; 01001110 B
-	paddd %1, %2
-	pshufd %2, %1, 0B1h	; 10110001 B
-	paddd %1, %2
-%endmacro	; END OF SUM_SQR_SSE2
-
-%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
-	movdqa %1, [esi    ]	; line 0
-	movdqa %2, [esi+ecx]	; line 1
-	movdqa %3, %1
-	punpcklbw %1, xmm7
-	punpckhbw %3, xmm7
-	movdqa %4, %2
-	punpcklbw %4, xmm7
-	punpckhbw %2, xmm7
-	paddw %1, %4
-	paddw %2, %3
-	movdqa %3, [esi+ebx]	; line 2
-	movdqa %4, [esi+edx]	; line 3
-	movdqa %5, %3
-	punpcklbw %3, xmm7
-	punpckhbw %5, xmm7
-	movdqa %6, %4
-	punpcklbw %6, xmm7
-	punpckhbw %4, xmm7
-	paddw %3, %6
-	paddw %4, %5
-	paddw %1, %3	; block 0, 1
-	paddw %2, %4	; block 2, 3
-	pshufd %3, %1, 0B1h
-	pshufd %4, %2, 0B1h
-	paddw %1, %3
-	paddw %2, %4
-	movdqa %3, %1
-	movdqa %4, %2
-	pshuflw %5, %1, 0B1h
-	pshufhw %6, %3, 0B1h
-	paddw %1, %5
-	paddw %3, %6
-	pshuflw %5, %2, 0B1h
-	pshufhw %6, %4, 0B1h
-	paddw %2, %5
-	paddw %4, %6
-	punpcklwd %1, %2
-	punpckhwd %3, %4
-	punpcklwd %1, %3
-	psraw %1, $4
-%endmacro
-
-%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
-	movdqa %1, [esi    ]	; line 0
-	movdqa %2, [esi+ecx]	; line 1
-	movdqa %3, %1
-	punpcklbw %1, xmm7
-	punpckhbw %3, xmm7
-	movdqa %4, %2
-	punpcklbw %4, xmm7
-	punpckhbw %2, xmm7
-	paddw %1, %4
-	paddw %2, %3
-	movdqa %3, [esi+ebx]	; line 2
-	movdqa %4, [esi+edx]	; line 3
-	movdqa %5, %3
-	punpcklbw %3, xmm7
-	punpckhbw %5, xmm7
-	movdqa %6, %4
-	punpcklbw %6, xmm7
-	punpckhbw %4, xmm7
-	paddw %3, %6
-	paddw %4, %5
-	paddw %1, %3	; block 0, 1
-	paddw %2, %4	; block 2, 3
-	phaddw %1, %2	; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
-	phaddw %1, xmm7	; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
-	psraw %1, $4
-%endmacro
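-
-; both VAA_AVG_BLOCK macros collapse a 16x4 strip at [esi] into the four
-; 4x4 sub-block averages (block sum >> 4), returned in the low words of %1.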
-
-%macro WELS_SAD_16x2_SSE2  0
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	movdqa	xmm3,	[esi+ebx]
-	movdqa	xmm4,	[edi+ebx]
-	psadbw	xmm1,	xmm2
-	psadbw	xmm3,	xmm4
-	paddd	xmm6,	xmm1
-	paddd	xmm6,	xmm3
-	lea		esi,	[esi+ebx*2]
-	lea		edi,	[edi+ebx*2]	
-%endmacro
-
-%macro	WELS_SAD_SUM_SQSUM_16x1_SSE2 0
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	movdqa	xmm3,	xmm1
-	psadbw	xmm3,	xmm2
-	paddd	xmm6,	xmm3
-	
-	movdqa	xmm3,	xmm1
-	psadbw	xmm3,	xmm0
-	paddd	xmm5,	xmm3
-	
-	movdqa		xmm2,	xmm1
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm2,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm2,	xmm2
-	paddd		xmm4,	xmm1
-	paddd		xmm4,	xmm2
-	
-	add		esi,	ebx
-	add		edi,	ebx
-%endmacro
-
-%macro	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 0
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	movdqa	xmm3,	xmm1
-	psadbw	xmm3,	xmm2
-	paddd	xmm7,	xmm3	; sad
-	
-	movdqa	xmm3,	xmm1
-	pmaxub	xmm3,	xmm2
-	pminub	xmm2,	xmm1
-	psubb	xmm3,	xmm2	; diff
-	
-	movdqa	xmm2,	xmm1
-	psadbw	xmm2,	xmm0
-	paddd	xmm6,	xmm2	; sum
-	
-	movdqa		xmm2,	xmm1
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm2,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm2,	xmm2
-	paddd		xmm5,	xmm1
-	paddd		xmm5,	xmm2	; sqsum
-	
-	movdqa		xmm1,	xmm3
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm3,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm3,	xmm3
-	paddd		xmm4,	xmm1
-	paddd		xmm4,	xmm3	; sqdiff
-	
-	add		esi,	ebx
-	add		edi,	ebx
-%endmacro
-
-%macro	WELS_SAD_SD_MAD_16x1_SSE2	4
-%define sad_reg			%1
-%define	sum_cur_reg		%2
-%define sum_ref_reg		%3
-%define	mad_reg			%4
-	movdqa	xmm1,		[esi]
-	movdqa	xmm2,		[edi]
-	movdqa	xmm3,		xmm1
-	psadbw	xmm3,		xmm0
-	paddd	sum_cur_reg,			xmm3	; sum_cur
-	movdqa	xmm3,		xmm2
-	psadbw	xmm3,		xmm0
-	paddd	sum_ref_reg,			xmm3	; sum_ref
-	
-	movdqa	xmm3,		xmm1
-	pmaxub	xmm3,		xmm2
-	pminub	xmm2,		xmm1
-	psubb	xmm3,		xmm2	; abs diff
-	pmaxub	mad_reg,	xmm3	; max abs diff
-	
-	psadbw	xmm3,		xmm0
-	paddd	sad_reg,	xmm3	; sad
-	
-	add			esi,		ebx
-	add			edi,		ebx
-%endmacro
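-
-; per 16x1 row: accumulates the SAD into sad_reg, the plain byte sums of cur
-; and ref into sum_cur_reg/sum_ref_reg (for the signed sum difference), and
-; the per-pixel maximum absolute difference into mad_reg.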
-
-
-%macro	WELS_MAX_REG_SSE2	1	; xmm1, xmm2, xmm3 can be used
-%define max_reg  %1
-	movdqa	xmm1,		max_reg
-	psrldq	xmm1,		4
-	pmaxub	max_reg,	xmm1
-	movdqa	xmm1,		max_reg
-	psrldq	xmm1,		2
-	pmaxub	max_reg,	xmm1
-	movdqa	xmm1,		max_reg
-	psrldq	xmm1,		1
-	pmaxub	max_reg,	xmm1
-%endmacro
-
-%macro	WELS_SAD_BGD_SQDIFF_16x1_SSE2	4
-%define sad_reg		%1
-%define	sum_reg		%2
-%define mad_reg		%3
-%define sqdiff_reg	%4
-	movdqa		xmm1,		[esi]
-	movdqa		xmm2,		xmm1
-	movdqa		xmm3,		xmm1
-	punpcklbw	xmm2,		xmm0
-	punpckhbw	xmm3,		xmm0
-	pmaddwd		xmm2,		xmm2
-	pmaddwd		xmm3,		xmm3
-	paddd		xmm2,		xmm3
-	movdqa		xmm3,		xmm2
-	psllq		xmm2,		32
-	psrlq		xmm3,		32
-	psllq		xmm3,		32
-	paddd		xmm2,		xmm3
-	paddd		sad_reg,	xmm2		; sqsum
-	
-	movdqa	xmm2,		[edi]
-	movdqa	xmm3,		xmm1
-	psadbw	xmm3,		xmm0
-	paddd	sum_reg,			xmm3	; sum_cur
-	movdqa	xmm3,		xmm2
-	psadbw	xmm3,		xmm0
-	pslldq	xmm3,		4
-	paddd	sum_reg,			xmm3	; sum_ref
-	
-	movdqa	xmm3,		xmm1
-	pmaxub	xmm3,		xmm2
-	pminub	xmm2,		xmm1
-	psubb	xmm3,		xmm2	; abs diff
-	pmaxub	mad_reg,	xmm3	; max abs diff
-	
-	movdqa	xmm1,		xmm3
-	psadbw	xmm3,		xmm0
-	paddd	sad_reg,	xmm3	; sad
-
-	movdqa		xmm3,	xmm1
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm3,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm3,	xmm3
-	paddd		sqdiff_reg,	xmm1
-	paddd		sqdiff_reg,	xmm3	; sqdiff
-	
-	add		esi,	ebx
-	add		edi,	ebx
-%endmacro
-
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-;SECTION .rodata align=16
-
-;ALIGN 16
-;pack1_8x2:
-;	dw 1, 1, 1, 1, 1, 1, 1, 1
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN rc_sad_frame_sse2
-;***********************************************************************
-;	uint32_t rc_sad_frame_sse2(	uint8_t *ref_orig, uint8_t *cur_orig, const int mb_width, const int iPicHeight, const int iPicStride );
-;***********************************************************************
-ALIGN 16
-rc_sad_frame_sse2:
-	push esi
-	push edi
-	push ebp
-	push ebx
-	push edx
-
-	mov esi, [esp+24]
-	mov edi, [esp+28]
-	mov ebx, [esp+32]
-	mov ecx, [esp+36]
-	mov edx, [esp+40]
-	pxor xmm0, xmm0	
-.hloop:
-	mov eax, ebx
-	mov ebp, $0
-.wloop:
-	movdqa xmm1, [esi+ebp]
-	movdqa xmm2, [edi+ebp]
-	psadbw xmm1, xmm2
-	pshufd xmm2, xmm1, 0f6h	; 11110110 B ; movhlps for float
-	paddd xmm1, xmm2
-	paddd xmm0, xmm1	
-	add ebp, 010h
-	dec eax
-	jnz near .wloop
-	lea esi, [esi+edx]
-	lea edi, [edi+edx]
-	dec ecx
-	jnz near .hloop
-
-	movd eax, xmm0
-	pop edx
-	pop ebx
-	pop ebp
-	pop edi
-	pop esi
-	ret
-
-
-WELS_EXTERN SampleVariance16x16_sse2
-;***********************************************************************
-;   void SampleVariance16x16_sse2(	uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
-;***********************************************************************
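-; with d = |y_ref - y_src| per pixel, this stores
-;   uiMotionIndex  ~= (sum(d*d) >> 8) - ((sum(d) >> 8))^2    (variance of d)
-;   uiTextureIndex ~= the same statistic computed over y_src
-; into the two 16-bit fields of *pMotionTexture.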
-ALIGN 16
-SampleVariance16x16_sse2:	
-	push esi
-	push edi
-	push ebx
-	
-	sub esp, 16
-	%define SUM			[esp]
-	%define SUM_CUR		[esp+4]
-	%define SQR			[esp+8]
-	%define SQR_CUR		[esp+12]
-	%define PUSH_SIZE	28	; 12 + 16	
-
-	mov edi, [esp+PUSH_SIZE+4]	; y_ref
-	mov edx, [esp+PUSH_SIZE+8]	; y_ref_stride	
-	mov esi, [esp+PUSH_SIZE+12]	; y_src
-	mov eax, [esp+PUSH_SIZE+16]	; y_src_stride
-	mov ecx, 010h				; height = 16
-
-	pxor xmm7, xmm7
-	movdqu SUM, xmm7
-
-.hloops:
-	movdqa xmm0, [edi]		; y_ref
-	movdqa xmm1, [esi]		; y_src
-	movdqa xmm2, xmm0		; store first for future process
-	movdqa xmm3, xmm1
-	; sum += diff;
-	movdqa xmm4, xmm0
-	psadbw xmm4, xmm1		; 2 parts, [0,..,15], [64,..,79]
-	; to be continued for sum
-	pshufd xmm5, xmm4, 0C6h	; 11000110 B
-	paddw xmm4, xmm5
-	movd ebx, xmm4
-	add SUM, ebx
-
-	; sqr += diff * diff;
-	pmaxub xmm0, xmm1
-	pminub xmm1, xmm2
-	psubb xmm0, xmm1				; diff	
-	SUM_SQR_SSE2 xmm1, xmm0, xmm7	; dst, pSrc, zero
-	movd ebx, xmm1
-	add SQR, ebx
-
-	; sum_cur += y_src[x];
-	movdqa xmm0, xmm3		; cur_orig
-	movdqa xmm1, xmm0
-	punpcklbw xmm0, xmm7
-	punpckhbw xmm1, xmm7
-	paddw xmm0, xmm1		; 8x2
-	SUM_WORD_8x2_SSE2 xmm0, xmm1	
-	movd ebx, xmm0
-	and ebx, 0ffffh
-	add SUM_CUR, ebx
-
-	; sqr_cur += y_src[x] * y_src[x];
-	SUM_SQR_SSE2 xmm0, xmm3, xmm7	; dst, pSrc, zero
-	movd ebx, xmm0
-	add SQR_CUR, ebx
-	
-	lea edi, [edi+edx]
-	lea esi, [esi+eax]
-	dec ecx
-	jnz near .hloops
-	
-	mov ebx, 0
-	mov bx, word SUM
-	sar ebx, 8
-	imul ebx, ebx
-	mov ecx, SQR
-	sar ecx, 8
-	sub ecx, ebx
-	mov edi, [esp+PUSH_SIZE+20]	; pMotionTexture
-	mov [edi], cx				; to store uiMotionIndex
-	mov ebx, 0
-	mov bx, word SUM_CUR
-	sar ebx, 8
-	imul ebx, ebx
-	mov ecx, SQR_CUR
-	sar ecx, 8
-	sub ecx, ebx
-	mov [edi+2], cx				; to store uiTextureIndex
-	
-	%undef SUM
-	%undef SUM_CUR
-	%undef SQR
-	%undef SQR_CUR
-	%undef PUSH_SIZE
-
-	add esp, 16	
-	pop ebx
-	pop edi
-	pop esi	
-
-	ret
-
-; , 6/7/2010
-
-%ifndef NO_DYNAMIC_VP
-WELS_EXTERN AnalysisVaaInfoIntra_sse2
-;***********************************************************************
-;	int32_t AnalysisVaaInfoIntra_sse2(	uint8_t *pDataY, const int32_t linesize );
-;***********************************************************************
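-; gathers the sixteen 4x4-block means of a 16x16 area (VAA_AVG_BLOCK_SSE2),
-; then returns sum(m_i^2) - (sum(m_i))^2/16, i.e. 16x the variance of the
-; block means.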
-ALIGN 16
-AnalysisVaaInfoIntra_sse2:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov ebp, esp
-	and ebp, 0fh
-	sub esp, ebp
-	sub esp, 32	
-	%define PUSH_SIZE	52	; 20 + 32
-
-	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
-	mov ecx, [esp+ebp+PUSH_SIZE+8]	; linesize
-
-	mov ebx, ecx
-	sal ebx, $1			; linesize x 2 [ebx]
-	mov edx, ebx
-	add edx, ecx		; linesize x 3 [edx]
-	mov eax, ebx
-	sal eax, $1			; linesize x 4 [eax]
-	
-	pxor xmm7, xmm7
-	
-	; loops
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0	
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+8], xmm0	
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0	
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+24], xmm0
-		
-	movdqa xmm0, [esp]		; block 0~7
-	movdqa xmm1, [esp+16]	; block 8~15
-	movdqa xmm2, xmm0
-	paddw xmm0, xmm1
-	SUM_WORD_8x2_SSE2 xmm0, xmm3
-	
-	pmullw xmm1, xmm1
-	pmullw xmm2, xmm2
-	movdqa xmm3, xmm1
-	movdqa xmm4, xmm2
-	punpcklwd xmm1, xmm7
-	punpckhwd xmm3, xmm7
-	punpcklwd xmm2, xmm7
-	punpckhwd xmm4, xmm7
-	paddd xmm1, xmm2
-	paddd xmm3, xmm4
-	paddd xmm1, xmm3
-	pshufd xmm2, xmm1, 01Bh
-	paddd xmm1, xmm2
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
-	
-	movd ebx, xmm0
-	and ebx, 0ffffh		; effective low word truncated
-	mov ecx, ebx
-	imul ebx, ecx
-	sar ebx, $4
-	movd eax, xmm1
-	sub eax, ebx
-	
-	%undef PUSH_SIZE
-	add esp, 32
-	add esp, ebp
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-        
-WELS_EXTERN AnalysisVaaInfoIntra_ssse3
-;***********************************************************************
-;	int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t linesize );
-;***********************************************************************
-ALIGN 16
-AnalysisVaaInfoIntra_ssse3:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov ebp, esp
-	and ebp, 0fh
-	sub esp, ebp
-	sub esp, 32	
-	%define PUSH_SIZE	52	; 20 + 32
-
-	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
-	mov ecx, [esp+ebp+PUSH_SIZE+8]	; linesize
-
-	mov ebx, ecx
-	sal ebx, $1			; linesize x 2 [ebx]
-	mov edx, ebx
-	add edx, ecx		; linesize x 3 [edx]
-	mov eax, ebx
-	sal eax, $1			; linesize x 4 [eax]
-	
-	pxor xmm7, xmm7
-	
-	; loops
-	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0	
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
-	movq [esp+8], xmm1	
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0	
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
-	movq [esp+24], xmm1
-		
-	movdqa xmm0, [esp]		; block 0~7
-	movdqa xmm1, [esp+16]	; block 8~15
-	movdqa xmm2, xmm0
-	paddw xmm0, xmm1
-	SUM_WORD_8x2_SSE2 xmm0, xmm3	; performs better than the phaddw version
-
-	pmullw xmm1, xmm1
-	pmullw xmm2, xmm2
-	movdqa xmm3, xmm1
-	movdqa xmm4, xmm2
-	punpcklwd xmm1, xmm7
-	punpckhwd xmm3, xmm7
-	punpcklwd xmm2, xmm7
-	punpckhwd xmm4, xmm7
-	paddd xmm1, xmm2
-	paddd xmm3, xmm4
-	paddd xmm1, xmm3
-	pshufd xmm2, xmm1, 01Bh
-	paddd xmm1, xmm2
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
-	
-	movd ebx, xmm0
-	and ebx, 0ffffh		; effective low word truncated
-	mov ecx, ebx
-	imul ebx, ecx
-	sar ebx, $4
-	movd eax, xmm1
-	sub eax, ebx
-	
-	%undef PUSH_SIZE
-	add esp, 32
-	add esp, ebp
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-%endif
-	
-	
-
-WELS_EXTERN abs_difference_mbrow_sse2
-;*************************************************************************************************************
-;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride, 
-;								 int32_t gom_pixel_num, int32_t *pSum)
-;*************************************************************************************************************
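-; accumulates sum|ref - cur| over one macroblock row (gom_pixel_num bytes
-; wide, 16 lines high) and adds the total to *pSum.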
-ALIGN 16
-abs_difference_mbrow_sse2:
-%define		ref_orig			esp + pushsize + 4
-%define		cur_orig			esp + pushsize + 8
-%define		iPicStride			esp + pushsize + 12
-%define		gom_pixel_num		esp + pushsize + 16
-%define		pSum				esp + pushsize + 20
-%define		pushsize	12
-	push	esi
-	push	edi
-	push	ebx
-	mov		esi,	[ref_orig]
-	mov		edi,	[cur_orig]
-	mov		ebx,	[iPicStride]
-	mov		eax,	[gom_pixel_num]
-	mov		ecx,	16					;MB_WIDTH_LUMA
-	pxor	xmm0,	xmm0
-mb_width_loop_p:
-	mov		edx,	esi
-	add		edx,	eax			; end address
-gom_row_loop_p:
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	psadbw	xmm1,	xmm2
-	paddd	xmm0,	xmm1
-	add		esi,	16
-	add		edi,	16
-	cmp		esi,	edx
-	jl		gom_row_loop_p
-	
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	ebx
-	add		edi,	ebx
-	loop	mb_width_loop_p
-	
-	movdqa	xmm1,	xmm0
-	psrldq	xmm1,	8
-	paddd	xmm1,	xmm0
-	movd	eax,	xmm1
-	mov		edx,	[pSum]	; pSum
-	add		[edx],	eax
-
-%undef		ref_orig
-%undef		cur_orig
-%undef		iPicStride
-%undef		gom_pixel_num
-%undef		pSum
-%undef		pushsize	
-	pop		ebx
-	pop		edi
-	pop		esi
-	ret
-
-
-
-
-WELS_EXTERN sum_sqrsum_mbrow_sse2
-;*************************************************************************************************************
-;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride, 
-;							 int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum)
-;*************************************************************************************************************
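-; same walk as abs_difference_mbrow_sse2, accumulating the sum and sum of
-; squares of the source pixels into *pSum and *pSqrSum.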
-ALIGN 16
-sum_sqrsum_mbrow_sse2:
-%define		cur_orig			esp + pushsize + 4
-%define		iPicStride			esp + pushsize + 8
-%define		gom_pixel_num		esp + pushsize + 12
-%define		pSum				esp + pushsize + 16
-%define		pSqrSum				esp + pushsize + 20
-%define		pushsize			8
-	push		esi
-	push		ebx
-	mov			esi,	[cur_orig]
-	mov			eax,	[gom_pixel_num]
-	mov			ebx,	[iPicStride]
-	mov			ecx,	16					;MB_WIDTH_LUMA
-	pxor		xmm0,	xmm0				; zero
-	pxor		xmm1,	xmm1				; sum
-	pxor		xmm2,	xmm2				; sqr sum
-mb_width_loop_i:
-	mov			edx,	esi
-	add			edx,	eax			; end address
-gom_row_loop_i:
-	movdqa		xmm3,	[esi]
-	movdqa		xmm4,	xmm3
-	psadbw		xmm4,	xmm0
-	paddd		xmm1,	xmm4
-	movdqa		xmm4,	xmm3
-	punpcklbw	xmm4,	xmm0
-	punpckhbw	xmm3,	xmm0
-	pmaddwd		xmm4,	xmm4
-	pmaddwd		xmm3,	xmm3
-	paddd		xmm2,	xmm3
-	paddd		xmm2,	xmm4
-	add			esi,	16
-	cmp			esi,	edx
-	jl			gom_row_loop_i
-	
-	sub			esi,	eax
-	add			esi,	ebx
-	loop		mb_width_loop_i
-	
-	movdqa		xmm3,	xmm1
-	psrldq		xmm3,	8
-	paddd		xmm1,	xmm3
-	movd		eax,	xmm1
-	mov			edx,	[pSum]
-	add			[edx],	eax
-	
-	movdqa		xmm3,	xmm2
-	psrldq		xmm3,	8
-	paddd		xmm2,	xmm3
-	movdqa		xmm3,	xmm2
-	psrldq		xmm3,	4
-	paddd		xmm2,	xmm3
-	movd		eax,	xmm2
-	mov			edx,	[pSqrSum]
-	add			[edx],	eax
-
-
-%undef		cur_orig
-%undef		iPicStride
-%undef		gom_pixel_num
-%undef		pSum
-%undef		pSqrSum
-%undef		pushsize	
-	pop			ebx
-	pop			esi
-	ret
-
-
-
-WELS_EXTERN VAACalcSad_sse2
-;*************************************************************************************************************
-;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-;								int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
-;*************************************************************************************************************
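-; walks the frame in 16x16 macroblocks; psad8x8 receives the four 8x8 SADs
-; of each macroblock and *psadframe the whole-frame total.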
-
-
-ALIGN 16
-VAACalcSad_sse2:
-%define		cur_data			esp + pushsize + 4
-%define		ref_data			esp + pushsize + 8
-%define		iPicWidth			esp + pushsize + 12
-%define		iPicHeight			esp + pushsize + 16
-%define		iPicStride			esp + pushsize + 20
-%define		psadframe			esp + pushsize + 24
-%define		psad8x8				esp + pushsize + 28
-%define		pushsize	12
-	push	esi
-	push	edi
-	push	ebx
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		edx,	[psad8x8]
-	mov		eax,	ebx
-	
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4								; iPicStride*16
-	pxor	xmm0,	xmm0
-	pxor	xmm7,	xmm7		; iFrameSad
-height_loop:
-	mov		ecx,	dword [iPicWidth]
-	push	esi
-	push	edi
-width_loop:
-	pxor	xmm6,	xmm6		;
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx],		xmm6
-	psrldq	xmm6,		8
-	movd	[edx+4],	xmm6
-	
-	pxor	xmm6,	xmm6
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx+8],	xmm6
-	psrldq	xmm6,		8
-	movd	[edx+12],	xmm6
-	
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
-	
-	dec		ecx
-	jnz		width_loop
-	
-	pop		edi
-	pop		esi
-	add		esi,	eax
-	add		edi,	eax
-	
-	dec	dword [iPicHeight]
-	jnz		height_loop
-	
-	mov		edx,	[psadframe]
-	movdqa	xmm5,	xmm7
-	psrldq	xmm7,	8
-	paddd	xmm7,	xmm5
-	movd	[edx],	xmm7
-
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		pushsize	
-	pop		ebx
-	pop		edi
-	pop		esi
-	ret
-	
-	
-WELS_EXTERN VAACalcSadVar_sse2
-;*************************************************************************************************************
-;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight 
-;		int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
-;*************************************************************************************************************
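-; as VAACalcSad_sse2, additionally writing the per-16x16 pixel sum
-; (psum16x16) and sum of squares (psqsum16x16) gathered row by row in
-; WELS_SAD_SUM_SQSUM_16x1_SSE2.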
-
-
-ALIGN 16
-VAACalcSadVar_sse2:
-%define		localsize		8
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		psum16x16			esp + pushsize + localsize + 32
-%define		psqsum16x16			esp + pushsize + localsize + 36
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		edx,	[psad8x8]
-	mov		eax,	ebx
-	
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	pxor	xmm0,	xmm0
-	pxor	xmm7,	xmm7		; iFrameSad
-var_height_loop:
-	mov		ecx,	dword [iPicWidth]
-	mov		[tmp_esi],	esi
-	mov		[tmp_edi],	edi
-var_width_loop:
-	pxor	xmm6,	xmm6		; hiQuad_loQuad pSad8x8
-	pxor	xmm5,	xmm5		; pSum16x16
-	pxor	xmm4,	xmm4		; sqsum_16x16
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx],		xmm6
-	psrldq	xmm6,		8
-	movd	[edx+4],	xmm6
-	
-	pxor	xmm6,	xmm6
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx+8],	xmm6
-	psrldq	xmm6,		8
-	movd	[edx+12],	xmm6
-	
-	mov		ebp,	[psum16x16]
-	movdqa	xmm1,	xmm5
-	psrldq	xmm1,	8
-	paddd	xmm5,	xmm1
-	movd	[ebp],	xmm5
-	add		dword [psum16x16], 4
-	
-	movdqa	xmm5,	xmm4
-	psrldq	xmm5,	8
-	paddd	xmm4,	xmm5
-	movdqa	xmm3,	xmm4
-	psrldq	xmm3,	4
-	paddd	xmm4,	xmm3
-	
-	mov		ebp,	[psqsum16x16]
-	movd	[ebp],	xmm4
-	add		dword [psqsum16x16], 4
-	
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
-	
-	dec		ecx
-	jnz		var_width_loop
-	
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax
-	add		edi,	eax
-	
-	dec	dword [iPicHeight]
-	jnz		var_height_loop
-	
-	mov		edx,	[psadframe]
-	movdqa	xmm5,	xmm7
-	psrldq	xmm7,	8
-	paddd	xmm7,	xmm5
-	movd	[edx],	xmm7
-
-	add		esp,	localsize	
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		psum16x16
-%undef		psqsum16x16
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		pushsize
-%undef		localsize
-	ret
-	
-	
-
-WELS_EXTERN VAACalcSadSsd_sse2
-;*************************************************************************************************************
-;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,  
-;	int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
-;*************************************************************************************************************
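-; extends VAACalcSadVar_sse2 with the per-16x16 sum of squared pixel
-; differences (psqdiff16x16) from WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2.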
-
-
-ALIGN 16
-VAACalcSadSsd_sse2:
-%define		localsize		12
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		psum16x16			esp + pushsize + localsize + 32
-%define		psqsum16x16			esp + pushsize + localsize + 36
-%define		psqdiff16x16		esp + pushsize + localsize + 40
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		tmp_sadframe		esp + 8
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		ecx,	[iPicWidth]
-	mov		ecx,	[iPicHeight]
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		edx,	[psad8x8]
-	mov		eax,	ebx
-	
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	mov		ecx,	[iPicWidth]
-	mov		ecx,	[iPicHeight]
-	pxor	xmm0,	xmm0
-	movd	[tmp_sadframe],	xmm0
-sqdiff_height_loop:
-	mov		ecx,	dword [iPicWidth]
-	mov		[tmp_esi],	esi
-	mov		[tmp_edi],	edi
-sqdiff_width_loop:
-	pxor	xmm7,	xmm7		; hiQuad_loQuad pSad8x8
-	pxor	xmm6,	xmm6		; pSum16x16
-	pxor	xmm5,	xmm5		; sqsum_16x16  four dword
-	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	movdqa	xmm1,		xmm7
-	movd	[edx],		xmm7
-	psrldq	xmm7,		8
-	paddd	xmm1,		xmm7
-	movd	[edx+4],	xmm7
-	movd	ebp,		xmm1
-	add		[tmp_sadframe],	ebp
-	
-	pxor	xmm7,	xmm7
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	movdqa	xmm1,		xmm7
-	movd	[edx+8],	xmm7
-	psrldq	xmm7,		8
-	paddd	xmm1,		xmm7
-	movd	[edx+12],	xmm7
-	movd	ebp,		xmm1
-	add		[tmp_sadframe],	ebp
-	
-	mov		ebp,	[psum16x16]
-	movdqa	xmm1,	xmm6
-	psrldq	xmm1,	8
-	paddd	xmm6,	xmm1
-	movd	[ebp],	xmm6
-	add		dword [psum16x16], 4
-	
-	mov		ebp,	[psqsum16x16]
-	pshufd	xmm6,	xmm5,	14 ;00001110
-	paddd	xmm6,	xmm5
-	pshufd	xmm5,	xmm6,	1  ;00000001
-	paddd	xmm5,	xmm6
-	movd	[ebp],	xmm5
-	add		dword [psqsum16x16], 4
-	
-	mov		ebp,	[psqdiff16x16]
-	pshufd	xmm5,	xmm4,	14	; 00001110
-	paddd	xmm5,	xmm4
-	pshufd	xmm4,	xmm5,	1	; 00000001
-	paddd	xmm4,	xmm5
-	movd	[ebp],	xmm4
-	add		dword	[psqdiff16x16],	4
-	
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
-	
-	dec		ecx
-	jnz		sqdiff_width_loop
-	
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax
-	add		edi,	eax
-	
-	dec	dword [iPicHeight]
-	jnz		sqdiff_height_loop
-	
-	mov		ebx,	[tmp_sadframe]
-	mov		eax,	[psadframe]
-	mov		[eax],	ebx
-
-	add		esp,	localsize	
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		psum16x16
-%undef		psqsum16x16
-%undef		psqdiff16x16
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		tmp_sadframe
-%undef		pushsize
-%undef		localsize
-	ret
-	
-	
-	
-	
-
-WELS_EXTERN VAACalcSadBgd_sse2
-;*************************************************************************************************************
-;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, 
-;				int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;*************************************************************************************************************
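-; background-detection variant: per 8x8 block it emits the SAD (psad8x8),
-; the signed sum difference sum(cur) - sum(ref) (p_sd8x8) and the maximum
-; per-pixel absolute difference (p_mad8x8, one byte per block).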
-
-
-ALIGN 16
-VAACalcSadBgd_sse2:
-%define		localsize		12
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		p_sd8x8				esp + pushsize + localsize + 32
-%define		p_mad8x8			esp + pushsize + localsize + 36
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		tmp_ecx				esp + 8
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		eax,	ebx
-	
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	xor		ebp,	ebp
-	pxor	xmm0,	xmm0
-bgd_height_loop:
-	mov		ecx,	dword [iPicWidth]
-	mov		[tmp_esi],	esi
-	mov		[tmp_edi],	edi
-bgd_width_loop:
-	pxor	xmm7,	xmm7		; pSad8x8
-	pxor	xmm6,	xmm6		; sum_cur_8x8
-	pxor	xmm5,	xmm5		; sum_ref_8x8
-	pxor	xmm4,	xmm4		; pMad8x8
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	
-	
-	mov			edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm4
-	
-	;movdqa		xmm1,	xmm4
-	;punpcklbw	xmm1,	xmm0
-	;punpcklwd	xmm1,	xmm0
-	;movd		[edx],	xmm1
-	;punpckhbw	xmm4,	xmm0
-	;punpcklwd	xmm4,	xmm0
-	;movd		[edx+4],	xmm4
-	;add			edx,		8
-	;mov			[p_mad8x8],	edx	
-	mov			[tmp_ecx],	ecx
-	movhlps		xmm1,	xmm4
-	movd		ecx,	xmm4
-	mov			[edx],	cl
-	movd		ecx,	xmm1
-	mov			[edx+1],cl
-	add			edx,	2
-	mov			[p_mad8x8],	edx
-
-	
-	pslldq		xmm7,	4
-	pslldq		xmm6,	4
-	pslldq		xmm5,	4
-	
-	
-	pxor	xmm4,	xmm4		; pMad8x8
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	
-	mov			edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm4
-	
-	;movdqa		xmm1,	xmm4
-	;punpcklbw	xmm1,	xmm0
-	;punpcklwd	xmm1,	xmm0
-	;movd		[edx],	xmm1
-	;punpckhbw	xmm4,	xmm0
-	;punpcklwd	xmm4,	xmm0
-	;movd		[edx+4],	xmm4
-	;add			edx,		8
-	;mov			[p_mad8x8],	edx	
-	movhlps		xmm1,	xmm4
-	movd		ecx,	xmm4
-	mov			[edx],	cl
-	movd		ecx,	xmm1
-	mov			[edx+1],cl
-	add			edx,	2
-	mov			[p_mad8x8],	edx
-	
-	; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
-	
-	mov		edx,	[psad8x8]
-	pshufd	xmm1,	xmm7,	10001101b		; D3 D2 D1 D0
-	movdqa	[edx],	xmm1					
-	add		edx,	16
-	mov		[psad8x8],	edx					; sad8x8
-	
-	paddd	xmm1,	xmm7					; D1+3 D3+2 D0+1 D2+0
-	pshufd	xmm2,	xmm1,	00000011b
-	paddd	xmm1,	xmm2
-	movd	edx,	xmm1
-	add		ebp,	edx						; sad frame
-	
-	mov		edx,	[p_sd8x8]
-	psubd	xmm6,	xmm5
-	pshufd	xmm1,	xmm6,	10001101b
-	movdqa	[edx],	xmm1
-	add		edx,	16
-	mov		[p_sd8x8],	edx
-	
-	
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
-	
-	mov		ecx,	[tmp_ecx]
-	dec		ecx
-	jnz		bgd_width_loop
-	
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax
-	add		edi,	eax
-	
-	dec		dword [iPicHeight]
-	jnz		bgd_height_loop
-	
-	mov		edx,	[psadframe]
-	mov		[edx],	ebp
-
-	add		esp,	localsize	
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		p_sd8x8
-%undef		p_mad8x8
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		pushsize
-%undef		localsize
-	ret
-
-
-
-WELS_EXTERN VAACalcSadSsdBgd_sse2
-;*************************************************************************************************************
-;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, 
-;		 int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, 
-;			int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadSsdBgd_sse2:
-%define		localsize		16
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		psum16x16			esp + pushsize + localsize + 32
-%define		psqsum16x16			esp + pushsize + localsize + 36
-%define		psqdiff16x16		esp + pushsize + localsize + 40
-%define		p_sd8x8				esp + pushsize + localsize + 44
-%define		p_mad8x8			esp + pushsize + localsize + 48
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		tmp_sadframe		esp + 8
-%define		tmp_ecx				esp + 12
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		eax,	ebx
-	
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	pxor	xmm0,	xmm0
-	movd	[tmp_sadframe],	xmm0
-sqdiff_bgd_height_loop:
-	mov		ecx,	dword [iPicWidth]
-	mov		[tmp_esi],	esi
-	mov		[tmp_edi],	edi
-sqdiff_bgd_width_loop:
-	pxor	xmm7,	xmm7		; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
-	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-	pxor	xmm5,	xmm5		; pMad8x8
-	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	
-	mov		edx,		[psad8x8]
-	movdqa	xmm2,		xmm7
-	pshufd	xmm1,		xmm2,		00001110b
-	movd	[edx],		xmm2
-	movd	[edx+4],	xmm1
-	add		edx,		8
-	mov		[psad8x8],	edx			; sad8x8
-	
-	paddd	xmm1,				xmm2
-	movd	edx,				xmm1
-	add		[tmp_sadframe],		edx			; iFrameSad
-	
-	mov		edx,		[psum16x16]
-	movdqa	xmm1,		xmm6
-	pshufd	xmm2,		xmm1,		00001110b
-	paddd	xmm1,		xmm2
-	movd	[edx],		xmm1				; sum
-	
-	mov		edx,		[p_sd8x8]
-	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
-	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
-	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
-	movq	[edx],		xmm1
-	add		edx,		8
-	mov		[p_sd8x8],	edx
-	
-	mov			edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm5
-	;movdqa		xmm1,	xmm5
-	;punpcklbw	xmm1,	xmm0
-	;punpcklwd	xmm1,	xmm0
-	;movd		[edx],	xmm1
-	;punpckhbw	xmm5,	xmm0
-	;punpcklwd	xmm5,	xmm0
-	;movd		[edx+4],	xmm5
-	;add			edx,		8
-	;mov			[p_mad8x8],	edx
-	mov			[tmp_ecx],	ecx
-	movhlps		xmm1,	xmm5
-	movd		ecx,	xmm5
-	mov			[edx],	cl
-	movd		ecx,	xmm1
-	mov			[edx+1],cl
-	add			edx,	2
-	mov			[p_mad8x8],	edx
-	
-	psrlq	xmm7,	32
-	psllq	xmm7,	32			; clear sad
-	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-	pxor	xmm5,	xmm5		; pMad8x8
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	
-	mov		edx,		[psad8x8]
-	movdqa	xmm2,		xmm7
-	pshufd	xmm1,		xmm2,		00001110b
-	movd	[edx],		xmm2
-	movd	[edx+4],	xmm1
-	add		edx,		8
-	mov		[psad8x8],	edx			; sad8x8
-	
-	paddd	xmm1,				xmm2
-	movd	edx,				xmm1
-	add		[tmp_sadframe],		edx			; iFrameSad
-	
-	mov		edx,			[psum16x16]
-	movdqa	xmm1,			xmm6
-	pshufd	xmm2,			xmm1,		00001110b
-	paddd	xmm1,			xmm2
-	movd	ebp,			xmm1				; sum
-	add		[edx],			ebp
-	add		edx,			4
-	mov		[psum16x16],	edx
-	
-	mov		edx,			[psqsum16x16]
-	psrlq	xmm7,			32
-	pshufd	xmm2,			xmm7,		00001110b
-	paddd	xmm2,			xmm7
-	movd	[edx],			xmm2				; sqsum
-	add		edx,			4
-	mov		[psqsum16x16],	edx
-	
-	mov		edx,		[p_sd8x8]
-	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
-	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
-	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
-	movq	[edx],		xmm1
-	add		edx,		8
-	mov		[p_sd8x8],	edx
-	
-	mov		edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm5
-	;movdqa		xmm1,	xmm5
-	;punpcklbw	xmm1,	xmm0
-	;punpcklwd	xmm1,	xmm0
-	;movd		[edx],	xmm1
-	;punpckhbw	xmm5,	xmm0
-	;punpcklwd	xmm5,	xmm0
-	;movd		[edx+4],	xmm5
-	;add			edx,		8
-	;mov			[p_mad8x8],	edx	
-	movhlps		xmm1,	xmm5
-	movd		ecx,	xmm5
-	mov			[edx],	cl
-	movd		ecx,	xmm1
-	mov			[edx+1],cl
-	add			edx,	2
-	mov			[p_mad8x8],	edx
-	
-	mov		edx,		[psqdiff16x16]
-	pshufd	xmm1,		xmm4,		00001110b
-	paddd	xmm4,		xmm1
-	pshufd	xmm1,		xmm4,		00000001b
-	paddd	xmm4,		xmm1
-	movd	[edx],		xmm4
-	add		edx,		4
-	mov		[psqdiff16x16],	edx
-	
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
-	
-	mov		ecx,	[tmp_ecx]
-	dec		ecx
-	jnz		sqdiff_bgd_width_loop
-	
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax
-	add		edi,	eax
-	
-	dec	dword [iPicHeight]
-	jnz		sqdiff_bgd_height_loop
-	
-	mov		edx,	[psadframe]
-	mov		ebp,	[tmp_sadframe]
-	mov		[edx],	ebp
-
-	add		esp,	localsize	
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		psum16x16
-%undef		psqsum16x16
-%undef		psqdiff16x16
-%undef		p_sd8x8
-%undef		p_mad8x8
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		pushsize
-%undef		localsize
-	ret
+;*!
+;* \copy
+;*     Copyright (c)  2010-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	vaa.asm
+;*
+;*	Abstract
+;*      sse2 for pVaa routines
+;*
+;*  History
+;*      04/14/2010	Created
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;%macro SUM_SSE2	4	; dst, pSrc, zero, pack1_8x2
+;	movdqa %1, %2
+;	punpcklbw %1, %3
+;	punpckhbw %2, %3
+;	paddw %1, %2
+;	pmaddwd %1, %4
+;	pshufd %2, %1, 04Eh	; 01001110 B
+;	paddd %1, %2
+;	pshufd %2, %1, 0B1h	; 10110001 B
+;	paddd %1, %2
+;%endmacro	; END OF SUM_SSE2
+
+; by comparison, this outperforms the equivalent phaddw (SSSE3) sequence
+%macro SUM_WORD_8x2_SSE2	2	; dst(pSrc), tmp
+	; @sum_8x2 begin
+	pshufd %2, %1, 04Eh	; 01001110 B
+	paddw %1, %2
+	pshuflw %2, %1, 04Eh	; 01001110 B
+	paddw %1, %2
+	pshuflw %2, %1, 0B1h	; 10110001 B
+	paddw %1, %2
+	; end of @sum_8x2
+%endmacro	; END of SUM_WORD_8x2_SSE2
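+
+; Roughly equivalent scalar C (an illustrative sketch, not part of the build;
+; the names are ours).  The low word of the destination ends up holding the
+; 16-bit wrap-around sum of the eight input words:
+;	uint16_t sum_word_8x2(const uint16_t w[8]) {
+;		uint16_t s = 0;
+;		for (int i = 0; i < 8; i++)
+;			s = (uint16_t)(s + w[i]);	/* wraps mod 2^16, as paddw does */
+;		return s;
+;	}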
+
+%macro SUM_SQR_SSE2	3	; dst, pSrc, zero
+	movdqa %1, %2
+	punpcklbw %1, %3
+	punpckhbw %2, %3
+	pmaddwd %1, %1
+	pmaddwd %2, %2
+	paddd %1, %2
+	pshufd %2, %1, 04Eh	; 01001110 B
+	paddd %1, %2
+	pshufd %2, %1, 0B1h	; 10110001 B
+	paddd %1, %2
+%endmacro	; END OF SUM_SQR_SSE2
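+
+; Roughly equivalent scalar C (an illustrative sketch; the names are ours).
+; Every dword lane of the destination receives the same reduced value:
+;	uint32_t sum_sqr(const uint8_t p[16]) {
+;		uint32_t s = 0;
+;		for (int i = 0; i < 16; i++)
+;			s += (uint32_t)p[i] * p[i];
+;		return s;
+;	}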
+
+%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
+	movdqa %1, [esi    ]	; line 0
+	movdqa %2, [esi+ecx]	; line 1
+	movdqa %3, %1
+	punpcklbw %1, xmm7
+	punpckhbw %3, xmm7
+	movdqa %4, %2
+	punpcklbw %4, xmm7
+	punpckhbw %2, xmm7
+	paddw %1, %4
+	paddw %2, %3
+	movdqa %3, [esi+ebx]	; line 2
+	movdqa %4, [esi+edx]	; line 3
+	movdqa %5, %3
+	punpcklbw %3, xmm7
+	punpckhbw %5, xmm7
+	movdqa %6, %4
+	punpcklbw %6, xmm7
+	punpckhbw %4, xmm7
+	paddw %3, %6
+	paddw %4, %5
+	paddw %1, %3	; block 0, 1
+	paddw %2, %4	; block 2, 3
+	pshufd %3, %1, 0B1h
+	pshufd %4, %2, 0B1h
+	paddw %1, %3
+	paddw %2, %4
+	movdqa %3, %1
+	movdqa %4, %2
+	pshuflw %5, %1, 0B1h
+	pshufhw %6, %3, 0B1h
+	paddw %1, %5
+	paddw %3, %6
+	pshuflw %5, %2, 0B1h
+	pshufhw %6, %4, 0B1h
+	paddw %2, %5
+	paddw %4, %6
+	punpcklwd %1, %2
+	punpckhwd %3, %4
+	punpcklwd %1, %3
+	psraw %1, $4
+%endmacro
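+
+; What the block-average macros compute, as a scalar C sketch (illustrative
+; only; the names are ours).  For the 16x4 strip at p (lines 0..3 selected by
+; ecx/ebx/edx above), the four 4x4 sub-block averages land in the low four
+; words of the destination register:
+;	void vaa_avg_block(const uint8_t *p, int stride, uint16_t avg[4]) {
+;		for (int b = 0; b < 4; b++) {	/* four sub-blocks, 4 columns each */
+;			uint16_t s = 0;
+;			for (int y = 0; y < 4; y++)
+;				for (int x = 0; x < 4; x++)
+;					s += p[y * stride + b * 4 + x];
+;			avg[b] = s >> 4;	/* sum of 16 pixels -> mean */
+;		}
+;	}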
+
+%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
+	movdqa %1, [esi    ]	; line 0
+	movdqa %2, [esi+ecx]	; line 1
+	movdqa %3, %1
+	punpcklbw %1, xmm7
+	punpckhbw %3, xmm7
+	movdqa %4, %2
+	punpcklbw %4, xmm7
+	punpckhbw %2, xmm7
+	paddw %1, %4
+	paddw %2, %3
+	movdqa %3, [esi+ebx]	; line 2
+	movdqa %4, [esi+edx]	; line 3
+	movdqa %5, %3
+	punpcklbw %3, xmm7
+	punpckhbw %5, xmm7
+	movdqa %6, %4
+	punpcklbw %6, xmm7
+	punpckhbw %4, xmm7
+	paddw %3, %6
+	paddw %4, %5
+	paddw %1, %3	; block 0, 1
+	paddw %2, %4	; block 2, 3
+	phaddw %1, %2	; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
+	phaddw %1, xmm7	; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
+	psraw %1, $4
+%endmacro
+
+%macro WELS_SAD_16x2_SSE2  0
+	movdqa	xmm1,	[esi]
+	movdqa	xmm2,	[edi]
+	movdqa	xmm3,	[esi+ebx]
+	movdqa	xmm4,	[edi+ebx]
+	psadbw	xmm1,	xmm2
+	psadbw	xmm3,	xmm4
+	paddd	xmm6,	xmm1
+	paddd	xmm6,	xmm3
+	lea		esi,	[esi+ebx*2]
+	lea		edi,	[edi+ebx*2]
+%endmacro
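+
+; One step of WELS_SAD_16x2_SSE2 as a scalar C sketch (illustrative; the
+; names are ours).  xmm6 keeps two running sums, one per 8-byte half:
+;	for (int y = 0; y < 2; y++)
+;		for (int x = 0; x < 16; x++)
+;			sad[x >> 3] += abs(cur[y * stride + x] - ref[y * stride + x]);
+;	cur += 2 * stride;  ref += 2 * stride;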
+
+%macro	WELS_SAD_SUM_SQSUM_16x1_SSE2 0
+	movdqa	xmm1,	[esi]
+	movdqa	xmm2,	[edi]
+	movdqa	xmm3,	xmm1
+	psadbw	xmm3,	xmm2
+	paddd	xmm6,	xmm3
+
+	movdqa	xmm3,	xmm1
+	psadbw	xmm3,	xmm0
+	paddd	xmm5,	xmm3
+
+	movdqa		xmm2,	xmm1
+	punpcklbw	xmm1,	xmm0
+	punpckhbw	xmm2,	xmm0
+	pmaddwd		xmm1,	xmm1
+	pmaddwd		xmm2,	xmm2
+	paddd		xmm4,	xmm1
+	paddd		xmm4,	xmm2
+
+	add		esi,	ebx
+	add		edi,	ebx
+%endmacro
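+
+; One row of this macro as a scalar C sketch (illustrative; the names are
+; ours):
+;	for (int x = 0; x < 16; x++) {
+;		sad   += abs(cur[x] - ref[x]);	/* -> xmm6 */
+;		sum   += cur[x];		/* -> xmm5 */
+;		sqsum += cur[x] * cur[x];	/* -> xmm4 */
+;	}
+;	cur += stride;  ref += stride;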
+
+%macro	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 0
+	movdqa	xmm1,	[esi]
+	movdqa	xmm2,	[edi]
+	movdqa	xmm3,	xmm1
+	psadbw	xmm3,	xmm2
+	paddd	xmm7,	xmm3	; sad
+
+	movdqa	xmm3,	xmm1
+	pmaxub	xmm3,	xmm2
+	pminub	xmm2,	xmm1
+	psubb	xmm3,	xmm2	; diff
+
+	movdqa	xmm2,	xmm1
+	psadbw	xmm2,	xmm0
+	paddd	xmm6,	xmm2	; sum
+
+	movdqa		xmm2,	xmm1
+	punpcklbw	xmm1,	xmm0
+	punpckhbw	xmm2,	xmm0
+	pmaddwd		xmm1,	xmm1
+	pmaddwd		xmm2,	xmm2
+	paddd		xmm5,	xmm1
+	paddd		xmm5,	xmm2	; sqsum
+
+	movdqa		xmm1,	xmm3
+	punpcklbw	xmm1,	xmm0
+	punpckhbw	xmm3,	xmm0
+	pmaddwd		xmm1,	xmm1
+	pmaddwd		xmm3,	xmm3
+	paddd		xmm4,	xmm1
+	paddd		xmm4,	xmm3	; sqdiff
+
+	add		esi,	ebx
+	add		edi,	ebx
+%endmacro
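+
+; One row of this macro as a scalar C sketch (illustrative; the names are
+; ours):
+;	for (int x = 0; x < 16; x++) {
+;		int d = abs(cur[x] - ref[x]);
+;		sad    += d;			/* -> xmm7 */
+;		sum    += cur[x];		/* -> xmm6 */
+;		sqsum  += cur[x] * cur[x];	/* -> xmm5 */
+;		sqdiff += d * d;		/* -> xmm4 */
+;	}
+;	cur += stride;  ref += stride;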
+
+%macro	WELS_SAD_SD_MAD_16x1_SSE2	4
+%define sad_reg			%1
+%define	sum_cur_reg		%2
+%define sum_ref_reg		%3
+%define	mad_reg			%4
+	movdqa	xmm1,		[esi]
+	movdqa	xmm2,		[edi]
+	movdqa	xmm3,		xmm1
+	psadbw	xmm3,		xmm0
+	paddd	sum_cur_reg,			xmm3	; sum_cur
+	movdqa	xmm3,		xmm2
+	psadbw	xmm3,		xmm0
+	paddd	sum_ref_reg,			xmm3	; sum_ref
+
+	movdqa	xmm3,		xmm1
+	pmaxub	xmm3,		xmm2
+	pminub	xmm2,		xmm1
+	psubb	xmm3,		xmm2	; abs diff
+	pmaxub	mad_reg,	xmm3	; max abs diff
+
+	psadbw	xmm3,		xmm0
+	paddd	sad_reg,	xmm3	; sad
+
+	add			esi,		ebx
+	add			edi,		ebx
+%endmacro
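+
+; One row of this macro as a scalar C sketch (illustrative; the names are
+; ours).  Note that mad_reg holds a per-byte-lane maximum; the reduction to a
+; single byte happens later in WELS_MAX_REG_SSE2:
+;	for (int x = 0; x < 16; x++) {
+;		int d = abs(cur[x] - ref[x]);
+;		sad     += d;
+;		sum_cur += cur[x];
+;		sum_ref += ref[x];
+;		if (d > mad) mad = d;
+;	}
+;	cur += stride;  ref += stride;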
+
+
+%macro	WELS_MAX_REG_SSE2	1	; xmm1, xmm2, xmm3 can be used
+%define max_reg  %1
+	movdqa	xmm1,		max_reg
+	psrldq	xmm1,		4
+	pmaxub	max_reg,	xmm1
+	movdqa	xmm1,		max_reg
+	psrldq	xmm1,		2
+	pmaxub	max_reg,	xmm1
+	movdqa	xmm1,		max_reg
+	psrldq	xmm1,		1
+	pmaxub	max_reg,	xmm1
+%endmacro
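+
+; Scalar C sketch (illustrative; the names are ours).  Each 8-byte half of
+; the register is reduced so that its lowest byte holds that half's maximum:
+;	uint8_t max_of_8(const uint8_t b[8]) {
+;		uint8_t m = b[0];
+;		for (int i = 1; i < 8; i++)
+;			if (b[i] > m) m = b[i];
+;		return m;
+;	}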
+
+%macro	WELS_SAD_BGD_SQDIFF_16x1_SSE2	4
+%define sad_reg		%1
+%define	sum_reg		%2
+%define mad_reg		%3
+%define sqdiff_reg	%4
+	movdqa		xmm1,		[esi]
+	movdqa		xmm2,		xmm1
+	movdqa		xmm3,		xmm1
+	punpcklbw	xmm2,		xmm0
+	punpckhbw	xmm3,		xmm0
+	pmaddwd		xmm2,		xmm2
+	pmaddwd		xmm3,		xmm3
+	paddd		xmm2,		xmm3
+	movdqa		xmm3,		xmm2
+	psllq		xmm2,		32
+	psrlq		xmm3,		32
+	psllq		xmm3,		32
+	paddd		xmm2,		xmm3
+	paddd		sad_reg,	xmm2		; sqsum
+
+	movdqa	xmm2,		[edi]
+	movdqa	xmm3,		xmm1
+	psadbw	xmm3,		xmm0
+	paddd	sum_reg,			xmm3	; sum_cur
+	movdqa	xmm3,		xmm2
+	psadbw	xmm3,		xmm0
+	pslldq	xmm3,		4
+	paddd	sum_reg,			xmm3	; sum_ref
+
+	movdqa	xmm3,		xmm1
+	pmaxub	xmm3,		xmm2
+	pminub	xmm2,		xmm1
+	psubb	xmm3,		xmm2	; abs diff
+	pmaxub	mad_reg,	xmm3	; max abs diff
+
+	movdqa	xmm1,		xmm3
+	psadbw	xmm3,		xmm0
+	paddd	sad_reg,	xmm3	; sad
+
+	movdqa		xmm3,	xmm1
+	punpcklbw	xmm1,	xmm0
+	punpckhbw	xmm3,	xmm0
+	pmaddwd		xmm1,	xmm1
+	pmaddwd		xmm3,	xmm3
+	paddd		sqdiff_reg,	xmm1
+	paddd		sqdiff_reg,	xmm3	; sqdiff
+
+	add		esi,	ebx
+	add		edi,	ebx
+%endmacro
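+
+; One row of this macro as a scalar C sketch (illustrative; the names are
+; ours).  sad_reg doubles as two interleaved accumulators: its even dword
+; lanes collect the sad, its odd lanes the sqsum of the current block:
+;	for (int x = 0; x < 16; x++) {
+;		int d = abs(cur[x] - ref[x]);
+;		sqsum   += cur[x] * cur[x];
+;		sum_cur += cur[x];  sum_ref += ref[x];	/* interleaved in sum_reg */
+;		if (d > mad) mad = d;			/* lane-wise via pmaxub */
+;		sad     += d;
+;		sqdiff  += d * d;
+;	}
+;	cur += stride;  ref += stride;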
+
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+;SECTION .rodata align=16
+
+;ALIGN 16
+;pack1_8x2:
+;	dw 1, 1, 1, 1, 1, 1, 1, 1
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN rc_sad_frame_sse2
+;***********************************************************************
+;	uint32_t rc_sad_frame_sse2(	uint8_t *ref_orig, uint8_t *cur_orig, const int mb_width, const int iPicHeight, const int iPicStride );
+;***********************************************************************
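+; Scalar C sketch of the computation (illustrative only; the names follow the
+; prototype above):
+;	uint32_t sad = 0;
+;	for (int y = 0; y < iPicHeight; y++) {
+;		for (int x = 0; x < mb_width * 16; x++)
+;			sad += abs(ref_orig[x] - cur_orig[x]);
+;		ref_orig += iPicStride;  cur_orig += iPicStride;
+;	}
+;	return sad;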
+ALIGN 16
+rc_sad_frame_sse2:
+	push esi
+	push edi
+	push ebp
+	push ebx
+	push edx
+
+	mov esi, [esp+24]
+	mov edi, [esp+28]
+	mov ebx, [esp+32]
+	mov ecx, [esp+36]
+	mov edx, [esp+40]
+	pxor xmm0, xmm0
+.hloop:
+	mov eax, ebx
+	mov ebp, $0
+.wloop:
+	movdqa xmm1, [esi+ebp]
+	movdqa xmm2, [edi+ebp]
+	psadbw xmm1, xmm2
+	pshufd xmm2, xmm1, 0f6h	; 11110110 B ; movhlps for float
+	paddd xmm1, xmm2
+	paddd xmm0, xmm1
+	add ebp, 010h
+	dec eax
+	jnz near .wloop
+	lea esi, [esi+edx]
+	lea edi, [edi+edx]
+	dec ecx
+	jnz near .hloop
+
+	movd eax, xmm0
+	pop edx
+	pop ebx
+	pop ebp
+	pop edi
+	pop esi
+	ret
+
+
+WELS_EXTERN SampleVariance16x16_sse2
+;***********************************************************************
+;   void SampleVariance16x16_sse2(	uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride, SMotionTextureUnit* pMotionTexture );
+;***********************************************************************
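+; In scalar terms (an illustrative sketch; N = 256, the pixels of the 16x16
+; block, and d = |y_ref - y_src| per pixel):
+;	uiMotionIndex  ~= sum(d*d)/N     - (sum(d)/N)^2		; variance of the diff
+;	uiTextureIndex ~= sum(src*src)/N - (sum(src)/N)^2	; variance of the source
+; the code divides by 256 via 'sar 8' and truncates each index to 16 bits.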
+ALIGN 16
+SampleVariance16x16_sse2:
+	push esi
+	push edi
+	push ebx
+
+	sub esp, 16
+	%define SUM			[esp]
+	%define SUM_CUR		[esp+4]
+	%define SQR			[esp+8]
+	%define SQR_CUR		[esp+12]
+	%define PUSH_SIZE	28	; 12 + 16
+
+	mov edi, [esp+PUSH_SIZE+4]	; y_ref
+	mov edx, [esp+PUSH_SIZE+8]	; y_ref_stride
+	mov esi, [esp+PUSH_SIZE+12]	; y_src
+	mov eax, [esp+PUSH_SIZE+16]	; y_src_stride
+	mov ecx, 010h				; height = 16
+
+	pxor xmm7, xmm7
+	movdqu SUM, xmm7
+
+.hloops:
+	movdqa xmm0, [edi]		; y_ref
+	movdqa xmm1, [esi]		; y_src
+	movdqa xmm2, xmm0		; store first for future process
+	movdqa xmm3, xmm1
+	; sum += diff;
+	movdqa xmm4, xmm0
+	psadbw xmm4, xmm1		; 2 parts, [0,..,15], [64,..,79]
+	; to be continued for sum
+	pshufd xmm5, xmm4, 0C6h	; 11000110 B
+	paddw xmm4, xmm5
+	movd ebx, xmm4
+	add SUM, ebx
+
+	; sqr += diff * diff;
+	pmaxub xmm0, xmm1
+	pminub xmm1, xmm2
+	psubb xmm0, xmm1				; diff
+	SUM_SQR_SSE2 xmm1, xmm0, xmm7	; dst, pSrc, zero
+	movd ebx, xmm1
+	add SQR, ebx
+
+	; sum_cur += y_src[x];
+	movdqa xmm0, xmm3		; cur_orig
+	movdqa xmm1, xmm0
+	punpcklbw xmm0, xmm7
+	punpckhbw xmm1, xmm7
+	paddw xmm0, xmm1		; 8x2
+	SUM_WORD_8x2_SSE2 xmm0, xmm1
+	movd ebx, xmm0
+	and ebx, 0ffffh
+	add SUM_CUR, ebx
+
+	; sqr_cur += y_src[x] * y_src[x];
+	SUM_SQR_SSE2 xmm0, xmm3, xmm7	; dst, pSrc, zero
+	movd ebx, xmm0
+	add SQR_CUR, ebx
+
+	lea edi, [edi+edx]
+	lea esi, [esi+eax]
+	dec ecx
+	jnz near .hloops
+
+	mov ebx, 0
+	mov bx, word SUM
+	sar ebx, 8
+	imul ebx, ebx
+	mov ecx, SQR
+	sar ecx, 8
+	sub ecx, ebx
+	mov edi, [esp+PUSH_SIZE+20]	; pMotionTexture
+	mov [edi], cx				; to store uiMotionIndex
+	mov ebx, 0
+	mov bx, word SUM_CUR
+	sar ebx, 8
+	imul ebx, ebx
+	mov ecx, SQR_CUR
+	sar ecx, 8
+	sub ecx, ebx
+	mov [edi+2], cx				; to store uiTextureIndex
+
+	%undef SUM
+	%undef SUM_CUR
+	%undef SQR
+	%undef SQR_CUR
+	%undef PUSH_SIZE
+
+	add esp, 16
+	pop ebx
+	pop edi
+	pop esi
+
+	ret
+
+; 6/7/2010
+
+%ifndef NO_DYNAMIC_VP
+WELS_EXTERN AnalysisVaaInfoIntra_sse2
+;***********************************************************************
+;	int32_t AnalysisVaaInfoIntra_sse2(	uint8_t *pDataY, const int32_t linesize );
+;***********************************************************************
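+; Scalar C sketch (illustrative; the names are ours).  Sixteen 4x4 block
+; averages a[0..15] are taken over the 16x16 block at pDataY, and their
+; spread is returned as sum(a*a) - sum(a)^2/16:
+;	int32_t vaa_info_intra(const uint8_t *p, int32_t linesize) {
+;		int32_t sum = 0, sqr = 0;
+;		for (int b = 0; b < 16; b++) {
+;			const uint8_t *q = p + (b / 4) * 4 * linesize + (b % 4) * 4;
+;			uint16_t s = 0;
+;			for (int y = 0; y < 4; y++)
+;				for (int x = 0; x < 4; x++)
+;					s += q[y * linesize + x];
+;			s >>= 4;			/* 4x4 block average */
+;			sum += s;  sqr += (int32_t)s * s;
+;		}
+;		return sqr - ((sum * sum) >> 4);	/* asm masks sum to 16 bits */
+;	}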
+ALIGN 16
+AnalysisVaaInfoIntra_sse2:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+
+	mov ebp, esp
+	and ebp, 0fh
+	sub esp, ebp
+	sub esp, 32
+	%define PUSH_SIZE	52	; 20 + 32
+
+	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
+	mov ecx, [esp+ebp+PUSH_SIZE+8]	; linesize
+
+	mov ebx, ecx
+	sal ebx, $1			; linesize x 2 [ebx]
+	mov edx, ebx
+	add edx, ecx		; linesize x 3 [edx]
+	mov eax, ebx
+	sal eax, $1			; linesize x 4 [eax]
+
+	pxor xmm7, xmm7
+
+	; loops
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp], xmm0
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+8], xmm0
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+16], xmm0
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+24], xmm0
+
+	movdqa xmm0, [esp]		; block 0~7
+	movdqa xmm1, [esp+16]	; block 8~15
+	movdqa xmm2, xmm0
+	paddw xmm0, xmm1
+	SUM_WORD_8x2_SSE2 xmm0, xmm3
+
+	pmullw xmm1, xmm1
+	pmullw xmm2, xmm2
+	movdqa xmm3, xmm1
+	movdqa xmm4, xmm2
+	punpcklwd xmm1, xmm7
+	punpckhwd xmm3, xmm7
+	punpcklwd xmm2, xmm7
+	punpckhwd xmm4, xmm7
+	paddd xmm1, xmm2
+	paddd xmm3, xmm4
+	paddd xmm1, xmm3
+	pshufd xmm2, xmm1, 01Bh
+	paddd xmm1, xmm2
+	pshufd xmm2, xmm1, 0B1h
+	paddd xmm1, xmm2
+
+	movd ebx, xmm0
+	and ebx, 0ffffh		; keep only the effective low word
+	mov ecx, ebx
+	imul ebx, ecx
+	sar ebx, $4
+	movd eax, xmm1
+	sub eax, ebx
+
+	%undef PUSH_SIZE
+	add esp, 32
+	add esp, ebp
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN AnalysisVaaInfoIntra_ssse3
+;***********************************************************************
+;	int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t linesize );
+;***********************************************************************
+ALIGN 16
+AnalysisVaaInfoIntra_ssse3:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+
+	mov ebp, esp
+	and ebp, 0fh
+	sub esp, ebp
+	sub esp, 32
+	%define PUSH_SIZE	52	; 20 + 32
+
+	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
+	mov ecx, [esp+ebp+PUSH_SIZE+8]	; linesize
+
+	mov ebx, ecx
+	sal ebx, $1			; linesize x 2 [ebx]
+	mov edx, ebx
+	add edx, ecx		; linesize x 3 [edx]
+	mov eax, ebx
+	sal eax, $1			; linesize x 4 [eax]
+
+	pxor xmm7, xmm7
+
+	; loops
+	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp], xmm0
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+	movq [esp+8], xmm1
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+16], xmm0
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+	movq [esp+24], xmm1
+
+	movdqa xmm0, [esp]		; block 0~7
+	movdqa xmm1, [esp+16]	; block 8~15
+	movdqa xmm2, xmm0
+	paddw xmm0, xmm1
+	SUM_WORD_8x2_SSE2 xmm0, xmm3	; performs better than the phaddw sequence
+
+	pmullw xmm1, xmm1
+	pmullw xmm2, xmm2
+	movdqa xmm3, xmm1
+	movdqa xmm4, xmm2
+	punpcklwd xmm1, xmm7
+	punpckhwd xmm3, xmm7
+	punpcklwd xmm2, xmm7
+	punpckhwd xmm4, xmm7
+	paddd xmm1, xmm2
+	paddd xmm3, xmm4
+	paddd xmm1, xmm3
+	pshufd xmm2, xmm1, 01Bh
+	paddd xmm1, xmm2
+	pshufd xmm2, xmm1, 0B1h
+	paddd xmm1, xmm2
+
+	movd ebx, xmm0
+	and ebx, 0ffffh		; keep only the effective low word
+	mov ecx, ebx
+	imul ebx, ecx
+	sar ebx, $4
+	movd eax, xmm1
+	sub eax, ebx
+
+	%undef PUSH_SIZE
+	add esp, 32
+	add esp, ebp
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+%endif
+
+
+
+WELS_EXTERN abs_difference_mbrow_sse2
+;*************************************************************************************************************
+;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride,
+;								 int32_t gom_pixel_num, int32_t *pSum)
+;*************************************************************************************************************
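+; Scalar C sketch (illustrative; the names follow the prototype).  One
+; macroblock row is 16 luma lines high:
+;	int32_t s = 0;
+;	for (int y = 0; y < 16; y++) {
+;		for (int x = 0; x < gom_pixel_num; x++)
+;			s += abs(ref_orig[x] - cur_orig[x]);
+;		ref_orig += iPicStride;  cur_orig += iPicStride;
+;	}
+;	*pSum += s;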
+ALIGN 16
+abs_difference_mbrow_sse2:
+%define		ref_orig			esp + pushsize + 4
+%define		cur_orig			esp + pushsize + 8
+%define		iPicStride			esp + pushsize + 12
+%define		gom_pixel_num		esp + pushsize + 16
+%define		pSum				esp + pushsize + 20
+%define		pushsize	12
+	push	esi
+	push	edi
+	push	ebx
+	mov		esi,	[ref_orig]
+	mov		edi,	[cur_orig]
+	mov		ebx,	[iPicStride]
+	mov		eax,	[gom_pixel_num]
+	mov		ecx,	16					;MB_WIDTH_LUMA
+	pxor	xmm0,	xmm0
+mb_width_loop_p:
+	mov		edx,	esi
+	add		edx,	eax			; end address
+gom_row_loop_p:
+	movdqa	xmm1,	[esi]
+	movdqa	xmm2,	[edi]
+	psadbw	xmm1,	xmm2
+	paddd	xmm0,	xmm1
+	add		esi,	16
+	add		edi,	16
+	cmp		esi,	edx
+	jl		gom_row_loop_p
+
+	sub		esi,	eax
+	sub		edi,	eax
+	add		esi,	ebx
+	add		edi,	ebx
+	loop	mb_width_loop_p
+
+	movdqa	xmm1,	xmm0
+	psrldq	xmm1,	8
+	paddd	xmm1,	xmm0
+	movd	eax,	xmm1
+	mov		edx,	[pSum]	; pSum
+	add		[edx],	eax
+
+%undef		ref_orig
+%undef		cur_orig
+%undef		iPicStride
+%undef		gom_pixel_num
+%undef		pSum
+%undef		pushsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	ret
+
+
+
+
+WELS_EXTERN sum_sqrsum_mbrow_sse2
+;*************************************************************************************************************
+;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride,
+;							 int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum)
+;*************************************************************************************************************
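+; Scalar C sketch (illustrative; the names follow the prototype):
+;	int32_t sum = 0, sqrsum = 0;
+;	for (int y = 0; y < 16; y++) {
+;		for (int x = 0; x < gom_pixel_num; x++) {
+;			sum    += cur_orig[x];
+;			sqrsum += cur_orig[x] * cur_orig[x];
+;		}
+;		cur_orig += iPicStride;
+;	}
+;	*pSum += sum;  *pSqrSum += sqrsum;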
+ALIGN 16
+sum_sqrsum_mbrow_sse2:
+%define		cur_orig			esp + pushsize + 4
+%define		iPicStride			esp + pushsize + 8
+%define		gom_pixel_num		esp + pushsize + 12
+%define		pSum				esp + pushsize + 16
+%define		pSqrSum				esp + pushsize + 20
+%define		pushsize			8
+	push		esi
+	push		ebx
+	mov			esi,	[cur_orig]
+	mov			eax,	[gom_pixel_num]
+	mov			ebx,	[iPicStride]
+	mov			ecx,	16					;MB_WIDTH_LUMA
+	pxor		xmm0,	xmm0				; zero
+	pxor		xmm1,	xmm1				; sum
+	pxor		xmm2,	xmm2				; sqr sum
+mb_width_loop_i:
+	mov			edx,	esi
+	add			edx,	eax			; end address
+gom_row_loop_i:
+	movdqa		xmm3,	[esi]
+	movdqa		xmm4,	xmm3
+	psadbw		xmm4,	xmm0
+	paddd		xmm1,	xmm4
+	movdqa		xmm4,	xmm3
+	punpcklbw	xmm4,	xmm0
+	punpckhbw	xmm3,	xmm0
+	pmaddwd		xmm4,	xmm4
+	pmaddwd		xmm3,	xmm3
+	paddd		xmm2,	xmm3
+	paddd		xmm2,	xmm4
+	add			esi,	16
+	cmp			esi,	edx
+	jl			gom_row_loop_i
+
+	sub			esi,	eax
+	add			esi,	ebx
+	loop		mb_width_loop_i
+
+	movdqa		xmm3,	xmm1
+	psrldq		xmm3,	8
+	paddd		xmm1,	xmm3
+	movd		eax,	xmm1
+	mov			edx,	[pSum]
+	add			[edx],	eax
+
+	movdqa		xmm3,	xmm2
+	psrldq		xmm3,	8
+	paddd		xmm2,	xmm3
+	movdqa		xmm3,	xmm2
+	psrldq		xmm3,	4
+	paddd		xmm2,	xmm3
+	movd		eax,	xmm2
+	mov			edx,	[pSqrSum]
+	add			[edx],	eax
+
+
+%undef		cur_orig
+%undef		iPicStride
+%undef		gom_pixel_num
+%undef		pSum
+%undef		pSqrSum
+%undef		pushsize
+	pop			ebx
+	pop			esi
+	ret
+
+
+
+WELS_EXTERN VAACalcSad_sse2
+;*************************************************************************************************************
+;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;								int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
+;*************************************************************************************************************
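+; Output layout as a scalar sketch (illustrative; the names follow the
+; prototype).  Each 16x16 macroblock contributes four 8x8 SADs, stored as
+; top-left, top-right, bottom-left, bottom-right:
+;	for each 16x16 macroblock, in raster order:
+;		psad8x8[0..3] = SAD of the four 8x8 quadrants (cur vs ref)
+;		*psadframe   += psad8x8[0] + psad8x8[1] + psad8x8[2] + psad8x8[3]
+;		psad8x8      += 4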
+
+
+ALIGN 16
+VAACalcSad_sse2:
+%define		cur_data			esp + pushsize + 4
+%define		ref_data			esp + pushsize + 8
+%define		iPicWidth			esp + pushsize + 12
+%define		iPicHeight			esp + pushsize + 16
+%define		iPicStride			esp + pushsize + 20
+%define		psadframe			esp + pushsize + 24
+%define		psad8x8				esp + pushsize + 28
+%define		pushsize	12
+	push	esi
+	push	edi
+	push	ebx
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		edx,	[psad8x8]
+	mov		eax,	ebx
+
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4								; iPicStride*16
+	pxor	xmm0,	xmm0
+	pxor	xmm7,	xmm7		; iFrameSad
+height_loop:
+	mov		ecx,	dword [iPicWidth]
+	push	esi
+	push	edi
+width_loop:
+	pxor	xmm6,	xmm6		;
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx],		xmm6
+	psrldq	xmm6,		8
+	movd	[edx+4],	xmm6
+
+	pxor	xmm6,	xmm6
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx+8],	xmm6
+	psrldq	xmm6,		8
+	movd	[edx+12],	xmm6
+
+	add		edx,	16
+	sub		esi,	eax
+	sub		edi,	eax
+	add		esi,	16
+	add		edi,	16
+
+	dec		ecx
+	jnz		width_loop
+
+	pop		edi
+	pop		esi
+	add		esi,	eax
+	add		edi,	eax
+
+	dec	dword [iPicHeight]
+	jnz		height_loop
+
+	mov		edx,	[psadframe]
+	movdqa	xmm5,	xmm7
+	psrldq	xmm7,	8
+	paddd	xmm7,	xmm5
+	movd	[edx],	xmm7
+
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		pushsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	ret
+
+
+WELS_EXTERN VAACalcSadVar_sse2
+;*************************************************************************************************************
+;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;		int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSadVar_sse2:
+%define		localsize		8
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		psum16x16			esp + pushsize + localsize + 32
+%define		psqsum16x16			esp + pushsize + localsize + 36
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		pushsize		16
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		edx,	[psad8x8]
+	mov		eax,	ebx
+
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4							; iPicStride*16
+	pxor	xmm0,	xmm0
+	pxor	xmm7,	xmm7		; iFrameSad
+var_height_loop:
+	mov		ecx,	dword [iPicWidth]
+	mov		[tmp_esi],	esi
+	mov		[tmp_edi],	edi
+var_width_loop:
+	pxor	xmm6,	xmm6		; hiQuad_loQuad pSad8x8
+	pxor	xmm5,	xmm5		; pSum16x16
+	pxor	xmm4,	xmm4		; sqsum_16x16
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx],		xmm6
+	psrldq	xmm6,		8
+	movd	[edx+4],	xmm6
+
+	pxor	xmm6,	xmm6
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx+8],	xmm6
+	psrldq	xmm6,		8
+	movd	[edx+12],	xmm6
+
+	mov		ebp,	[psum16x16]
+	movdqa	xmm1,	xmm5
+	psrldq	xmm1,	8
+	paddd	xmm5,	xmm1
+	movd	[ebp],	xmm5
+	add		dword [psum16x16], 4
+
+	movdqa	xmm5,	xmm4
+	psrldq	xmm5,	8
+	paddd	xmm4,	xmm5
+	movdqa	xmm3,	xmm4
+	psrldq	xmm3,	4
+	paddd	xmm4,	xmm3
+
+	mov		ebp,	[psqsum16x16]
+	movd	[ebp],	xmm4
+	add		dword [psqsum16x16], 4
+
+	add		edx,	16
+	sub		esi,	eax
+	sub		edi,	eax
+	add		esi,	16
+	add		edi,	16
+
+	dec		ecx
+	jnz		var_width_loop
+
+	mov		esi,	[tmp_esi]
+	mov		edi,	[tmp_edi]
+	add		esi,	eax
+	add		edi,	eax
+
+	dec	dword [iPicHeight]
+	jnz		var_height_loop
+
+	mov		edx,	[psadframe]
+	movdqa	xmm5,	xmm7
+	psrldq	xmm7,	8
+	paddd	xmm7,	xmm5
+	movd	[edx],	xmm7
+
+	add		esp,	localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		psum16x16
+%undef		psqsum16x16
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		pushsize
+%undef		localsize
+	ret
+
+
+
+WELS_EXTERN VAACalcSadSsd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;	int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSadSsd_sse2:
+%define		localsize		12
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		psum16x16			esp + pushsize + localsize + 32
+%define		psqsum16x16			esp + pushsize + localsize + 36
+%define		psqdiff16x16		esp + pushsize + localsize + 40
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		tmp_sadframe		esp + 8
+%define		pushsize		16
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		edx,	[psad8x8]
+	mov		eax,	ebx
+
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4							; iPicStride*16
+	pxor	xmm0,	xmm0
+	movd	[tmp_sadframe],	xmm0
+sqdiff_height_loop:
+	mov		ecx,	dword [iPicWidth]
+	mov		[tmp_esi],	esi
+	mov		[tmp_edi],	edi
+sqdiff_width_loop:
+	pxor	xmm7,	xmm7		; hiQuad_loQuad pSad8x8
+	pxor	xmm6,	xmm6		; pSum16x16
+	pxor	xmm5,	xmm5		; sqsum_16x16  four dword
+	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	movdqa	xmm1,		xmm7
+	movd	[edx],		xmm7
+	psrldq	xmm7,		8
+	paddd	xmm1,		xmm7
+	movd	[edx+4],	xmm7
+	movd	ebp,		xmm1
+	add		[tmp_sadframe],	ebp
+
+	pxor	xmm7,	xmm7
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	movdqa	xmm1,		xmm7
+	movd	[edx+8],	xmm7
+	psrldq	xmm7,		8
+	paddd	xmm1,		xmm7
+	movd	[edx+12],	xmm7
+	movd	ebp,		xmm1
+	add		[tmp_sadframe],	ebp
+
+	mov		ebp,	[psum16x16]
+	movdqa	xmm1,	xmm6
+	psrldq	xmm1,	8
+	paddd	xmm6,	xmm1
+	movd	[ebp],	xmm6
+	add		dword [psum16x16], 4
+
+	mov		ebp,	[psqsum16x16]
+	pshufd	xmm6,	xmm5,	14 ;00001110
+	paddd	xmm6,	xmm5
+	pshufd	xmm5,	xmm6,	1  ;00000001
+	paddd	xmm5,	xmm6
+	movd	[ebp],	xmm5
+	add		dword [psqsum16x16], 4
+
+	mov		ebp,	[psqdiff16x16]
+	pshufd	xmm5,	xmm4,	14	; 00001110
+	paddd	xmm5,	xmm4
+	pshufd	xmm4,	xmm5,	1	; 00000001
+	paddd	xmm4,	xmm5
+	movd	[ebp],	xmm4
+	add		dword	[psqdiff16x16],	4
+
+	add		edx,	16
+	sub		esi,	eax
+	sub		edi,	eax
+	add		esi,	16
+	add		edi,	16
+
+	dec		ecx
+	jnz		sqdiff_width_loop
+
+	mov		esi,	[tmp_esi]
+	mov		edi,	[tmp_edi]
+	add		esi,	eax
+	add		edi,	eax
+
+	dec	dword [iPicHeight]
+	jnz		sqdiff_height_loop
+
+	mov		ebx,	[tmp_sadframe]
+	mov		eax,	[psadframe]
+	mov		[eax],	ebx
+
+	add		esp,	localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		psum16x16
+%undef		psqsum16x16
+%undef		psqdiff16x16
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		tmp_sadframe
+%undef		pushsize
+%undef		localsize
+	ret
+
+
+
+
+
+WELS_EXTERN VAACalcSadBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;				int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
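+; Per 16x16 macroblock (an illustrative scalar sketch): for each of the four
+; 8x8 quadrants q,
+;	psad8x8[q]  = sum |cur - ref|		; SAD
+;	p_sd8x8[q]  = sum(cur) - sum(ref)	; signed sum difference
+;	p_mad8x8[q] = max |cur - ref|		; stored as a single byte
+; and *psadframe accumulates the SAD over the whole frame.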
+
+
+ALIGN 16
+VAACalcSadBgd_sse2:
+%define		localsize		12
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		p_sd8x8				esp + pushsize + localsize + 32
+%define		p_mad8x8			esp + pushsize + localsize + 36
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		tmp_ecx				esp + 8
+%define		pushsize		16
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		eax,	ebx
+
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4							; iPicStride*16
+	xor		ebp,	ebp
+	pxor	xmm0,	xmm0
+bgd_height_loop:
+	mov		ecx,	dword [iPicWidth]
+	mov		[tmp_esi],	esi
+	mov		[tmp_edi],	edi
+bgd_width_loop:
+	pxor	xmm7,	xmm7		; pSad8x8
+	pxor	xmm6,	xmm6		; sum_cur_8x8
+	pxor	xmm5,	xmm5		; sum_ref_8x8
+	pxor	xmm4,	xmm4		; pMad8x8
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+
+
+	mov			edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm4
+
+	;movdqa		xmm1,	xmm4
+	;punpcklbw	xmm1,	xmm0
+	;punpcklwd	xmm1,	xmm0
+	;movd		[edx],	xmm1
+	;punpckhbw	xmm4,	xmm0
+	;punpcklwd	xmm4,	xmm0
+	;movd		[edx+4],	xmm4
+	;add			edx,		8
+	;mov			[p_mad8x8],	edx
+	mov			[tmp_ecx],	ecx
+	movhlps		xmm1,	xmm4
+	movd		ecx,	xmm4
+	mov			[edx],	cl
+	movd		ecx,	xmm1
+	mov			[edx+1],cl
+	add			edx,	2
+	mov			[p_mad8x8],	edx
+
+
+	pslldq		xmm7,	4
+	pslldq		xmm6,	4
+	pslldq		xmm5,	4
+
+
+	pxor	xmm4,	xmm4		; pMad8x8
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+
+	mov			edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm4
+
+	;movdqa		xmm1,	xmm4
+	;punpcklbw	xmm1,	xmm0
+	;punpcklwd	xmm1,	xmm0
+	;movd		[edx],	xmm1
+	;punpckhbw	xmm4,	xmm0
+	;punpcklwd	xmm4,	xmm0
+	;movd		[edx+4],	xmm4
+	;add			edx,		8
+	;mov			[p_mad8x8],	edx
+	movhlps		xmm1,	xmm4
+	movd		ecx,	xmm4
+	mov			[edx],	cl
+	movd		ecx,	xmm1
+	mov			[edx+1],cl
+	add			edx,	2
+	mov			[p_mad8x8],	edx
+
+	; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
+
+	mov		edx,	[psad8x8]
+	pshufd	xmm1,	xmm7,	10001101b		; D3 D2 D1 D0
+	movdqa	[edx],	xmm1
+	add		edx,	16
+	mov		[psad8x8],	edx					; sad8x8
+
+	paddd	xmm1,	xmm7					; D1+3 D3+2 D0+1 D2+0
+	pshufd	xmm2,	xmm1,	00000011b
+	paddd	xmm1,	xmm2
+	movd	edx,	xmm1
+	add		ebp,	edx						; sad frame
+
+	mov		edx,	[p_sd8x8]
+	psubd	xmm6,	xmm5
+	pshufd	xmm1,	xmm6,	10001101b
+	movdqa	[edx],	xmm1
+	add		edx,	16
+	mov		[p_sd8x8],	edx
+
+
+	add		edx,	16
+	sub		esi,	eax
+	sub		edi,	eax
+	add		esi,	16
+	add		edi,	16
+
+	mov		ecx,	[tmp_ecx]
+	dec		ecx
+	jnz		bgd_width_loop
+
+	mov		esi,	[tmp_esi]
+	mov		edi,	[tmp_edi]
+	add		esi,	eax
+	add		edi,	eax
+
+	dec		dword [iPicHeight]
+	jnz		bgd_height_loop
+
+	mov		edx,	[psadframe]
+	mov		[edx],	ebp
+
+	add		esp,	localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		p_sd8x8
+%undef		p_mad8x8
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		pushsize
+%undef		localsize
+	ret
+
+
+
+WELS_EXTERN VAACalcSadSsdBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;		 int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
+;			int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSadSsdBgd_sse2:
+%define		localsize		16
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		psum16x16			esp + pushsize + localsize + 32
+%define		psqsum16x16			esp + pushsize + localsize + 36
+%define		psqdiff16x16		esp + pushsize + localsize + 40
+%define		p_sd8x8				esp + pushsize + localsize + 44
+%define		p_mad8x8			esp + pushsize + localsize + 48
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		tmp_sadframe		esp + 8
+%define		tmp_ecx				esp + 12
+%define		pushsize		16
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		eax,	ebx
+
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4							; iPicStride*16
+	pxor	xmm0,	xmm0
+	movd	[tmp_sadframe],	xmm0
+sqdiff_bgd_height_loop:
+	mov		ecx,	dword [iPicWidth]
+	mov		[tmp_esi],	esi
+	mov		[tmp_edi],	edi
+sqdiff_bgd_width_loop:
+	pxor	xmm7,	xmm7		; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
+	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+	pxor	xmm5,	xmm5		; pMad8x8
+	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+
+	mov		edx,		[psad8x8]
+	movdqa	xmm2,		xmm7
+	pshufd	xmm1,		xmm2,		00001110b
+	movd	[edx],		xmm2
+	movd	[edx+4],	xmm1
+	add		edx,		8
+	mov		[psad8x8],	edx			; sad8x8
+
+	paddd	xmm1,				xmm2
+	movd	edx,				xmm1
+	add		[tmp_sadframe],		edx			; iFrameSad
+
+	mov		edx,		[psum16x16]
+	movdqa	xmm1,		xmm6
+	pshufd	xmm2,		xmm1,		00001110b
+	paddd	xmm1,		xmm2
+	movd	[edx],		xmm1				; sum
+
+	mov		edx,		[p_sd8x8]
+	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
+	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
+	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
+	movq	[edx],		xmm1
+	add		edx,		8
+	mov		[p_sd8x8],	edx
+
+	mov			edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm5
+	;movdqa		xmm1,	xmm5
+	;punpcklbw	xmm1,	xmm0
+	;punpcklwd	xmm1,	xmm0
+	;movd		[edx],	xmm1
+	;punpckhbw	xmm5,	xmm0
+	;punpcklwd	xmm5,	xmm0
+	;movd		[edx+4],	xmm5
+	;add			edx,		8
+	;mov			[p_mad8x8],	edx
+	mov			[tmp_ecx],	ecx
+	movhlps		xmm1,	xmm5
+	movd		ecx,	xmm5
+	mov			[edx],	cl
+	movd		ecx,	xmm1
+	mov			[edx+1],cl
+	add			edx,	2
+	mov			[p_mad8x8],	edx
+
+	psrlq	xmm7,	32
+	psllq	xmm7,	32			; clear sad
+	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+	pxor	xmm5,	xmm5		; pMad8x8
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+
+	mov		edx,		[psad8x8]
+	movdqa	xmm2,		xmm7
+	pshufd	xmm1,		xmm2,		00001110b
+	movd	[edx],		xmm2
+	movd	[edx+4],	xmm1
+	add		edx,		8
+	mov		[psad8x8],	edx			; sad8x8
+
+	paddd	xmm1,				xmm2
+	movd	edx,				xmm1
+	add		[tmp_sadframe],		edx			; iFrameSad
+
+	mov		edx,			[psum16x16]
+	movdqa	xmm1,			xmm6
+	pshufd	xmm2,			xmm1,		00001110b
+	paddd	xmm1,			xmm2
+	movd	ebp,			xmm1				; sum
+	add		[edx],			ebp
+	add		edx,			4
+	mov		[psum16x16],	edx
+
+	mov		edx,			[psqsum16x16]
+	psrlq	xmm7,			32
+	pshufd	xmm2,			xmm7,		00001110b
+	paddd	xmm2,			xmm7
+	movd	[edx],			xmm2				; sqsum
+	add		edx,			4
+	mov		[psqsum16x16],	edx
+
+	mov		edx,		[p_sd8x8]
+	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
+	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
+	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
+	movq	[edx],		xmm1
+	add		edx,		8
+	mov		[p_sd8x8],	edx
+
+	mov		edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm5
+	;movdqa		xmm1,	xmm5
+	;punpcklbw	xmm1,	xmm0
+	;punpcklwd	xmm1,	xmm0
+	;movd		[edx],	xmm1
+	;punpckhbw	xmm5,	xmm0
+	;punpcklwd	xmm5,	xmm0
+	;movd		[edx+4],	xmm5
+	;add			edx,		8
+	;mov			[p_mad8x8],	edx
+	movhlps		xmm1,	xmm5
+	movd		ecx,	xmm5
+	mov			[edx],	cl
+	movd		ecx,	xmm1
+	mov			[edx+1],cl
+	add			edx,	2
+	mov			[p_mad8x8],	edx
+
+	mov		edx,		[psqdiff16x16]
+	pshufd	xmm1,		xmm4,		00001110b
+	paddd	xmm4,		xmm1
+	pshufd	xmm1,		xmm4,		00000001b
+	paddd	xmm4,		xmm1
+	movd	[edx],		xmm4
+	add		edx,		4
+	mov		[psqdiff16x16],	edx
+
+	add		edx,	16
+	sub		esi,	eax
+	sub		edi,	eax
+	add		esi,	16
+	add		edi,	16
+
+	mov		ecx,	[tmp_ecx]
+	dec		ecx
+	jnz		sqdiff_bgd_width_loop
+
+	mov		esi,	[tmp_esi]
+	mov		edi,	[tmp_edi]
+	add		esi,	eax
+	add		edi,	eax
+
+	dec	dword [iPicHeight]
+	jnz		sqdiff_bgd_height_loop
+
+	mov		edx,	[psadframe]
+	mov		ebp,	[tmp_sadframe]
+	mov		[edx],	ebp
+
+	add		esp,	localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		psum16x16
+%undef		psqsum16x16
+%undef		psqdiff16x16
+%undef		p_sd8x8
+%undef		p_mad8x8
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		pushsize
+%undef		localsize
+	ret
--- a/processing/src/common/WelsVP.def
+++ b/processing/src/common/WelsVP.def
@@ -1,36 +1,36 @@
-;*!
-;* \copy
-;*     Copyright (c)  2011-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-
-LIBRARY		    welsvp.dll
-EXPORTS
-                CreateVpInterface    PRIVATE
-                DestroyVpInterface   PRIVATE      
\ No newline at end of file
+;*!
+;* \copy
+;*     Copyright (c)  2011-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+
+LIBRARY		    welsvp.dll
+EXPORTS
+                CreateVpInterface    PRIVATE
+                DestroyVpInterface   PRIVATE
\ No newline at end of file
--- a/processing/src/common/WelsVP.rc
+++ b/processing/src/common/WelsVP.rc
@@ -27,18 +27,18 @@
 // TEXTINCLUDE
 //
 
-1 TEXTINCLUDE 
+1 TEXTINCLUDE
 BEGIN
     "resource.h\0"
 END
 
-2 TEXTINCLUDE 
+2 TEXTINCLUDE
 BEGIN
     "#include ""windows.h""\r\n"
     "\0"
 END
 
-3 TEXTINCLUDE 
+3 TEXTINCLUDE
 BEGIN
     "\r\n"
     "\0"
--- a/testbin/AutoBuild_Windows_VS2008.bat
+++ b/testbin/AutoBuild_Windows_VS2008.bat
@@ -23,7 +23,7 @@
 rem call VP build
 echo "Welsvp Building....."
 cd %VPProjectDir%
-rem vcclean 
+rem vcclean
 %VCBUILDEXE% WelsVP_2008.vcproj
 
 
@@ -33,7 +33,7 @@
 
 cd %CurDir%
 cd %EncoderProjectDir%
-rem vcclean 
+rem vcclean
 %VCBUILDEXE% WelsEncCore.vcproj
 %VCBUILDEXE% WelsEncPlus.vcproj
 %VCBUILDEXE% encConsole.vcproj
@@ -44,7 +44,7 @@
 
 cd %CurDir%
 cd %DecoderProjectDir%
-rem vcclean 
+rem vcclean
 %VCBUILDEXE% WelsDecCore.vcproj
 %VCBUILDEXE% WelsDecPlus.vcproj
 %VCBUILDEXE% decConsole.vcproj
--- a/testbin/AutoBuild_Windows_VS2010.bat
+++ b/testbin/AutoBuild_Windows_VS2010.bat
@@ -36,7 +36,7 @@
 cd %CurDir%
 cd %EncoderProjectDir%
 echo current directory is %EncoderProjectDir%
-rem vcclean 
+rem vcclean
 
 echo %VCMSBUILDEXE_RELEASE% WelsEncoder_2010.sln
 %VCMSBUILDEXE_RELEASE% WelsEncoder_2010.sln
@@ -49,7 +49,7 @@
 cd %CurDir%
 cd %DecoderProjectDir%
 echo current directory is %DecoderProjectDir%
-rem vcclean 
+rem vcclean
 
 echo %VCMSBUILDEXE_RELEASE% WelsDecoder_2010.sln
 
--- a/testbin/AutoBuild_Windows_VS2012.bat
+++ b/testbin/AutoBuild_Windows_VS2012.bat
@@ -36,7 +36,7 @@
 cd %CurDir%
 cd %EncoderProjectDir%
 echo current directory is %EncoderProjectDir%
-rem vcclean 
+rem vcclean
 
 echo %VCMSBUILDEXE_RELEASE% WelsEncoder_2012.sln
 %VCMSBUILDEXE_RELEASE% WelsEncoder_2012.sln
@@ -49,7 +49,7 @@
 cd %CurDir%
 cd %DecoderProjectDir%
 echo current directory is %DecoderProjectDir%
-rem vcclean 
+rem vcclean
 
 echo %VCMSBUILDEXE_RELEASE% WelsDecoder_2012.sln
 
--- a/testbin/layer2.cfg
+++ b/testbin/layer2.cfg
@@ -1,39 +1,39 @@
-# Layer Configuration File
-
-
-#============================== INPUT / OUTPUT ==============================
-SourceWidth     320                     # Input  frame width
-SourceHeight    192                    # Input  frame height
-FrameRateIn     12                      # Input  frame rate [Hz]
-FrameRateOut    12                     # Output frame rate [Hz]
-InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
-ReconFile       rec_layer2.yuv          # Reconstructed file
-
-#============================== CODING ==============================
-ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
-
-InitialQP       24			# Quantization parameters for base quality layer
-#================================ RATE CONTROL ===============================
-SpatialBitrate		600		# Unit: kbps, controled by DisableRC also
-#============================== MultiSlice Slice Argument ==============================
-# for S/M Slice(s) mode settings
-SliceMode			0		# 0: sigle slice mode; >0: multiple slices mode, see below;
-SliceSize			1500
-SliceNum			1		# multiple slices number specified
-
-SlicesAssign0		960		# count number of MBs in slice #0
-SlicesAssign1		0		# count number of MBs in slice #1
-SlicesAssign2		0		# count number of MBs in slice #2
-SlicesAssign3		0		# count number of MBs in slice #3 -- seting here is for better testing
-SlicesAssign4		0		# count number of MBs in slice #4
-SlicesAssign5		0		# count number of MBs in slice #5
-SlicesAssign6		0		# count number of MBs in slice #6
-SlicesAssign7		0		# count number of MBs in slice #7
-
-### DESIGN OF SLICE MODE ####
-# 0 SM_SINGLE_SLICE			| SliceNum==1
-# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
-# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
-# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
-# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
-
+# Layer Configuration File
+
+
+#============================== INPUT / OUTPUT ==============================
+SourceWidth     320                     # Input  frame width
+SourceHeight    192                    # Input  frame height
+FrameRateIn     12                      # Input  frame rate [Hz]
+FrameRateOut    12                     # Output frame rate [Hz]
+InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
+ReconFile       rec_layer2.yuv          # Reconstructed file
+
+#============================== CODING ==============================
+ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
+
+InitialQP       24			# Quantization parameters for base quality layer
+#================================ RATE CONTROL ===============================
+SpatialBitrate		600		# Unit: kbps; also controlled by DisableRC
+#============================== MultiSlice Slice Argument ==============================
+# for S/M Slice(s) mode settings
+SliceMode			0		# 0: single slice mode; >0: multiple slice mode, see below
+SliceSize			1500
+SliceNum			1		# number of slices when multiple slice mode is used
+
+SlicesAssign0		960		# count number of MBs in slice #0
+SlicesAssign1		0		# count number of MBs in slice #1
+SlicesAssign2		0		# count number of MBs in slice #2
+SlicesAssign3		0		# count number of MBs in slice #3 -- setting here is for better testing
+SlicesAssign4		0		# count number of MBs in slice #4
+SlicesAssign5		0		# count number of MBs in slice #5
+SlicesAssign6		0		# count number of MBs in slice #6
+SlicesAssign7		0		# count number of MBs in slice #7
+
+### DESIGN OF SLICE MODE ####
+# 0 SM_SINGLE_SLICE			| SliceNum==1
+# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enables dynamic slicing for multi-threading
+# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Needs the MB count of each slice as input. In addition, if another constraint in slice_argument is present, it must be followed; typically, when both the MB count and the slice size are constrained, re-encoding may be involved.
+# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Typically a single row of MBs per slice, plus a slice size constraint that may involve re-encoding
+# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (the slice count is unknown until the current frame has been encoded)
+
--- a/testbin/layer2_vd.cfg
+++ b/testbin/layer2_vd.cfg
@@ -1,39 +1,39 @@
-# Layer Configuration File
-
-
-#============================== INPUT / OUTPUT ==============================
-SourceWidth     320                     # Input  frame width
-SourceHeight    192                    # Input  frame height
-FrameRateIn     12                      # Input  frame rate [Hz]
-FrameRateOut    12                     # Output frame rate [Hz]
-InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
-ReconFile       rec_layer2.yuv          # Reconstructed file
-
-#============================== CODING ==============================
-ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
-
-InitialQP       24			# Quantization parameters for base quality layer
-#================================ RATE CONTROL ===============================
-SpatialBitrate		600		# Unit: kbps, controled by DisableRC also
-#============================== MultiSlice Slice Argument ==============================
-# for S/M Slice(s) mode settings
-SliceMode			0		# 0: sigle slice mode; >0: multiple slices mode, see below;
-SliceSize			1500
-SliceNum			1		# multiple slices number specified
-
-SlicesAssign0		960		# count number of MBs in slice #0
-SlicesAssign1		0		# count number of MBs in slice #1
-SlicesAssign2		0		# count number of MBs in slice #2
-SlicesAssign3		0		# count number of MBs in slice #3 -- seting here is for better testing
-SlicesAssign4		0		# count number of MBs in slice #4
-SlicesAssign5		0		# count number of MBs in slice #5
-SlicesAssign6		0		# count number of MBs in slice #6
-SlicesAssign7		0		# count number of MBs in slice #7
-
-### DESIGN OF SLICE MODE, 100804, Sijia ####
-# 0 SM_SINGLE_SLICE			| SliceNum==1
-# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
-# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
-# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Specially for TP. Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
-# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
-
+# Layer Configuration File
+
+
+#============================== INPUT / OUTPUT ==============================
+SourceWidth     320                     # Input  frame width
+SourceHeight    192                    # Input  frame height
+FrameRateIn     12                      # Input  frame rate [Hz]
+FrameRateOut    12                     # Output frame rate [Hz]
+InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
+ReconFile       rec_layer2.yuv          # Reconstructed file
+
+#============================== CODING ==============================
+ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
+
+InitialQP       24			# Quantization parameters for base quality layer
+#================================ RATE CONTROL ===============================
+SpatialBitrate		600		# Unit: kbps; also controlled by DisableRC
+#============================== MultiSlice Slice Argument ==============================
+# for S/M Slice(s) mode settings
+SliceMode			0		# 0: single-slice mode; >0: multi-slice mode, see below
+SliceSize			1500
+SliceNum			1		# multiple slices number specified
+
+SlicesAssign0		960		# number of MBs assigned to slice #0
+SlicesAssign1		0		# number of MBs assigned to slice #1
+SlicesAssign2		0		# number of MBs assigned to slice #2
+SlicesAssign3		0		# number of MBs assigned to slice #3 -- set here for better test coverage
+SlicesAssign4		0		# number of MBs assigned to slice #4
+SlicesAssign5		0		# number of MBs assigned to slice #5
+SlicesAssign6		0		# number of MBs assigned to slice #6
+SlicesAssign7		0		# number of MBs assigned to slice #7
+
+### DESIGN OF SLICE MODE, 100804, Sijia ####
+# 0 SM_SINGLE_SLICE			| SliceNum==1
+# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enables dynamic slicing for multi-threading
+# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Requires the MB count of each slice as input; any other constraints in slice_argument must also be honored. Typically, when both MB count and slice size are constrained, re-encoding may be involved.
+# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Specially for TP. Typically a single row of MBs per slice, plus a slice-size constraint that may involve re-encoding
+# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (the number of slices is unknown until the current frame has been encoded)
+
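All of these layer files share one line grammar: a key, whitespace, a value, and an optional trailing '#' comment. The configuration is actually read by the console test app (under codec/console); the standalone C++ sketch below, with a hypothetical ParseCfg helper, is only an illustration of how little machinery the format needs, not the project's real parser.

#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>

// Parse "Key  Value   # comment" lines into a key->value map.
std::map<std::string, std::string> ParseCfg(const std::string& path) {
  std::map<std::string, std::string> cfg;
  std::ifstream in(path);
  std::string line;
  while (std::getline(in, line)) {
    const auto hash = line.find('#');
    if (hash != std::string::npos) line.erase(hash);  // strip trailing comment
    std::istringstream fields(line);
    std::string key, value;
    if (fields >> key >> value) cfg[key] = value;     // blank lines yield nothing
  }
  return cfg;
}

int main() {
  const auto cfg = ParseCfg("layer2_vd.cfg");
  // at() throws if the key is absent, which is fine for a sketch.
  std::cout << "SpatialBitrate = " << cfg.at("SpatialBitrate") << " kbps\n";
}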
--- a/testbin/layer2_vd_rc.cfg
+++ b/testbin/layer2_vd_rc.cfg
@@ -1,39 +1,39 @@
-# Layer Configuration File
-
-
-#============================== INPUT / OUTPUT ==============================
-SourceWidth     320                     # Input  frame width
-SourceHeight    192                    # Input  frame height
-FrameRateIn     12                      # Input  frame rate [Hz]
-FrameRateOut    12                     # Output frame rate [Hz]
-InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
-ReconFile       rec_layer2.yuv          # Reconstructed file
-
-#============================== CODING ==============================
-ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
-
-InitialQP       24			# Quantization parameters for base quality layer
-#================================ RATE CONTROL ===============================
-SpatialBitrate		600		# Unit: kbps, controled by DisableRC also
-#============================== MultiSlice Slice Argument ==============================
-# for S/M Slice(s) mode settings
-SliceMode			0		# 0: sigle slice mode; >0: multiple slices mode, see below;
-SliceSize			1500
-SliceNum			1		# multiple slices number specified
-
-SlicesAssign0		960		# count number of MBs in slice #0
-SlicesAssign1		0		# count number of MBs in slice #1
-SlicesAssign2		0		# count number of MBs in slice #2
-SlicesAssign3		0		# count number of MBs in slice #3 -- seting here is for better testing
-SlicesAssign4		0		# count number of MBs in slice #4
-SlicesAssign5		0		# count number of MBs in slice #5
-SlicesAssign6		0		# count number of MBs in slice #6
-SlicesAssign7		0		# count number of MBs in slice #7
-
-### DESIGN OF SLICE MODE, 100804, Sijia ####
-# 0 SM_SINGLE_SLICE			| SliceNum==1
-# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
-# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
-# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Specially for TP. Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
-# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
-
+# Layer Configuration File
+
+
+#============================== INPUT / OUTPUT ==============================
+SourceWidth     320                     # Input  frame width
+SourceHeight    192                    # Input  frame height
+FrameRateIn     12                      # Input  frame rate [Hz]
+FrameRateOut    12                     # Output frame rate [Hz]
+InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
+ReconFile       rec_layer2.yuv          # Reconstructed file
+
+#============================== CODING ==============================
+ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
+
+InitialQP       24			# Quantization parameters for base quality layer
+#================================ RATE CONTROL ===============================
+SpatialBitrate		600		# Unit: kbps; also controlled by DisableRC
+#============================== MultiSlice Slice Argument ==============================
+# for S/M Slice(s) mode settings
+SliceMode			0		# 0: single-slice mode; >0: multi-slice mode, see below
+SliceSize			1500
+SliceNum			1		# multiple slices number specified
+
+SlicesAssign0		960		# number of MBs assigned to slice #0
+SlicesAssign1		0		# number of MBs assigned to slice #1
+SlicesAssign2		0		# number of MBs assigned to slice #2
+SlicesAssign3		0		# number of MBs assigned to slice #3 -- set here for better test coverage
+SlicesAssign4		0		# number of MBs assigned to slice #4
+SlicesAssign5		0		# number of MBs assigned to slice #5
+SlicesAssign6		0		# number of MBs assigned to slice #6
+SlicesAssign7		0		# number of MBs assigned to slice #7
+
+### DESIGN OF SLICE MODE, 100804, Sijia ####
+# 0 SM_SINGLE_SLICE			| SliceNum==1
+# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enables dynamic slicing for multi-threading
+# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Requires the MB count of each slice as input; any other constraints in slice_argument must also be honored. Typically, when both MB count and slice size are constrained, re-encoding may be involved.
+# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Specially for TP. Typically a single row of MBs per slice, plus a slice-size constraint that may involve re-encoding
+# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (the number of slices is unknown until the current frame has been encoded)
+
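A quick sanity check on this layer's numbers: at a SpatialBitrate of 600 kbps and a FrameRateOut of 12 Hz, rate control has on average 600000 / 12 = 50000 bits, i.e. 6250 bytes, per frame. Assuming SliceSize is a byte count, which its MTU-like value of 1500 suggests, the short sketch below makes the budget explicit; it is illustrative arithmetic only.

#include <cstdio>

int main() {
  const int bitrateKbps = 600;  // SpatialBitrate
  const int fps = 12;           // FrameRateOut
  const int bitsPerFrame  = bitrateKbps * 1000 / fps;  // 50000
  const int bytesPerFrame = bitsPerFrame / 8;          // 6250
  std::printf("budget: %d bits (%d bytes) per frame\n", bitsPerFrame, bytesPerFrame);
  // Under SM_DYN_SLICE, a SliceSize of 1500 would split an average
  // frame into roughly 6250 / 1500 ~= 4-5 slices.
  return 0;
}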
--- a/testbin/welsenc.cfg
+++ b/testbin/welsenc.cfg
@@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile              test.264               # Bitstream file
-MaxFrameRate            30                     # Maximum frame rate [Hz]
-FramesToBeEncoded       -1                    # Number of frames (at input frame rate)
-
-GOPSize                 4                     # GOP Size (at maximum frame rate), 16
-IntraPeriod            0                    # Intra Period ( multipler of GoP size or -1)
-EnableSpsPpsIDAddition  1
-
-EnableFrameCropping 	1 		       # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
-LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
-
-InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-
-#============================== RATE CONTROL ==============================
-EnableRC				1						# ENABLE RC
-TargetBitrate			5000				    # Unit: kbps, controled by EnableRC also
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise                   0              # Enable Denoise (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization			1			# Enable Adaptive Quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference             0              # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod                       30             # Long Term Reference Marking Period 
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
-												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
-												# Can be disabled when no inter spatial layer prediction in case of its value as 0
-NumLayers              1                      # Number of layers
-//LayerCfg                layer0.cfg		# Layer 0 configuration file
-//LayerCfg                layer1.cfg		# Layer 1 configuration file
-LayerCfg                layer2.cfg		# Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile              test.264               # Bitstream file
+MaxFrameRate            30                     # Maximum frame rate [Hz]
+FramesToBeEncoded       -1                    # Number of frames (at input frame rate)
+
+GOPSize                 4                     # GOP Size (at maximum frame rate), 16
+IntraPeriod            0                    # Intra Period (multiple of GOP size, or -1)
+EnableSpsPpsIDAddition  1
+
+EnableFrameCropping 	1 		       # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off,
+                                               # 2: on except for slice boundaries,
+                                               # 3: two-stage; slice boundaries filtered in the second stage,
+                                               # 4: Luma on but Chroma off (w.r.t. idc=0),
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
+                                               # 6: Luma on, two-stage; slice boundaries filtered in the second stage, but Chroma off (w.r.t. idc=3))
+LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
+LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
+
+InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off,
+                                               # 2: on except for slice boundaries,
+                                               # 3: two-stage; slice boundaries filtered in the second stage,
+                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0),
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
+                                               # 6: Luma on, two-stage; slice boundaries filtered in the second stage, but Chroma off (w.r.t. idc=3))
+InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc			    1	# 0: auto (the encoder decides dynamically); 1: multi-threading disabled; >1: number of threads to use
+
+#============================== RATE CONTROL ==============================
+EnableRC				1						# Enable rate control (1: enable, 0: disable)
+TargetBitrate			5000				    # Unit: kbps; also controlled by EnableRC
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise                   0              # Enable Denoise (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection		 1     # Background detection (BGD) control (1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization			1			# Enable Adaptive Quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference             0              # Enable Long Term Reference (1: enable, 0: disable)
+LtrMarkPeriod                       30             # Long Term Reference Marking Period
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl		0						# Control flag for adding the prefix NAL unit (0: off, 1: on)
+												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+												# It can be set to 0 (disabled) when no inter-spatial-layer prediction is used
+NumLayers              1                      # Number of layers
+//LayerCfg                layer0.cfg		# Layer 0 configuration file
+//LayerCfg                layer1.cfg		# Layer 1 configuration file
+LayerCfg                layer2.cfg		# Layer 2 configuration file
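For readers driving these settings through the library rather than the console app, here is a minimal sketch against the current public API (codec_api.h). The names used (WelsCreateSVCEncoder, SEncParamBase, and friends) are from today's headers, which postdate this patch, so treat the mapping from cfg keys to fields as an assumption rather than documented project usage.

#include <cstring>
#include "codec_api.h"  // OpenH264 public API (current layout assumed)

int main() {
  ISVCEncoder* encoder = nullptr;
  if (WelsCreateSVCEncoder(&encoder) != 0 || !encoder) return 1;

  SEncParamBase param;
  std::memset(&param, 0, sizeof(param));
  param.iUsageType     = CAMERA_VIDEO_REAL_TIME;
  param.fMaxFrameRate  = 30;       // MaxFrameRate in welsenc.cfg
  param.iPicWidth      = 320;      // SourceWidth from the layer file
  param.iPicHeight     = 192;      // SourceHeight from the layer file
  param.iTargetBitrate = 5000000;  // TargetBitrate is 5000 kbps; the API wants bits/s

  const int rv = encoder->Initialize(&param);
  // ... feed frames via EncodeFrame(), then:
  encoder->Uninitialize();
  WelsDestroySVCEncoder(encoder);
  return rv;
}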
--- a/testbin/welsenc_vd_1d.cfg
+++ b/testbin/welsenc_vd_1d.cfg
@@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile              test_vd_1d.264               # Bitstream file
-MaxFrameRate            30                     # Maximum frame rate [Hz]
-FramesToBeEncoded       -1                    # Number of frames (at input frame rate)
-
-GOPSize                 4                     # GOP Size (at maximum frame rate), 16
-IntraPeriod            0                    # Intra Period ( multipler of GoP size or -1)
-EnableSpsPpsIDAddition  1
-
-EnableFrameCropping 	1 		       # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
-LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
-
-InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-
-#============================== RATE CONTROL ==============================
-EnableRC				0						# ENABLE RC
-TargetBitrate			5000				    # Unit: kbps, controled by EnableRC also
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise                   0              # Enable Denoise (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization			0			# Enable Adaptive Quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference             1              # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod                       30             # Long Term Reference Marking Period 
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
-												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
-												# Can be disabled when no inter spatial layer prediction in case of its value as 0
-NumLayers              1                      # Number of layers
-//LayerCfg                layer0_vd.cfg		# Layer 0 configuration file
-//LayerCfg                layer1_vd.cfg		# Layer 1 configuration file
-LayerCfg                layer2_vd.cfg		# Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile              test_vd_1d.264               # Bitstream file
+MaxFrameRate            30                     # Maximum frame rate [Hz]
+FramesToBeEncoded       -1                    # Number of frames (at input frame rate)
+
+GOPSize                 4                     # GOP Size (at maximum frame rate), 16
+IntraPeriod            0                    # Intra Period (multiple of GOP size, or -1)
+EnableSpsPpsIDAddition  1
+
+EnableFrameCropping 	1 		       # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off,
+                                               # 2: on except for slice boundaries,
+                                               # 3: two-stage; slice boundaries filtered in the second stage,
+                                               # 4: Luma on but Chroma off (w.r.t. idc=0),
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
+                                               # 6: Luma on, two-stage; slice boundaries filtered in the second stage, but Chroma off (w.r.t. idc=3))
+LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
+LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
+
+InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off,
+                                               # 2: on except for slice boundaries,
+                                               # 3: two-stage; slice boundaries filtered in the second stage,
+                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0),
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
+                                               # 6: Luma on, two-stage; slice boundaries filtered in the second stage, but Chroma off (w.r.t. idc=3))
+InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc			    1	# 0: auto (the encoder decides dynamically); 1: multi-threading disabled; >1: number of threads to use
+
+#============================== RATE CONTROL ==============================
+EnableRC				0						# Enable rate control (1: enable, 0: disable)
+TargetBitrate			5000				    # Unit: kbps; also controlled by EnableRC
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise                   0              # Enable Denoise (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection		 1     # Background detection (BGD) control (1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization			0			# Enable Adaptive Quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference             1              # Enable Long Term Reference (1: enable, 0: disable)
+LtrMarkPeriod                       30             # Long Term Reference Marking Period
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl		0						# Control flag for adding the prefix NAL unit (0: off, 1: on)
+												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+												# It can be set to 0 (disabled) when no inter-spatial-layer prediction is used
+NumLayers              1                      # Number of layers
+//LayerCfg                layer0_vd.cfg		# Layer 0 configuration file
+//LayerCfg                layer1_vd.cfg		# Layer 1 configuration file
+LayerCfg                layer2_vd.cfg		# Layer 2 configuration file
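This variant turns rate control off (EnableRC 0) while enabling long-term reference, so LTR frames, rather than RC, do the heavy lifting. In the current public API those switches live in SEncParamExt; the fragment below is a hedged sketch of just that corner, with field names taken from today's codec_app_def.h, which may not match the interface as of this patch.

#include "codec_api.h"

// Fill only the fields this cfg variant touches; a real caller would start
// from ISVCEncoder::GetDefaultParams() and set the rest of SEncParamExt too.
void ApplyVd1dSettings(SEncParamExt& param) {
  param.iRCMode = RC_OFF_MODE;               // EnableRC 0
  param.bEnableLongTermReference = true;     // EnableLongTermReference 1
  param.iLtrMarkPeriod = 30;                 // LtrMarkPeriod 30
  param.bEnableAdaptiveQuant = false;        // EnableAdaptiveQuantization 0
  param.bEnableSceneChangeDetect = true;     // EnableSceneChangeDetection 1
}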
--- a/testbin/welsenc_vd_rc.cfg
+++ b/testbin/welsenc_vd_rc.cfg
@@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile              test_vd_rc.264               # Bitstream file
-MaxFrameRate            30                     # Maximum frame rate [Hz]
-FramesToBeEncoded       -1                    # Number of frames (at input frame rate), -1
-
-GOPSize                 8                     # GOP Size (at maximum frame rate), 16
-IntraPeriod            0                    # Intra Period ( multipler of GoP size or -1)
-EnableSpsPpsIDAddition  1
-
-EnableFrameCropping 	1 		       # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
-LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
-
-InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-
-#============================== RATE CONTROL ==============================
-EnableRC			1						# ENABLE RC
-TargetBitrate			600				    # Unit: kbps, controled by EnableRC also
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise                   1              # Enable Denoise (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization			1			# Enable Adaptive Quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference             1              # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod                       30             # Long Term Reference Marking Period 
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
-												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
-												# Can be disabled when no inter spatial layer prediction in case of its value as 0
-NumLayers              1                      # Number of layers
-//LayerCfg                layer0_vd.cfg		# Layer 0 configuration file
-//LayerCfg                layer1_vd.cfg		# Layer 1 configuration file
-LayerCfg                layer2_vd_rc.cfg		# Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile              test_vd_rc.264               # Bitstream file
+MaxFrameRate            30                     # Maximum frame rate [Hz]
+FramesToBeEncoded       -1                    # Number of frames (at input frame rate), -1
+
+GOPSize                 8                     # GOP Size (at maximum frame rate), 16
+IntraPeriod            0                    # Intra Period (multiple of GOP size, or -1)
+EnableSpsPpsIDAddition  1
+
+EnableFrameCropping 	1 		       # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off,
+                                               # 2: on except for slice boundaries,
+                                               # 3: two-stage; slice boundaries filtered in the second stage,
+                                               # 4: Luma on but Chroma off (w.r.t. idc=0),
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
+                                               # 6: Luma on, two-stage; slice boundaries filtered in the second stage, but Chroma off (w.r.t. idc=3))
+LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
+LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
+
+InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off,
+                                               # 2: on except for slice boundaries,
+                                               # 3: two-stage; slice boundaries filtered in the second stage,
+                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0),
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2),
+                                               # 6: Luma on, two-stage; slice boundaries filtered in the second stage, but Chroma off (w.r.t. idc=3))
+InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc			    1	# 0: auto (the encoder decides dynamically); 1: multi-threading disabled; >1: number of threads to use
+
+#============================== RATE CONTROL ==============================
+EnableRC			1						# Enable rate control (1: enable, 0: disable)
+TargetBitrate			600				    # Unit: kbps; also controlled by EnableRC
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise                   1              # Enable Denoise (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection		 1     # Background detection (BGD) control (1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization			1			# Enable Adaptive Quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference             1              # Enable Long Term Reference (1: enable, 0: disable)
+LtrMarkPeriod                       30             # Long Term Reference Marking Period
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl		0						# Control flag for adding the prefix NAL unit (0: off, 1: on)
+												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+												# It can be set to 0 (disabled) when no inter-spatial-layer prediction is used
+NumLayers              1                      # Number of layers
+//LayerCfg                layer0_vd.cfg		# Layer 0 configuration file
+//LayerCfg                layer1_vd.cfg		# Layer 1 configuration file
+LayerCfg                layer2_vd_rc.cfg		# Layer 2 configuration file
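Note that the top-level TargetBitrate here (600 kbps) matches the single layer's SpatialBitrate in layer2_vd_rc.cfg; with NumLayers at 1 the two budgets must agree. As a final hedged sketch, again using present-day API names (RC_QUALITY_MODE, RC_OFF_MODE, SEncParamExt) that are assumptions relative to this patch, the RATE CONTROL block might translate as:

#include "codec_api.h"

// Hedged mapping of the RATE CONTROL block above onto SEncParamExt.
// kbps values in the cfg files become bits per second in the API.
void ApplyRcSettings(SEncParamExt& param, bool enableRc, int targetKbps) {
  param.iRCMode = enableRc ? RC_QUALITY_MODE : RC_OFF_MODE;
  param.iTargetBitrate = targetKbps * 1000;                     // 600 kbps -> 600000 bps
  param.sSpatialLayers[0].iSpatialBitrate = targetKbps * 1000;  // single layer: same budget
}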