shithub: openh264

Download patch

ref: ec84f4bcc90d6aa447860eba8235420edc79e41f
parent: 3958118bf03c92aa547dbe3c77c5557ed4ad944b
author: volvet <qizh@cisco.com>
date: Fri Jan 3 09:49:45 EST 2014

resolve conflict

--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,7 @@
 CP=cp
 ROOTDIR=$(PWD)
 
+
 ifeq (,$(wildcard ./gtest))
 HAVE_GTEST=No
 else
@@ -13,20 +14,22 @@
 # Configurations
 ifeq ($(BUILDTYPE), Release)
 CFLAGS += -O3
-ifneq ($(ENABLE64BIT), Yes)
 USE_ASM = Yes
-endif
 else
 CFLAGS = -g
 USE_ASM = No
 endif
+
 ifeq ($(ENABLE64BIT), Yes)
 CFLAGS += -m64
 LDFLAGS += -m64
+ASMFLAGS += -DUNIX64
 else
 CFLAGS += -m32
 LDFLAGS += -m32
+ASMFLAGS += -DX86_32
 endif
+
 include build/platform-$(UNAME).mk
 
 ifeq ($(USE_ASM),Yes)
@@ -40,7 +43,8 @@
 
 #### No user-serviceable parts below this line
 INCLUDES = -Icodec/api/svc  -Icodec/common -Igtest/include
-ASM_INCLUDES = -Iprocessing/src/asm/
+#ASM_INCLUDES = -Iprocessing/src/asm/
+ASM_INCLUDES = -Icodec/common/
 
 COMMON_INCLUDES = \
     -Icodec/decoder/core/inc
@@ -83,7 +87,7 @@
 include codec/common/targets.mk
 include codec/decoder/targets.mk
 include codec/encoder/targets.mk
-include processing/targets.mk
+include codec/processing/targets.mk
 include codec/console/dec/targets.mk
 include codec/console/enc/targets.mk
 
--- a/build/mktargets.sh
+++ b/build/mktargets.sh
@@ -2,7 +2,7 @@
 (cd codec/decoder; python ../../build/mktargets.py --directory codec/decoder --library decoder --exclude StdAfx.cpp)
 (cd codec/encoder; python ../../build/mktargets.py --directory codec/encoder --library encoder --exclude DllEntry.cpp)
 (cd codec/common; python ../../build/mktargets.py --directory codec/common --library common)
-(cd processing; python ../build/mktargets.py --directory processing --library processing --exclude wels_process.cpp --exclude WelsVideoProcessor.cpp)
+(cd codec/processing; python ../../build/mktargets.py --directory codec/processing --library processing --exclude wels_process.cpp --exclude WelsVideoProcessor.cpp)
 
 (cd codec/console/dec; python ../../../build/mktargets.py --directory codec/console/dec --binary h264dec --exclude dec_console.h --exclude load_bundle_functions.cpp)
 (cd codec/console/enc; python ../../../build/mktargets.py --directory codec/console/enc --binary h264enc --exclude enc_console.h --exclude bundlewelsenc.cpp)
--- a/build/platform-darwin.mk
+++ b/build/platform-darwin.mk
@@ -1,5 +1,11 @@
-USE_ASM = No  # We don't have ASM working on Mac yet
+
 ASM = nasm
 CFLAGS += -Werror -fPIC
 LDFLAGS += -lpthread
-ASMFLAGS += -f macho --prefix _ -DNOPREFIX
+ASMFLAGS += --prefix _ -DNOPREFIX
+ifeq ($(ENABLE64BIT), Yes)
+ASMFLAGS += -f macho64
+else
+ASMFLAGS += -f macho
+endif
+
--- a/build/platform-linux.mk
+++ b/build/platform-linux.mk
@@ -1,5 +1,10 @@
 ASM = nasm
 CFLAGS += -Werror -fPIC -DLINUX -D__NO_CTYPE
 LDFLAGS += -lpthread
-ASMFLAGS += -f elf -DNOPREFIX
+ASMFLAGS += -DNOPREFIX
+ifeq ($(ENABLE64BIT), Yes)
+ASMFLAGS += -f elf64
+else 
+ASMFLAGS += -f elf32
+endif
 
--- a/codec/build/win32/dec/WelsDecCore.vcproj
+++ b/codec/build/win32/dec/WelsDecCore.vcproj
@@ -349,44 +349,6 @@
 				Filter="*.asm;*.inc"
 				>
 				<File
-					RelativePath="..\..\..\decoder\core\asm\asm_inc.asm"
-					>
-					<FileConfiguration
-						Name="Release|Win32"
-						>
-						<Tool
-							Name="VCCustomBuildTool"
-							CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-							Outputs="$(IntDir)\$(InputName).obj"
-						/>
-					</FileConfiguration>
-					<FileConfiguration
-						Name="Release|x64"
-						ExcludedFromBuild="true"
-						>
-						<Tool
-							Name="VCCustomBuildTool"
-							CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-							Outputs="$(IntDir)\$(InputName).obj"
-						/>
-					</FileConfiguration>
-					<FileConfiguration
-						Name="Debug|Win32"
-						>
-						<Tool
-							Name="VCCustomBuildTool"
-						/>
-					</FileConfiguration>
-					<FileConfiguration
-						Name="Debug|x64"
-						ExcludedFromBuild="true"
-						>
-						<Tool
-							Name="VCCustomBuildTool"
-						/>
-					</FileConfiguration>
-				</File>
-				<File
 					RelativePath="..\..\..\decoder\core\asm\block_add.asm"
 					>
 					<FileConfiguration
@@ -394,17 +356,16 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 					<FileConfiguration
 						Name="Release|x64"
-						ExcludedFromBuild="true"
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -413,23 +374,22 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 					<FileConfiguration
 						Name="Debug|x64"
-						ExcludedFromBuild="true"
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 				</File>
 				<File
-					RelativePath="..\..\..\decoder\core\asm\cpuid.asm"
+					RelativePath="..\..\..\common\cpuid.asm"
 					>
 					<FileConfiguration
 						Name="Release|Win32"
@@ -436,17 +396,16 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 					<FileConfiguration
 						Name="Release|x64"
-						ExcludedFromBuild="true"
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -455,17 +414,16 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 					<FileConfiguration
 						Name="Debug|x64"
-						ExcludedFromBuild="true"
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -478,17 +436,16 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 					<FileConfiguration
 						Name="Release|x64"
-						ExcludedFromBuild="true"
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -497,23 +454,22 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 					<FileConfiguration
 						Name="Debug|x64"
-						ExcludedFromBuild="true"
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 				</File>
 				<File
-					RelativePath="..\..\..\decoder\core\asm\deblock.asm"
+					RelativePath="..\..\..\common\deblock.asm"
 					>
 					<FileConfiguration
 						Name="Release|Win32"
@@ -520,17 +476,16 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 					<FileConfiguration
 						Name="Release|x64"
-						ExcludedFromBuild="true"
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -539,23 +494,22 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 					<FileConfiguration
 						Name="Debug|x64"
-						ExcludedFromBuild="true"
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 				</File>
 				<File
-					RelativePath="..\..\..\decoder\core\asm\expand_picture.asm"
+					RelativePath="..\..\..\common\expand_picture.asm"
 					>
 					<FileConfiguration
 						Name="Release|Win32"
@@ -562,17 +516,16 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 					<FileConfiguration
 						Name="Release|x64"
-						ExcludedFromBuild="true"
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -581,17 +534,16 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 					<FileConfiguration
 						Name="Debug|x64"
-						ExcludedFromBuild="true"
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -604,17 +556,16 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 					<FileConfiguration
 						Name="Release|x64"
-						ExcludedFromBuild="true"
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -623,23 +574,22 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 					<FileConfiguration
 						Name="Debug|x64"
-						ExcludedFromBuild="true"
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 				</File>
 				<File
-					RelativePath="..\..\..\decoder\core\asm\mb_copy.asm"
+					RelativePath="..\..\..\common\mb_copy.asm"
 					>
 					<FileConfiguration
 						Name="Release|Win32"
@@ -646,17 +596,16 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 					<FileConfiguration
 						Name="Release|x64"
-						ExcludedFromBuild="true"
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -665,23 +614,22 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 					<FileConfiguration
 						Name="Debug|x64"
-						ExcludedFromBuild="true"
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 				</File>
 				<File
-					RelativePath="..\..\..\decoder\core\asm\mc_chroma.asm"
+					RelativePath="..\..\..\common\mc_chroma.asm"
 					>
 					<FileConfiguration
 						Name="Release|Win32"
@@ -688,17 +636,16 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 					<FileConfiguration
 						Name="Release|x64"
-						ExcludedFromBuild="true"
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -707,23 +654,22 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 					<FileConfiguration
 						Name="Debug|x64"
-						ExcludedFromBuild="true"
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 				</File>
 				<File
-					RelativePath="..\..\..\decoder\core\asm\mc_luma.asm"
+					RelativePath="..\..\..\common\mc_luma.asm"
 					>
 					<FileConfiguration
 						Name="Release|Win32"
@@ -730,17 +676,16 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 					<FileConfiguration
 						Name="Release|x64"
-						ExcludedFromBuild="true"
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -749,59 +694,16 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 					<FileConfiguration
 						Name="Debug|x64"
-						ExcludedFromBuild="true"
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-							Outputs="$(IntDir)\$(InputName).obj"
-						/>
-					</FileConfiguration>
-				</File>
-				<File
-					RelativePath="..\..\..\decoder\core\asm\memzero.asm"
-					>
-					<FileConfiguration
-						Name="Release|Win32"
-						>
-						<Tool
-							Name="VCCustomBuildTool"
-							CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-							Outputs="$(IntDir)\$(InputName).obj"
-						/>
-					</FileConfiguration>
-					<FileConfiguration
-						Name="Release|x64"
-						ExcludedFromBuild="true"
-						>
-						<Tool
-							Name="VCCustomBuildTool"
-							CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-							Outputs="$(IntDir)\$(InputName).obj"
-						/>
-					</FileConfiguration>
-					<FileConfiguration
-						Name="Debug|Win32"
-						>
-						<Tool
-							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-							Outputs="$(IntDir)\$(InputName).obj"
-						/>
-					</FileConfiguration>
-					<FileConfiguration
-						Name="Debug|x64"
-						ExcludedFromBuild="true"
-						>
-						<Tool
-							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
--- a/codec/build/win32/dec/WelsDecCore_2010.vcxproj
+++ b/codec/build/win32/dec/WelsDecCore_2010.vcxproj
@@ -94,8 +94,8 @@
     <ClCompile>
       <Optimization>MaxSpeed</Optimization>
       <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
-      <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_LIB;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;X86_ASM;_LIB;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <StringPooling>true</StringPooling>
       <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
       <FunctionLevelLinking>true</FunctionLevelLinking>
@@ -125,8 +125,8 @@
     <ClCompile>
       <Optimization>MaxSpeed</Optimization>
       <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
-      <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <PreprocessorDefinitions>WIN64;NDEBUG;_LIB;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN64;NDEBUG;X86_ASM;_LIB;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <StringPooling>true</StringPooling>
       <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
       <FunctionLevelLinking>true</FunctionLevelLinking>
@@ -151,11 +151,15 @@
       <SuppressStartupBanner>true</SuppressStartupBanner>
       <OutputFile>$(OutDir)\WelsDecCore.bsc</OutputFile>
     </Bscmake>
+    <CustomBuild>
+      <Outputs>$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command>nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+    </CustomBuild>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
     <ClCompile>
       <Optimization>Disabled</Optimization>
-      <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;_DEBUG;_LIB;X86_ASM;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <MinimalRebuild>true</MinimalRebuild>
       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
@@ -184,7 +188,7 @@
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
     <ClCompile>
       <Optimization>Disabled</Optimization>
-      <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN64;_DEBUG;_LIB;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
       <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
@@ -208,176 +212,45 @@
       <SuppressStartupBanner>true</SuppressStartupBanner>
       <OutputFile>$(OutDir)\WelsDecCore.bsc</OutputFile>
     </Bscmake>
+    <CustomBuild> <!-- Default NASM rule producing win64 objects; PREFIX is not defined because x64 C symbols carry no leading underscore. -->
+      <Command>nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs>$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
   </ItemDefinitionGroup>
   <ItemGroup>
-    <CustomBuild Include="..\..\..\decoder\core\asm\asm_inc.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
     <CustomBuild Include="..\..\..\decoder\core\asm\block_add.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32  -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32  -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\decoder\core\asm\cpuid.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
     <CustomBuild Include="..\..\..\decoder\core\asm\dct.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32  -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32  -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\decoder\core\asm\deblock.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
-    <CustomBuild Include="..\..\..\decoder\core\asm\expand_picture.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
     <CustomBuild Include="..\..\..\decoder\core\asm\intra_pred.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32  -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32  -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\decoder\core\asm\mb_copy.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
-    <CustomBuild Include="..\..\..\decoder\core\asm\mc_chroma.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
-    <CustomBuild Include="..\..\..\decoder\core\asm\mc_luma.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
-    <CustomBuild Include="..\..\..\decoder\core\asm\memzero.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="..\..\..\common\logging.h" />
     <ClInclude Include="..\..\..\decoder\core\inc\as264_common.h" />
     <ClInclude Include="..\..\..\decoder\core\inc\au_parser.h" />
     <ClInclude Include="..\..\..\decoder\core\inc\bit_stream.h" />
@@ -419,6 +292,7 @@
     <ClInclude Include="..\..\..\decoder\core\inc\wels_const.h" />
   </ItemGroup>
   <ItemGroup>
+    <ClCompile Include="..\..\..\common\logging.cpp" />
     <ClCompile Include="..\..\..\decoder\core\src\au_parser.cpp" />
     <ClCompile Include="..\..\..\decoder\core\src\bit_stream.cpp" />
     <ClCompile Include="..\..\..\decoder\core\src\cpu.cpp" />
@@ -440,6 +314,68 @@
     <ClCompile Include="..\..\..\decoder\core\src\decode_slice.cpp" />
     <ClCompile Include="..\..\..\decoder\core\src\decoder_core.cpp" />
     <ClCompile Include="..\..\..\decoder\core\src\utils.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="..\..\..\common\cpuid.asm">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32  -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32  -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\common\deblock.asm">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32  -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32  -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\common\expand_picture.asm">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32  -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32  -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\common\mb_copy.asm">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32  -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32  -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\common\mc_chroma.asm">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32  -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32  -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\common\mc_luma.asm">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32  -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32  -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64  -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+    </CustomBuild>
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
--- a/codec/build/win32/dec/WelsDecCore_2010.vcxproj.filters
+++ b/codec/build/win32/dec/WelsDecCore_2010.vcxproj.filters
@@ -64,6 +64,9 @@
     <ClCompile Include="..\..\..\decoder\core\src\utils.cpp">
       <Filter>sources</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\..\common\logging.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\..\decoder\core\inc\as264_common.h">
@@ -183,39 +186,36 @@
     <ClInclude Include="..\..\..\decoder\core\inc\wels_common_basis.h">
       <Filter>headers</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\..\common\logging.h">
+      <Filter>headers</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
-    <CustomBuild Include="..\..\..\decoder\core\asm\asm_inc.asm">
-      <Filter>ASM</Filter>
-    </CustomBuild>
     <CustomBuild Include="..\..\..\decoder\core\asm\block_add.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\decoder\core\asm\cpuid.asm">
-      <Filter>ASM</Filter>
-    </CustomBuild>
     <CustomBuild Include="..\..\..\decoder\core\asm\dct.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\decoder\core\asm\deblock.asm">
+    <CustomBuild Include="..\..\..\decoder\core\asm\intra_pred.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\decoder\core\asm\expand_picture.asm">
+    <CustomBuild Include="..\..\..\common\mc_luma.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\decoder\core\asm\intra_pred.asm">
+    <CustomBuild Include="..\..\..\common\mc_chroma.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\decoder\core\asm\mb_copy.asm">
+    <CustomBuild Include="..\..\..\common\mb_copy.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\decoder\core\asm\mc_chroma.asm">
+    <CustomBuild Include="..\..\..\common\expand_picture.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\decoder\core\asm\mc_luma.asm">
+    <CustomBuild Include="..\..\..\common\deblock.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\decoder\core\asm\memzero.asm">
+    <CustomBuild Include="..\..\..\common\cpuid.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
   </ItemGroup>
--- a/codec/build/win32/dec/WelsDecPlus_2010.vcxproj
+++ b/codec/build/win32/dec/WelsDecPlus_2010.vcxproj
@@ -107,7 +107,7 @@
     <ClCompile>
       <Optimization>MaxSpeed</Optimization>
       <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
-      <AdditionalIncludeDirectories>..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\common;..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <StringPooling>true</StringPooling>
       <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
@@ -156,7 +156,7 @@
     <ClCompile>
       <Optimization>MaxSpeed</Optimization>
       <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
-      <AdditionalIncludeDirectories>..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\common;..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <StringPooling>true</StringPooling>
       <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
@@ -204,7 +204,7 @@
     </Midl>
     <ClCompile>
       <Optimization>Disabled</Optimization>
-      <AdditionalIncludeDirectories>..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\common;..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <MinimalRebuild>true</MinimalRebuild>
       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
--- a/codec/build/win32/dec/decConsole_2010.vcxproj
+++ b/codec/build/win32/dec/decConsole_2010.vcxproj
@@ -102,7 +102,7 @@
     <ClCompile>
       <Optimization>MaxSpeed</Optimization>
       <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
-      <AdditionalIncludeDirectories>..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common;..\..\..\encoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\common;..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common;..\..\..\encoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <StringPooling>true</StringPooling>
       <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
@@ -144,7 +144,7 @@
     <ClCompile>
       <Optimization>MaxSpeed</Optimization>
       <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
-      <AdditionalIncludeDirectories>..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common;..\..\..\encoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\common;..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common;..\..\..\encoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <StringPooling>true</StringPooling>
       <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
@@ -227,7 +227,7 @@
     </Midl>
     <ClCompile>
       <Optimization>Disabled</Optimization>
-      <AdditionalIncludeDirectories>..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common;..\..\..\encoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>..\..\..\common;..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common;..\..\..\encoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>WIN64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
       <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
--- a/codec/build/win32/enc/WelsEncCore.vcproj
+++ b/codec/build/win32/enc/WelsEncCore.vcproj
@@ -53,7 +53,7 @@
 				Name="VCCLCompilerTool"
 				Optimization="0"
 				AdditionalIncludeDirectories="..\..\..\encoder\core\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api;"
-				PreprocessorDefinitions="WIN32;_DEBUG;_LIB;WELS_SVC;ENCODER_CORE;X86_ASM;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;"
+				PreprocessorDefinitions="WIN32;_DEBUG;_LIB;WELS_SVC;ENCODER_CORE;X86_ASM;HAVE_CACHE_LINE_ALIGN;MT_ENABLED"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
 				RuntimeLibrary="3"
@@ -101,9 +101,9 @@
 			/>
 		</Configuration>
 		<Configuration
-			Name="Release|Win32"
-			OutputDirectory=".\..\..\..\..\bin\win32\Release"
-			IntermediateDirectory=".\..\..\..\obj\encoder\core\Release"
+			Name="Debug|x64"
+			OutputDirectory=".\..\..\..\..\bin\win64\Debug"
+			IntermediateDirectory=".\..\..\..\obj\encoder\core\Debug"
 			ConfigurationType="4"
 			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
 			UseOfMFC="0"
@@ -127,22 +127,20 @@
 			/>
 			<Tool
 				Name="VCMIDLTool"
+				TargetEnvironment="3"
 			/>
 			<Tool
 				Name="VCCLCompilerTool"
-				Optimization="3"
-				InlineFunctionExpansion="2"
-				FavorSizeOrSpeed="1"
-				WholeProgramOptimization="true"
-				AdditionalIncludeDirectories="..\..\..\encoder\core\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api"
-				PreprocessorDefinitions="WIN32;NDEBUG;_LIB;WELS_SVC;ENCODER_CORE;X86_ASM;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;"
-				StringPooling="true"
-				RuntimeLibrary="2"
-				EnableFunctionLevelLinking="true"
-				PrecompiledHeaderFile=".\..\..\..\obj\encoder\core\Release/WelsEncCore.pch"
-				AssemblerListingLocation=".\..\..\..\obj\encoder\core\Release/"
-				ObjectFile=".\..\..\..\obj\encoder\core\Release/"
-				ProgramDataBaseFileName=".\..\..\..\obj\encoder\core\Release/"
+				Optimization="0"
+				AdditionalIncludeDirectories="..\..\..\encoder\core\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api;"
+				PreprocessorDefinitions="WIN64;_DEBUG;_LIB;WELS_SVC;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;X86_ASM;MT_ENABLED"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				PrecompiledHeaderFile=".\..\..\..\obj\encoder\core\Debug/WelsEncCore.pch"
+				AssemblerListingLocation=".\..\..\..\obj\encoder\core\Debug/"
+				ObjectFile=".\..\..\..\obj\encoder\core\Debug/"
+				ProgramDataBaseFileName=".\..\..\..\obj\encoder\core\Debug/"
 				WarningLevel="3"
 				SuppressStartupBanner="true"
 				DebugInformationFormat="3"
@@ -152,7 +150,7 @@
 			/>
 			<Tool
 				Name="VCResourceCompilerTool"
-				PreprocessorDefinitions="NDEBUG"
+				PreprocessorDefinitions="_DEBUG"
 				Culture="1033"
 			/>
 			<Tool
@@ -160,7 +158,6 @@
 			/>
 			<Tool
 				Name="VCLibrarianTool"
-				AdditionalOptions="/LTCG"
 				OutputFile="$(OutDir)\welsecore.lib"
 				SuppressStartupBanner="true"
 			/>
@@ -184,9 +181,9 @@
 			/>
 		</Configuration>
 		<Configuration
-			Name="Debug|x64"
-			OutputDirectory=".\..\..\..\..\bin\win64\Debug"
-			IntermediateDirectory=".\..\..\..\obj\encoder\core\Debug"
+			Name="Release|Win32"
+			OutputDirectory=".\..\..\..\..\bin\win32\Release"
+			IntermediateDirectory=".\..\..\..\obj\encoder\core\Release"
 			ConfigurationType="4"
 			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
 			UseOfMFC="0"
@@ -210,20 +207,22 @@
 			/>
 			<Tool
 				Name="VCMIDLTool"
-				TargetEnvironment="3"
 			/>
 			<Tool
 				Name="VCCLCompilerTool"
-				Optimization="0"
-				AdditionalIncludeDirectories="..\..\..\encoder\core\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api;"
-				PreprocessorDefinitions="WIN64;_DEBUG;_LIB;WELS_SVC;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED"
-				MinimalRebuild="true"
-				BasicRuntimeChecks="3"
-				RuntimeLibrary="3"
-				PrecompiledHeaderFile=".\..\..\..\obj\encoder\core\Debug/WelsEncCore.pch"
-				AssemblerListingLocation=".\..\..\..\obj\encoder\core\Debug/"
-				ObjectFile=".\..\..\..\obj\encoder\core\Debug/"
-				ProgramDataBaseFileName=".\..\..\..\obj\encoder\core\Debug/"
+				Optimization="3"
+				InlineFunctionExpansion="2"
+				FavorSizeOrSpeed="1"
+				WholeProgramOptimization="true"
+				AdditionalIncludeDirectories="..\..\..\encoder\core\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api"
+				PreprocessorDefinitions="WIN32;NDEBUG;_LIB;WELS_SVC;ENCODER_CORE;X86_ASM;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;"
+				StringPooling="true"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="true"
+				PrecompiledHeaderFile=".\..\..\..\obj\encoder\core\Release/WelsEncCore.pch"
+				AssemblerListingLocation=".\..\..\..\obj\encoder\core\Release/"
+				ObjectFile=".\..\..\..\obj\encoder\core\Release/"
+				ProgramDataBaseFileName=".\..\..\..\obj\encoder\core\Release/"
 				WarningLevel="3"
 				SuppressStartupBanner="true"
 				DebugInformationFormat="3"
@@ -233,7 +232,7 @@
 			/>
 			<Tool
 				Name="VCResourceCompilerTool"
-				PreprocessorDefinitions="_DEBUG"
+				PreprocessorDefinitions="NDEBUG"
 				Culture="1033"
 			/>
 			<Tool
@@ -241,6 +240,7 @@
 			/>
 			<Tool
 				Name="VCLibrarianTool"
+				AdditionalOptions="/LTCG"
 				OutputFile="$(OutDir)\welsecore.lib"
 				SuppressStartupBanner="true"
 			/>
@@ -299,7 +299,7 @@
 				FavorSizeOrSpeed="1"
 				WholeProgramOptimization="true"
 				AdditionalIncludeDirectories="..\..\..\encoder\core\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api"
-				PreprocessorDefinitions="WIN64;NDEBUG;_LIB;WELS_SVC;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED"
+				PreprocessorDefinitions="WIN64;NDEBUG;_LIB;WELS_SVC;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;X86_ASM"
 				StringPooling="true"
 				RuntimeLibrary="2"
 				EnableFunctionLevelLinking="true"
@@ -368,7 +368,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -377,7 +377,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -408,7 +408,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -417,7 +417,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -448,7 +448,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -457,7 +457,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -488,7 +488,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -497,7 +497,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -528,7 +528,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -537,7 +537,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -568,21 +568,21 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
 						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
+						PreprocessorDefinitions="OUPUT_REF_PIC"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
 						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions="OUPUT_REF_PIC"
+						PreprocessorDefinitions=""
 					/>
 				</FileConfiguration>
 				<FileConfiguration
@@ -608,7 +608,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -617,7 +617,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -648,7 +648,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -657,7 +657,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -688,7 +688,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -697,7 +697,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -728,7 +728,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -737,7 +737,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -768,7 +768,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -777,7 +777,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -808,7 +808,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -817,7 +817,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -852,7 +852,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -861,7 +861,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -892,7 +892,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -901,7 +901,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -932,7 +932,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -941,7 +941,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -972,7 +972,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -981,7 +981,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1012,7 +1012,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1021,7 +1021,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1052,7 +1052,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1061,7 +1061,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1096,7 +1096,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1105,7 +1105,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1140,7 +1140,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1149,7 +1149,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1180,7 +1180,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1189,7 +1189,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1220,7 +1220,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1229,7 +1229,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1260,7 +1260,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1269,7 +1269,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1300,7 +1300,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1309,7 +1309,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1340,7 +1340,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1349,7 +1349,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1380,7 +1380,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1389,7 +1389,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1420,7 +1420,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1429,7 +1429,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -1686,7 +1686,7 @@
 			Filter="*.asm;*.inc"
 			>
 			<File
-				RelativePath="..\..\..\encoder\core\asm\asm_inc.asm"
+				RelativePath="..\..\..\encoder\core\asm\coeff.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -1693,80 +1693,40 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
 					Name="Debug|x64"
-					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\..\encoder\core\asm\coeff.asm"
-				>
 				<FileConfiguration
-					Name="Debug|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
 					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
 					Name="Release|x64"
-					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\encoder\core\asm\cpuid.asm"
+				RelativePath="..\..\..\common\cpuid.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -1773,36 +1733,34 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
 					Name="Release|x64"
-					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1815,42 +1773,40 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
 					Name="Release|x64"
-					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\encoder\core\asm\deblock.asm"
+				RelativePath="..\..\..\common\deblock.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -1857,42 +1813,40 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
 					Name="Release|x64"
-					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\encoder\core\asm\expand_picture.asm"
+				RelativePath="..\..\..\common\expand_picture.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -1899,36 +1853,34 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
 					Name="Release|x64"
-					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1941,42 +1893,40 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
 					Name="Release|x64"
-					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\encoder\core\asm\intra_pred_util.asm"
+				RelativePath="..\..\..\common\mb_copy.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -1983,42 +1933,40 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
 					Name="Release|x64"
-					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\encoder\core\asm\mb_copy.asm"
+				RelativePath="..\..\..\common\mc_chroma.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -2025,42 +1973,40 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
 					Name="Release|x64"
-					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\encoder\core\asm\mc_chroma.asm"
+				RelativePath="..\..\..\common\mc_luma.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -2067,78 +2013,34 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
 					Name="Debug|x64"
-					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\..\encoder\core\asm\mc_luma.asm"
-				>
-				<FileConfiguration
-					Name="Debug|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
 					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
 					Name="Release|x64"
-					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2151,36 +2053,34 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
 					Name="Release|x64"
-					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2193,36 +2093,34 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
 					Name="Release|x64"
-					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2235,36 +2133,34 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
 					Name="Release|x64"
-					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2277,42 +2173,40 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
 					Name="Release|x64"
-					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\encoder\core\asm\vaa.asm"
+				RelativePath="..\..\..\common\vaa.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -2319,36 +2213,34 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 				<FileConfiguration
 					Name="Release|x64"
-					ExcludedFromBuild="true"
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
--- a/codec/build/win32/enc/WelsEncCore_2010.vcxproj
+++ b/codec/build/win32/enc/WelsEncCore_2010.vcxproj
@@ -127,7 +127,7 @@
     <ClCompile>
       <Optimization>Disabled</Optimization>
       <AdditionalIncludeDirectories>..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <PreprocessorDefinitions>WIN64;_DEBUG;_LIB;WELS_SVC;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>WIN64;_DEBUG;X86_ASM;_LIB;WELS_SVC;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
       <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
       <PrecompiledHeaderOutputFile>.\..\..\..\obj\encoder\core\Debug/WelsEncCore.pch</PrecompiledHeaderOutputFile>
@@ -197,7 +197,7 @@
       <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
       <WholeProgramOptimization>true</WholeProgramOptimization>
       <AdditionalIncludeDirectories>..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <PreprocessorDefinitions>WIN64;NDEBUG;_LIB;WELS_SVC;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>WIN64;NDEBUG;X86_ASM;_LIB;WELS_SVC;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <StringPooling>true</StringPooling>
       <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
       <FunctionLevelLinking>true</FunctionLevelLinking>
@@ -565,255 +565,154 @@
     <ClInclude Include="..\..\..\encoder\core\inc\wels_preprocess.h" />
   </ItemGroup>
   <ItemGroup>
-    <CustomBuild Include="..\..\..\encoder\core\asm\asm_inc.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
     <CustomBuild Include="..\..\..\encoder\core\asm\coeff.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\cpuid.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
     <CustomBuild Include="..\..\..\encoder\core\asm\dct.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\deblock.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+    <CustomBuild Include="..\..\..\encoder\core\asm\intra_pred.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\expand_picture.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+    <CustomBuild Include="..\..\..\encoder\core\asm\memzero.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\intra_pred.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+    <CustomBuild Include="..\..\..\encoder\core\asm\quant.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\intra_pred_util.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+    <CustomBuild Include="..\..\..\encoder\core\asm\satd_sad.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\mb_copy.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+    <CustomBuild Include="..\..\..\encoder\core\asm\score.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\mc_chroma.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="..\..\..\common\cpuid.asm">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\mc_luma.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+    <CustomBuild Include="..\..\..\common\deblock.asm">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\memzero.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+    <CustomBuild Include="..\..\..\common\expand_picture.asm">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\quant.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+    <CustomBuild Include="..\..\..\common\mb_copy.asm">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\satd_sad.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+    <CustomBuild Include="..\..\..\common\mc_chroma.asm">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\score.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+    <CustomBuild Include="..\..\..\common\mc_luma.asm">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\vaa.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+    <CustomBuild Include="..\..\..\common\vaa.asm">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
     </CustomBuild>
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/codec/build/win32/enc/WelsEncCore_2010.vcxproj.filters
+++ b/codec/build/win32/enc/WelsEncCore_2010.vcxproj.filters
@@ -278,52 +278,46 @@
     </ClInclude>
   </ItemGroup>
   <ItemGroup>
-    <CustomBuild Include="..\..\..\encoder\core\asm\asm_inc.asm">
-      <Filter>ASM</Filter>
-    </CustomBuild>
     <CustomBuild Include="..\..\..\encoder\core\asm\coeff.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\cpuid.asm">
-      <Filter>ASM</Filter>
-    </CustomBuild>
     <CustomBuild Include="..\..\..\encoder\core\asm\dct.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\deblock.asm">
+    <CustomBuild Include="..\..\..\encoder\core\asm\intra_pred.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\expand_picture.asm">
+    <CustomBuild Include="..\..\..\encoder\core\asm\memzero.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\intra_pred.asm">
+    <CustomBuild Include="..\..\..\encoder\core\asm\quant.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\intra_pred_util.asm">
+    <CustomBuild Include="..\..\..\encoder\core\asm\satd_sad.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\mb_copy.asm">
+    <CustomBuild Include="..\..\..\encoder\core\asm\score.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\mc_chroma.asm">
+    <CustomBuild Include="..\..\..\common\mc_luma.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\mc_luma.asm">
+    <CustomBuild Include="..\..\..\common\mc_chroma.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\memzero.asm">
+    <CustomBuild Include="..\..\..\common\mb_copy.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\quant.asm">
+    <CustomBuild Include="..\..\..\common\expand_picture.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\satd_sad.asm">
+    <CustomBuild Include="..\..\..\common\deblock.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\score.asm">
+    <CustomBuild Include="..\..\..\common\cpuid.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
-    <CustomBuild Include="..\..\..\encoder\core\asm\vaa.asm">
+    <CustomBuild Include="..\..\..\common\vaa.asm">
       <Filter>ASM</Filter>
     </CustomBuild>
   </ItemGroup>
--- a/codec/build/win32/enc/WelsEncoder_2008.sln
+++ b/codec/build/win32/enc/WelsEncoder_2008.sln
@@ -17,7 +17,7 @@
 		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562} = {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}
 	EndProjectSection
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP", "..\..\..\..\processing\build\win32\WelsVP_2008.vcproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP", "..\..\..\processing\build\win32\WelsVP_2008.vcproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
--- a/codec/build/win32/enc/WelsEncoder_2010.sln
+++ b/codec/build/win32/enc/WelsEncoder_2010.sln
@@ -10,7 +10,7 @@
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "encConsole_2010", "encConsole_2010.vcxproj", "{8509E2A8-2CBD-49E2-B564-3EFF1E927459}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP_2010", "..\..\..\..\processing\build\win32\WelsVP_2010.vcxproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP_2010", "..\..\..\processing\build\win32\WelsVP_2010.vcxproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
--- /dev/null
+++ b/codec/common/asm_inc.asm
@@ -1,0 +1,509 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  asm_inc.asm
+;*
+;*  Abstract
+;*      macro and constant
+;*
+;*  History
+;*      8/5/2009 Created
+;*
+;*
+;*************************************************************************/
+;***********************************************************************
+; Options, for DEBUG
+;***********************************************************************
+
+%if 1	; change to 0 to make MOVDQ expand to movdqu instead of movdqa
+	%define MOVDQ movdqa
+%else
+	%define MOVDQ movdqu
+%endif
+
+%if 1	; change to 0 to make WELSEMMS expand to nothing instead of emms
+	%define WELSEMMS	emms
+%else
+	%define WELSEMMS
+%endif
+
+
+;***********************************************************************
+; Macros
+;***********************************************************************
+
+DEFAULT REL
+
+%ifdef WIN64 ; Windows x64 ;************************************
+
+BITS 64
+
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%define arg5 [rsp + push_num*8 + 40]
+%define arg6 [rsp + push_num*8 + 48]
+%define arg7 [rsp + push_num*8 + 56]
+%define arg8 [rsp + push_num*8 + 64]
+%define arg9 [rsp + push_num*8 + 72]
+%define arg10 [rsp + push_num*8 + 80]
+
+%define r0 rcx
+%define r1 rdx
+%define r2 r8
+%define r3 r9
+%define r4 rax
+%define r5 r10
+%define r6 r11
+%define r7 rsp
+
+%define r0d ecx
+%define r1d edx
+%define r2d r8d
+%define r3d r9d
+%define r4d eax
+%define r5d r10d
+%define r6d r11d
+
+%define r0w  cx
+%define r1w  dx
+%define r2w  r8w
+%define r3w  r9w
+
+%define r0b  cl       ; low byte of r0 (rcx)
+%define r1b  dl       ; low byte of r1 (rdx)
+%define r2b  r8b      ; low byte of r2 (r8); NASM spells it r8b, not the Intel-manual r8l
+%define r3b  r9b      ; low byte of r3 (r9); NASM spells it r9b, not r9l
+
+%define  PUSHRFLAGS     pushfq
+%define  POPRFLAGS      popfq
+%define  retrq          rax
+%define  retrd          eax
+
+%elifdef UNIX64 ; Unix x64 ;************************************
+
+BITS 64
+
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%define arg5 r8
+%define arg6 r9
+%define arg7 [rsp + push_num*8 + 8]
+%define arg8 [rsp + push_num*8 + 16]
+%define arg9 [rsp + push_num*8 + 24]
+%define arg10 [rsp + push_num*8 + 32]
+
+%define r0 rdi
+%define r1 rsi
+%define r2 rdx
+%define r3 rcx
+%define r4 r8
+%define r5 r9
+%define r6 r10
+%define r7 rsp
+
+%define r0d edi
+%define r1d esi
+%define r2d edx
+%define r3d ecx
+%define r4d r8d
+%define r5d r9d
+%define r6d r10d
+
+%define r0w  di
+%define r1w  si
+%define r2w  dx
+%define r3w  cx
+
+%define r0b  dil
+%define r1b  sil
+%define r2b  dl
+%define r3b  cl
+
+%define  PUSHRFLAGS     pushfq
+%define  POPRFLAGS      popfq
+%define  retrq          rax
+%define  retrd          eax 
+
+%elifdef X86_32 ; X86_32 ;************************************
+
+BITS 32
+
+%define arg1 [esp + push_num*4 + 4]
+%define arg2 [esp + push_num*4 + 8]
+%define arg3 [esp + push_num*4 + 12]
+%define arg4 [esp + push_num*4 + 16]
+%define arg5 [esp + push_num*4 + 20]
+%define arg6 [esp + push_num*4 + 24]
+%define arg7 [esp + push_num*4 + 28]
+%define arg8 [esp + push_num*4 + 32]
+%define arg9 [esp + push_num*4 + 36]
+%define arg10 [esp + push_num*4 + 40]
+
+%define r0 eax
+%define r1 ecx
+%define r2 edx
+%define r3 ebx
+%define r4 esi
+%define r5 edi
+%define r6 ebp
+%define r7 esp
+
+%define r0d eax
+%define r1d ecx
+%define r2d edx
+%define r3d ebx
+%define r4d esi
+%define r5d edi
+%define r6d ebp
+
+%define r0w ax
+%define r1w cx
+%define r2w dx
+%define r3w bx
+
+%define r0b al
+%define r1b cl
+%define r2b dl
+%define r3b bl
+
+%define  PUSHRFLAGS     pushfd
+%define  POPRFLAGS      popfd
+%define  retrq          eax      ; 32-bit mode has no 64-bit registers, so retrq also maps to eax
+%define  retrd          eax
+
+%endif
+
+%macro LOAD_PARA 2	; %1 = destination, %2 = source; expands to a single mov
+    mov %1, %2
+%endmacro
+
+%macro LOAD_1_PARA 0	; X86_32 only: load stack argument 1 into r0; expands to nothing otherwise
+    %ifdef X86_32
+        mov r0, arg1
+    %endif
+%endmacro
+
+%macro LOAD_2_PARA 0	; X86_32 only: load stack arguments 1-2 into r0-r1; expands to nothing otherwise
+    %ifdef X86_32
+        mov r0, arg1
+        mov r1, arg2
+    %endif
+%endmacro
+
+%macro LOAD_3_PARA 0	; X86_32 only: load stack arguments 1-3 into r0-r2; expands to nothing otherwise
+    %ifdef X86_32
+        mov r0, arg1
+        mov r1, arg2
+        mov r2, arg3
+    %endif
+%endmacro
+
+%macro LOAD_4_PARA 0	; X86_32 only: push r3, bump push_num by 1, then load arguments 1-4 into r0-r3
+    %ifdef X86_32
+        push r3
+        %assign  push_num push_num+1
+        mov r0, arg1
+        mov r1, arg2
+        mov r2, arg3
+        mov r3, arg4
+    %endif
+%endmacro
+
+%macro LOAD_5_PARA 0	; leaves arguments 1-5 in r0-r4; UNIX64 needs no code because arg1-arg5 are already r0-r4
+    %ifdef X86_32
+        push r3
+        push r4
+        %assign  push_num push_num+2
+        mov r0, arg1
+        mov r1, arg2
+        mov r2, arg3
+        mov r3, arg4
+        mov r4, arg5
+    %elifdef WIN64
+        mov r4, arg5
+    %endif
+%endmacro
+
+%macro LOAD_6_PARA 0	; leaves arguments 1-6 in r0-r5; UNIX64 needs no code because arg1-arg6 are already r0-r5
+    %ifdef X86_32
+        push r3
+        push r4
+        push r5
+        %assign  push_num push_num+3
+        mov r0, arg1
+        mov r1, arg2
+        mov r2, arg3
+        mov r3, arg4
+        mov r4, arg5
+        mov r5, arg6
+    %elifdef WIN64
+        mov r4, arg5
+        mov r5, arg6
+    %endif
+%endmacro
+
+%macro LOAD_7_PARA 0	; leaves arguments 1-7 in r0-r6; each target loads only what is not already in r0-r6
+    %ifdef X86_32
+        push r3
+        push r4
+        push r5
+        push r6
+        %assign  push_num push_num+4
+        mov r0, arg1
+        mov r1, arg2
+        mov r2, arg3
+        mov r3, arg4
+        mov r4, arg5
+        mov r5, arg6
+        mov r6, arg7
+    %elifdef WIN64
+        mov r4, arg5
+        mov r5, arg6
+        mov r6, arg7
+    %elifdef UNIX64
+        mov r6, arg7
+    %endif
+%endmacro
+
+
+
+%macro LOAD_4_PARA_POP 0	; X86_32 only: pop the r3 that LOAD_4_PARA pushed; push_num is not adjusted here
+    %ifdef X86_32
+	pop r3
+    %endif
+%endmacro
+
+%macro LOAD_5_PARA_POP 0	; X86_32 only: pop r4 then r3, the reverse of LOAD_5_PARA's pushes; push_num is not adjusted here
+    %ifdef X86_32
+        pop r4
+	pop r3
+    %endif
+%endmacro
+
+%macro LOAD_6_PARA_POP 0	; X86_32 only: pop r5, r4, r3, the reverse of LOAD_6_PARA's pushes; push_num is not adjusted here
+    %ifdef X86_32
+        pop r5
+  	pop r4
+ 	pop r3
+    %endif
+%endmacro
+
+%macro LOAD_7_PARA_POP 0	; X86_32 only: pop r6, r5, r4, r3, the reverse of LOAD_7_PARA's pushes; push_num is not adjusted here
+    %ifdef X86_32
+        pop r6
+        pop r5
+        pop r4
+        pop r3
+    %endif
+%endmacro
+
+%macro SIGN_EXTENTION 2	; emits "movsx %1, %2" unless X86_32 is defined, in which case it expands to nothing
+    %ifndef X86_32
+            movsx %1, %2
+    %endif
+%endmacro
+ 
+%macro WELS_EXTERN 1	; declare %1 global; with PREFIX defined, export it as _%1 and alias %1 to _%1
+    %ifdef PREFIX
+        global _%1
+        %define %1 _%1
+    %else
+        global %1
+    %endif
+%endmacro
+
+%macro WELS_AbsW 2	; %1 = max(%1, -%1) per signed word; %2 is overwritten as scratch
+	pxor        %2, %2	; %2 = 0
+    psubw       %2, %1	; %2 = 0 - %1
+    pmaxsw      %1, %2	; keep the larger of %1 and its negation
+%endmacro
+
+%macro MMX_XSwap  4	; %1 = unpack suffix (e.g. wd, dq); %2 = punpckl(%2,%3), %4 = punpckh(old %2,%3)
+    movq		%4, %2	; keep a copy of %2 for the high unpack
+    punpckh%1   %4, %3
+    punpckl%1   %2, %3
+%endmacro
+
+; pOut mm1, mm4, mm5, mm3, i.e. the results are left in %1, %4, %5, %3; %2 ends up holding an intermediate
+%macro MMX_Trans4x4W 5	; %1-%4 = inputs, %5 = extra register overwritten by the first swap
+    MMX_XSwap wd, %1, %2, %5
+    MMX_XSwap wd, %3, %4, %2
+    MMX_XSwap dq, %1, %3, %4
+    MMX_XSwap dq, %5, %2, %3
+%endmacro
+
+;for TRANSPOSE
+%macro SSE2_XSawp 4	; %1 = unpack suffix (bw, wd, dq, qdq); %2 = punpckl(%2,%3), %4 = punpckh(old %2,%3)
+    movdqa      %4, %2	; keep a copy of %2 for the high unpack
+    punpckl%1   %2, %3
+    punpckh%1   %4, %3
+%endmacro
+
+; in: xmm1, xmm2, xmm3, xmm4  pOut:  xmm1, xmm4, xmm5, xmm3 (i.e. %1, %4, %5, %3)
+%macro SSE2_Trans4x4D 5	; %1-%4 = inputs, %5 = extra register overwritten by the first swap
+    SSE2_XSawp dq,  %1, %2, %5
+    SSE2_XSawp dq,  %3, %4, %2
+    SSE2_XSawp qdq, %1, %3, %4
+    SSE2_XSawp qdq, %5, %2, %3
+%endmacro
+
+;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4 (i.e. %1, %2, %4, %5)
+%macro SSE2_TransTwo4x4W 5	; %1-%4 = inputs, %5 = extra register; %3 ends up holding an intermediate
+    SSE2_XSawp wd,  %1, %2, %5
+    SSE2_XSawp wd,  %3, %4, %2
+    SSE2_XSawp dq,  %1, %3, %4
+    SSE2_XSawp dq,  %5, %2, %3
+    SSE2_XSawp qdq, %1, %5, %2
+    SSE2_XSawp qdq, %4, %3, %5
+%endmacro
+
+;in:  m1, m2, m3, m4, m5, m6, m7, m8
+;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+%macro SSE2_TransTwo8x8B 9	; %1-%8 = the eight inputs; %9 = temporary storage, read and written with movdqa
+	movdqa	%9,	%8	; park %8 so the register can serve as a swap destination
+	SSE2_XSawp bw,  %1, %2, %8
+	SSE2_XSawp bw,  %3, %4, %2
+	SSE2_XSawp bw,  %5, %6, %4
+	movdqa	%6, %9	; bring the parked %8 value back, now in %6
+	movdqa	%9, %4
+	SSE2_XSawp bw,  %7, %6, %4
+
+	SSE2_XSawp wd,  %1, %3, %6
+	SSE2_XSawp wd,  %8, %2, %3
+	SSE2_XSawp wd,  %5, %7, %2
+	movdqa	%7, %9
+	movdqa	%9, %3
+	SSE2_XSawp wd,  %7, %4, %3
+
+	SSE2_XSawp dq,  %1, %5, %4
+	SSE2_XSawp dq,  %6, %2, %5
+	SSE2_XSawp dq,  %8, %7, %2
+	movdqa	%7, %9
+	movdqa	%9, %5
+	SSE2_XSawp dq,  %7, %3, %5
+
+	SSE2_XSawp qdq,  %1, %8, %3
+	SSE2_XSawp qdq,  %4, %2, %8
+	SSE2_XSawp qdq,  %6, %7, %2
+	movdqa	%7, %9
+	movdqa	%9, %1
+	SSE2_XSawp qdq,  %7, %5, %1
+	movdqa	%5, %9
+%endmacro
+
+;xmm0, xmm6, xmm7, [eax], [ecx]
+;xmm7 = 0, eax = pix1, ecx = pix2, xmm0 save the result
+%macro SSE2_LoadDiff8P 5	; %1 = result, %2 = scratch, %3 = zero register, %4/%5 = the two 8-byte sources
+    movq         %1, %4
+    punpcklbw    %1, %3	; interleave with %3 (zero) to widen the bytes of %4 to words
+    movq         %2, %5
+    punpcklbw    %2, %3	; same widening for the bytes of %5
+    psubw        %1, %2	; word-wise %4 - %5
+%endmacro
+
+; m2 = m1 + m2, m1 = m1 - m2
+%macro SSE2_SumSub 3	; %1 = m1, %2 = m2, %3 = scratch holding the original m2
+	movdqa  %3, %2
+    paddw   %2, %1
+    psubw   %1, %3
+%endmacro
+
+
+%macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
+	mov %3h, %3l		; duplicate the low byte into the high byte, e.g. ah = al
+	movd %1, e%3x		; i.e., %1 = eax (=b0)
+	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
+	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
+
+;copy a dw into a xmm for 8 times
+%macro  SSE2_Copy8Times 2	; %1 = destination xmm, %2 = source whose low word is replicated
+		movd	%1, %2
+		punpcklwd %1, %1	; low dword now holds the low word twice
+		pshufd	%1,	%1,	0	; broadcast that dword to all four dword lanes
+%endmacro
+
+;copy a db into a xmm for 16 times
+%macro  SSE2_Copy16Times 2	; %1 = destination xmm, %2 = source whose low word is replicated
+		movd		%1, %2
+		pshuflw		%1, %1, 0	; low four words = word 0
+		punpcklqdq	%1, %1		; all eight words = word 0
+		packuswb	%1,	%1	; pack words to bytes with unsigned saturation, so word 0 must be 0..255 for an exact copy
+%endmacro
+
+
+
+;***********************************************************************
+;preprocessor constants
+;***********************************************************************
+;dw 32,32,32,32,32,32,32,32 for xmm
+;dw 32,32,32,32 for mm
+%macro WELS_DW32 1
+	pcmpeqw %1,%1	; all bits set
+	psrlw %1,15	; every word = 1
+	psllw %1,5	; every word = 1 << 5 = 32
+%endmacro
+
+;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
+;dw 1, 1, 1, 1 for mm
+%macro WELS_DW1 1
+	pcmpeqw %1,%1	; all bits set
+	psrlw %1,15	; every word = 1
+%endmacro
+
+;all 0 for xmm and mm
+%macro	WELS_Zero 1
+	pxor %1, %1	; a register xor-ed with itself is zero
+%endmacro
+
+;dd 1, 1, 1, 1 for xmm
+;dd 1, 1 for mm
+%macro WELS_DD1 1
+	pcmpeqw %1,%1	; all bits set
+	psrld %1,31	; every dword = 1
+%endmacro
+
+;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+%macro WELS_DB1 1
+	pcmpeqw %1,%1	; all bits set
+	psrlw %1,15	; every word = 1
+	packuswb %1,%1	; pack the words to bytes: every byte = 1
+%endmacro
+
+
+
+
+
+
--- /dev/null
+++ b/codec/common/cpuid.asm
@@ -1,0 +1,220 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	cpuid.asm
+;*
+;*  Abstract
+;*		verify cpuid feature support and cpuid detection
+;*
+;*  History
+;*      04/29/2009	Created
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;******************************************************************************************
+; Macros
+;******************************************************************************************
+
+
+;******************************************************************************************
+; Code
+;******************************************************************************************
+
+SECTION .text
+
+; refer to "The IA-32 Intel(R) Architecture Software Developers Manual, Volume 2A A-M"
+; section CPUID - CPU Identification
+
+WELS_EXTERN WelsCPUIdVerify
+ALIGN 16
+;******************************************************************************************
+;   int32_t WelsCPUIdVerify()
+;******************************************************************************************
+WelsCPUIdVerify:
+    push    r1                  ; save r1, which is used as scratch below
+    PUSHRFLAGS                  ; first copy of the flags: restored by POPRFLAGS before returning
+    PUSHRFLAGS                  ; second copy of the flags: popped into r1 just below
+
+    pop      r1                 ; r1 = current flags value
+    mov      eax, r1d
+    xor      eax, 00200000h     ; flip bit 21 in the copy held in eax
+    xor      eax, r1d           ; NOTE(review): (f ^ 200000h) ^ f is always 200000h and the flipped value is never written to the flags register, so the return value is a non-zero constant - confirm this is intended
+    POPRFLAGS                   ; put the original flags back
+    pop      r1
+    ret
+
+WELS_EXTERN WelsCPUId
+ALIGN 16
+;****************************************************************************************************
+;   void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
+;****************************************************************************************************
+%ifdef       WIN64
+
+WelsCPUId:
+    push     rbx        ; cpuid overwrites ebx; restored at exit
+    push     rdx    ; rdx = pFeatureA (arg2), saved because cpuid overwrites edx
+ 
+    mov      eax,     ecx ; eax = uiIndex (arg1)
+    cpuid  
+    mov      [r9],    ecx ; *pFeatureC (arg4)
+    mov      [r8],    ebx ; *pFeatureB (arg3)
+    mov      rcx,    [rsp + 2*8 + 40]        ; rcx = pFeatureD (arg5): its stack slot, shifted by the two pushes above
+    mov      [rcx],   edx ; *pFeatureD
+    pop      rdx ; rdx = pFeatureA again
+    mov      [rdx],   eax ; *pFeatureA
+
+    pop      rbx
+    ret
+
+%elifdef     UNIX64
+WelsCPUId:
+    push     rbx ; cpuid overwrites ebx; restored at exit
+    push     rcx ; rcx = pFeatureC (arg4), saved because cpuid overwrites ecx
+    push     rdx ; rdx = pFeatureB (arg3), saved because cpuid overwrites edx
+
+    mov      eax,     edi    ; eax = uiIndex (arg1)
+    cpuid
+    mov      [r8],    edx ; *pFeatureD (arg5 is in r8); stored first because r8 is reused below
+    pop      rdx    ; rdx = pFeatureB
+    pop      r8 ; r8 = pFeatureC
+    mov      [r8],   ecx ; *pFeatureC
+    mov      [rdx],   ebx ; *pFeatureB
+    mov      [rsi],   eax ; *pFeatureA (arg2 is in rsi)
+
+    pop      rbx
+    ret
+
+%elifdef     X86_32
+
+WelsCPUId:
+    push	ebx
+    push	edi
+
+    mov     eax, [esp+12]	; operating index (arg1: +4 for the return address, +8 for the two pushes)
+    cpuid					; cpuid
+
+    ; processing various information return
+    mov     edi, [esp+16]	; pFeatureA
+    mov     [edi], eax
+    mov     edi, [esp+20]	; pFeatureB
+    mov     [edi], ebx
+    mov     edi, [esp+24]	; pFeatureC
+    mov     [edi], ecx
+    mov     edi, [esp+28]	; pFeatureD
+    mov     [edi], edx
+
+    pop	    edi
+    pop     ebx
+    ret
+
+%endif
+
+WELS_EXTERN WelsCPUSupportAVX
+; must be called after cpuid has been run with index 1, passing in the eax and ecx values it returned
+ALIGN 16
+;****************************************************************************************************
+;   int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx )
+;****************************************************************************************************
+WelsCPUSupportAVX:
+%ifdef     WIN64
+        mov   eax,    ecx               ; eax = first argument (arg1 is rcx)
+        mov   ecx,    edx               ; ecx = second argument (arg2 is rdx)
+%elifdef   UNIX64
+        mov eax, edi                    ; eax = first argument (arg1 is rdi)
+        mov ecx, esi                    ; ecx = second argument (arg2 is rsi)
+%else 
+        mov eax, [esp+4]                ; eax = first stack argument
+        mov ecx, [esp+8]                ; ecx = second stack argument
+%endif
+
+        ; refer to detection of AVX addressed in INTEL AVX manual document
+        and ecx, 018000000H             ; keep only bits 27 and 28 of the ecx argument
+        cmp ecx, 018000000H             ; check both OSXSAVE and AVX feature flags
+        jne avx_not_supported
+        ; processor supports AVX instructions and XGETBV is enabled by OS
+        mov ecx, 0                              ; specify 0 for XFEATURE_ENABLED_MASK register
+        XGETBV                                  ; result in EDX:EAX
+        and eax, 06H
+        cmp eax, 06H                    ; check OS has enabled both XMM and YMM state support
+        jne avx_not_supported
+        mov eax, 1                      ; return 1: supported
+        ret
+avx_not_supported:
+        mov eax, 0                      ; return 0: not supported
+        ret
+
+
+WELS_EXTERN  WelsCPUSupportFMA
+; must be called after cpuid has been run with index 1, passing in the eax and ecx values it returned
+ALIGN 16
+;****************************************************************************************************
+;   int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx )
+;****************************************************************************************************
+WelsCPUSupportFMA:
+%ifdef     WIN64
+        mov   eax,   ecx	; eax = first argument (arg1 is rcx)
+        mov   ecx,   edx	; ecx = second argument (arg2 is rdx)
+%elifdef   UNIX64
+        mov   eax,   edi	; eax = first argument (arg1 is rdi)
+        mov   ecx,   esi	; ecx = second argument (arg2 is rsi)
+%else
+	mov eax, [esp+4]	; eax = first stack argument
+	mov ecx, [esp+8]	; ecx = second stack argument
+%endif
+	; refer to detection of FMA addressed in INTEL AVX manual document
+	and ecx, 018001000H		; keep only bits 12, 27 and 28 of the ecx argument
+	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
+	jne fma_not_supported
+	; processor supports AVX,FMA instructions and XGETBV is enabled by OS
+	mov ecx, 0				; specify 0 for XFEATURE_ENABLED_MASK register
+	XGETBV					; result in EDX:EAX
+	and eax, 06H
+	cmp eax, 06H			; check OS has enabled both XMM and YMM state support
+	jne fma_not_supported
+	mov eax, 1			; return 1: supported
+	ret
+fma_not_supported:
+	mov eax, 0			; return 0: not supported
+	ret
+
+WELS_EXTERN WelsEmms
+ALIGN 16
+;******************************************************************************************
+;   void WelsEmms()
+;******************************************************************************************
+WelsEmms:
+	emms	; empty mmx technology states
+	ret
+
+
+
--- /dev/null
+++ b/codec/common/deblock.asm
@@ -1,0 +1,5325 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  deblock.asm
+;*
+;*  Abstract
+;*      edge loop
+;*
+;*  History
+;*      08/07/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+; Read-only constants used by the deblocking routines in this file.
+%ifdef FORMAT_COFF
+SECTION .rodata pData
+%else
+SECTION .rodata align=16
+%endif
+
+; Eight 16-bit words, each equal to 4.  The luma bS<4 filter adds this with
+; paddw immediately before an arithmetic right shift by 3 (rounding term).
+; 16-byte aligned because it is used as a memory operand of an SSE instruction.
+ALIGN   16
+FOUR_16B_SSE2:   dw   4, 4, 4, 4, 4, 4, 4, 4
+
+SECTION .text
+
+%ifdef  WIN64 
+
+
+;*******************************************************************************
+; DeblockLumaLt4V_sse2 (Win64 build)
+;
+; Filters 16 pixels of a luma edge between the row at [rcx - stride] and the
+; row at [rcx]; rows from [rcx - 3*stride] to [rcx + 2*stride] are read and
+; the four rows [rcx - 2*stride] .. [rcx + stride] are written back.
+;
+; Arguments (Microsoft x64 calling convention):
+;   rcx          pointer to the first row below the edge (16-byte aligned,
+;                all row accesses use movdqa)
+;   edx          signed row stride in bytes
+;   r8d          threshold compared against |p0 - q0|   (presumably iAlpha)
+;   r9d          threshold compared against the neighbour differences
+;                (presumably iBeta)
+;   [rsp+28h]    fifth argument, pTC: pointer to four signed bytes; each one
+;                is applied to a group of four adjacent pixels
+;
+; Register preservation: rbp, r12 and xmm6-xmm15 are callee-saved on Win64.
+; They are spilled into the local frame on entry and reloaded before return.
+;
+; NOTE(review): the body uses pabsw, which is an SSSE3 instruction, even though
+; the symbol carries an _sse2 suffix.  Callers should only dispatch here when
+; SSSE3 is available.
+;*******************************************************************************
+WELS_EXTERN   DeblockLumaLt4V_sse2
+
+DeblockLumaLt4V_sse2:
+  push        rbp
+  ; Fifth argument: after the push it sits at rsp + 8 (saved rbp) + 8 (return
+  ; address) + 20h (shadow space).  The base must be the full 64-bit rsp; a
+  ; 32-bit esp base would truncate the stack address.
+  mov         r11,[rsp + 16 + 20h]  ; pTC
+  sub         rsp,1B0h
+  lea         rbp,[rsp+20h]         ; rbp = base of the local frame
+  movd        xmm4,r8d
+  movd        xmm2,r9d
+  mov         qword [rbp+180h],r12  ; save callee-saved r12
+  mov         r10,rcx
+  movsxd      r12,edx               ; r12 = stride
+  add         edx,edx
+  movsxd      rdx,edx               ; rdx = 2*stride
+  sub         r10,r12               ; r10 = rcx - stride      (row p0)
+  movsx       r8d,byte [r11]        ; tc[0]
+  pxor        xmm3,xmm3
+  punpcklwd   xmm2,xmm2
+  movaps      [rbp+50h],xmm14       ; save xmm14
+  lea         rax,[r12+r12*2]
+  movdqa      xmm14,[rdx+rcx]       ; row q2
+  neg         rax                   ; rax = -3*stride
+  pshufd      xmm0,xmm2,0           ; xmm0 = r9d threshold in all 8 words
+  movd        xmm2,r8d
+  movsx       edx,byte [r11+1]      ; tc[1]
+  movsx       r8d,byte [r11+2]      ; tc[2]
+  movsx       r11d,byte [r11+3]     ; tc[3]
+  movaps      [rbp+70h],xmm12       ; save xmm12
+  movd        xmm1,edx
+  movaps      [rbp+80h],xmm11       ; save xmm11
+  movd        xmm12,r8d
+  movd        xmm11,r11d
+  movdqa      xmm5, [rax+rcx]       ; row p2
+  lea         rax,[r12+r12]
+  punpcklwd   xmm12,xmm12
+  neg         rax                   ; rax = -2*stride         (row p1)
+  punpcklwd   xmm11,xmm11
+  movaps      [rbp],xmm8            ; save xmm8
+  movdqa      xmm8, [r10]           ; row p0
+  punpcklwd   xmm2,xmm2
+  punpcklwd   xmm1,xmm1
+  punpcklqdq  xmm12,xmm12
+  punpcklqdq  xmm11,xmm11
+  punpcklqdq  xmm2,xmm2
+  punpcklqdq  xmm1,xmm1
+  shufps      xmm12,xmm11,88h       ; xmm12 = tc[2] x4 words, tc[3] x4 words
+  movdqa      xmm11,xmm8
+  movaps      [rbp+30h],xmm9        ; save xmm9
+  movdqa      xmm9,[rcx]            ; row q0
+  shufps      xmm2,xmm1,88h         ; xmm2  = tc[0] x4 words, tc[1] x4 words
+  movdqa      xmm1,xmm5
+  punpcklbw   xmm11,xmm3
+  movaps      [rbp+20h],xmm6        ; save xmm6
+  movaps      [rbp+60h],xmm13       ; save xmm13
+  movdqa      xmm13,xmm11
+  movaps      [rbp+90h],xmm10       ; save xmm10
+  movdqa      xmm10,xmm9
+  movdqa      xmm6,[rax+rcx]        ; row p1
+  punpcklbw   xmm1,xmm3
+  movaps      [rbp+0A0h],xmm12      ; keep tc words for the high 8 pixels
+  psubw       xmm13,xmm1
+  movaps      [rbp+40h],xmm15       ; save xmm15
+  movdqa      xmm15,xmm14
+  movaps      [rbp+10h],xmm7        ; save xmm7
+  movdqa      xmm7,xmm6
+  punpcklbw   xmm10,xmm3
+  movdqa      xmm12,[r12+rcx]       ; row q1
+  punpcklbw   xmm7,xmm3
+  punpcklbw   xmm12,xmm3
+  punpcklbw   xmm15,xmm3
+  ; ---- low 8 pixels: build the filter masks and the clipped delta ----
+  pabsw       xmm3,xmm13
+  movdqa      xmm13,xmm10
+  psubw       xmm13,xmm15
+  movdqa      [rbp+0F0h],xmm15
+  pabsw       xmm15,xmm13
+  movdqa      xmm13,xmm11
+  movdqa      [rbp+0B0h],xmm1
+  movdqa      xmm1,xmm0
+  pavgw       xmm13,xmm10
+  pcmpgtw     xmm1,xmm3
+  movdqa      [rbp+120h],xmm13
+  movaps      xmm13,xmm2
+  punpcklwd   xmm4,xmm4
+  movdqa      xmm3,xmm0
+  movdqa      [rbp+100h],xmm1
+  psubw       xmm13,xmm1
+  movdqa      xmm1,xmm10
+  pcmpgtw     xmm3,xmm15
+  pshufd      xmm4,xmm4,0           ; xmm4 = r8d threshold in all 8 words
+  psubw       xmm1,xmm11
+  movdqa      [rbp+0D0h],xmm10
+  psubw       xmm13,xmm3
+  movdqa      [rbp+110h],xmm3
+  pabsw       xmm15,xmm1
+  movdqa      xmm3,xmm4
+  psubw       xmm10,xmm12
+  pcmpgtw     xmm3,xmm15
+  pabsw       xmm15,xmm10
+  movdqa      xmm10,xmm0
+  psllw       xmm1,2
+  movdqa      [rbp+0C0h],xmm11
+  psubw       xmm11,xmm7
+  pcmpgtw     xmm10,xmm15
+  pabsw       xmm11,xmm11
+  movdqa      xmm15,xmm0
+  pand        xmm3,xmm10
+  pcmpgtw     xmm15,xmm11
+  movaps      xmm11,xmm2
+  pxor        xmm10,xmm10
+  pand        xmm3,xmm15
+  pcmpgtw     xmm11,xmm10
+  pcmpeqw     xmm10,xmm2
+  por         xmm11,xmm10           ; mask of lanes whose tc value is >= 0
+  pand        xmm3,xmm11
+  movdqa      xmm11,xmm7
+  psubw       xmm11,xmm12
+  pxor        xmm15,xmm15
+  paddw       xmm11,xmm1
+  psubw       xmm15,xmm13
+  movdqa      [rbp+0E0h],xmm12
+  paddw       xmm11,[FOUR_16B_SSE2]
+  pxor        xmm12,xmm12
+  psraw       xmm11,3
+  ; ---- high 8 pixels: unpack and repeat the same computation ----
+  punpckhbw   xmm8,xmm12
+  pmaxsw      xmm15,xmm11
+  punpckhbw   xmm5,xmm12
+  movdqa      xmm11,xmm8
+  pminsw      xmm13,xmm15
+  psubw       xmm11,xmm5
+  punpckhbw   xmm9,xmm12
+  pand        xmm13,xmm3
+  movdqa      [rbp+130h],xmm13
+  pabsw       xmm13,xmm11
+  punpckhbw   xmm14,xmm12
+  movdqa      xmm11,xmm9
+  psubw       xmm11,xmm14
+  movdqa      xmm15,xmm0
+  movdqa      [rbp+140h],xmm14
+  pabsw       xmm14,xmm11
+  movdqa      xmm11,xmm8
+  pcmpgtw     xmm15,xmm14
+  movdqa      xmm1,[r12+rcx]
+  pavgw       xmm11,xmm9
+  movdqa      [rbp+170h],xmm11
+  movdqa      xmm10,xmm9
+  punpckhbw   xmm6,xmm12
+  psubw       xmm10,xmm8
+  punpckhbw   xmm1,xmm12
+  movdqa      xmm12,xmm0
+  movaps      xmm11,[rbp+0A0h]
+  pcmpgtw     xmm12,xmm13
+  movaps      xmm13,xmm11
+  psubw       xmm13,xmm12
+  movdqa      [rbp+160h],xmm15
+  psubw       xmm13,xmm15
+  movdqa      xmm15,xmm9
+  psubw       xmm15,xmm1
+  movdqa      [rbp+150h],xmm12
+  pabsw       xmm12,xmm10
+  pabsw       xmm14,xmm15
+  movdqa      xmm15,xmm8
+  pcmpgtw     xmm4,xmm12
+  movdqa      xmm12,xmm0
+  psubw       xmm15,xmm6
+  pcmpgtw     xmm12,xmm14
+  pabsw       xmm14,xmm15
+  psllw       xmm10,2
+  pcmpgtw     xmm0,xmm14
+  movdqa      xmm14,xmm6
+  psubw       xmm14,xmm1
+  pand        xmm4,xmm12
+  paddw       xmm14,xmm10
+  pand        xmm4,xmm0
+  paddw       xmm14,[FOUR_16B_SSE2]
+  pxor        xmm15,xmm15
+  movaps      xmm12,xmm11
+  psubw       xmm15,xmm13
+  pxor        xmm0,xmm0
+  psraw       xmm14,3
+  pcmpgtw     xmm12,xmm0
+  pcmpeqw     xmm0,xmm11
+  pmaxsw      xmm15,xmm14
+  por         xmm12,xmm0
+  movdqa      xmm0,[rbp+120h]
+  pminsw      xmm13,xmm15
+  movdqa      xmm15,[rbp+0B0h]
+  movdqa      xmm10,xmm7
+  pand        xmm4,xmm12
+  paddw       xmm15,xmm0
+  pxor        xmm12,xmm12
+  paddw       xmm10,xmm7
+  movdqa      xmm14,xmm12
+  psubw       xmm15,xmm10
+  psubw       xmm14,xmm2
+  psraw       xmm15,1
+  pmaxsw      xmm15,xmm14
+  movdqa      xmm10,xmm6
+  pminsw      xmm15,xmm2
+  paddw       xmm10,xmm6
+  pand        xmm15,xmm3
+  psubw       xmm12,xmm11
+  pand        xmm15,[rbp+100h]
+  pand        xmm13,xmm4
+  paddw       xmm7,xmm15
+  paddw       xmm8,xmm13
+  movdqa      xmm15,[rbp+170h]
+  psubw       xmm9,xmm13
+  paddw       xmm5,xmm15
+  psubw       xmm5,xmm10
+  psraw       xmm5,1
+  pmaxsw      xmm5,xmm12
+  pminsw      xmm5,xmm11
+  pand        xmm5,xmm4
+  pand        xmm5,[rbp+150h]
+  paddw       xmm6,xmm5
+  movdqa      xmm5,[rbp+0C0h]
+  packuswb    xmm7,xmm6
+  movdqa      xmm6,[rbp+130h]
+  paddw       xmm5,xmm6
+  packuswb    xmm5,xmm8
+  movdqa      xmm8,[rbp+0D0h]
+  psubw       xmm8,xmm6
+  movdqa      xmm6,[rbp+0F0h]
+  paddw       xmm6,xmm0
+  movdqa      xmm0,[rbp+0E0h]
+  packuswb    xmm8,xmm9
+  movdqa      xmm9,xmm0
+  paddw       xmm9,xmm0
+  psubw       xmm6,xmm9
+  psraw       xmm6,1
+  pmaxsw      xmm14,xmm6
+  pminsw      xmm2,xmm14
+  pand        xmm2,xmm3
+  pand        xmm2,[rbp+110h]
+  paddw       xmm0,xmm2
+  movdqa      xmm2,[rbp+140h]
+  paddw       xmm2,xmm15
+  movdqa      xmm15,xmm1
+  paddw       xmm15,xmm1
+  psubw       xmm2,xmm15
+  psraw       xmm2,1
+  pmaxsw      xmm12,xmm2
+  pminsw      xmm11,xmm12
+  pand        xmm11,xmm4
+  pand        xmm11,[rbp+160h]
+  paddw       xmm1,xmm11
+  ; ---- write back the four modified rows ----
+  movdqa      [rax+rcx],xmm7        ; row p1
+  movdqa      [r10],xmm5            ; row p0
+  packuswb    xmm0,xmm1
+  movdqa      [rcx],xmm8            ; row q0
+  movdqa      [r12+rcx],xmm0        ; row q1
+  ; ---- epilogue: reload the callee-saved registers spilled on entry ----
+  ; The save slots [rbp+0] .. [rbp+90h] are not written anywhere else in the
+  ; body, so they still hold the caller's xmm6-xmm15 values.
+  movaps      xmm6,[rbp+20h]
+  movaps      xmm7,[rbp+10h]
+  movaps      xmm8,[rbp]
+  movaps      xmm9,[rbp+30h]
+  movaps      xmm10,[rbp+90h]
+  movaps      xmm11,[rbp+80h]
+  movaps      xmm12,[rbp+70h]
+  movaps      xmm13,[rbp+60h]
+  movaps      xmm14,[rbp+50h]
+  movaps      xmm15,[rbp+40h]
+  mov         r12,qword [rbp+180h]
+  lea         rsp,[rbp+190h]        ; undo "sub rsp,1B0h" (rbp = rsp + 20h)
+  pop         rbp
+  ret
+
+
+;*******************************************************************************
+; DeblockLumaEq4V_sse2 (Win64 build)
+;
+; Filters 16 pixels of a luma edge between the row at [rcx - stride] and the
+; row at [rcx].  Eight rows, [rcx - 4*stride] .. [rcx + 3*stride], are read;
+; the six rows [rcx - 3*stride] .. [rcx + 2*stride] are written back.
+;
+; Arguments (Microsoft x64 calling convention):
+;   rcx   pointer to the first row below the edge (16-byte aligned, all row
+;         accesses use movdqa)
+;   edx   signed row stride in bytes
+;   r8d   low 16 bits are broadcast and compared against |q0 - p0|
+;         (presumably iAlpha -- confirm against the C prototype)
+;   r9d   low 16 bits are broadcast and compared against the neighbour
+;         differences (presumably iBeta -- confirm against the C prototype)
+;
+; Register preservation: rbx, rbp, rsi, rdi are pushed/popped and xmm6-xmm15
+; are saved below the entry stack pointer and reloaded in the epilogue.
+; NOTE(review): pabsw is an SSSE3 instruction although the symbol is suffixed
+; _sse2.
+;*******************************************************************************
+WELS_EXTERN   DeblockLumaEq4V_sse2
+
+ALIGN  16
+DeblockLumaEq4V_sse2:
+  ; rax keeps the entry rsp so the xmm save slots can be addressed from it.
+  mov         rax,rsp 
+  push        rbx  
+  push        rbp  
+  push        rsi  
+  push        rdi  
+  sub         rsp,1D8h 
+  movaps      [rax-38h],xmm6 
+  movaps      [rax-48h],xmm7 
+  movaps      [rax-58h],xmm8 
+  pxor        xmm1,xmm1 
+  ; r10 = stride, rbp = pointer to row q0, r11d = third argument
+  movsxd      r10,edx 
+  mov         rbp,rcx 
+  mov         r11d,r8d 
+  mov         rdx,rcx 
+  mov         rdi,rbp 
+  mov         rbx,rbp 
+  movdqa      xmm5,[rbp] 
+  movaps      [rax-68h],xmm9 
+  movaps      [rax-78h],xmm10 
+  punpcklbw   xmm5,xmm1 
+  movaps      [rax-88h],xmm11 
+  movaps      [rax-98h],xmm12 
+  movaps      [rax-0A8h],xmm13 
+  movaps      [rax-0B8h],xmm14 
+  movdqa      xmm14,[r10+rbp] 
+  movaps      [rax-0C8h],xmm15 
+  ; Row pointers: rdx = rbp-4*stride, rdi = rbp-3*stride, rbx = rbp-2*stride,
+  ; r8 = rbp-stride; rsi = 2*stride and rcx = 3*stride are used as offsets.
+  lea         eax,[r10*4] 
+  movsxd      r8,eax 
+  lea         eax,[r10+r10*2] 
+  movsxd      rcx,eax 
+  lea         eax,[r10+r10] 
+  sub         rdx,r8 
+  punpcklbw   xmm14,xmm1 
+  movdqa      [rsp+90h],xmm5 
+  movdqa      [rsp+30h],xmm14 
+  movsxd      rsi,eax 
+  movsx       eax,r11w 
+  sub         rdi,rcx 
+  sub         rbx,rsi 
+  mov         r8,rbp 
+  sub         r8,r10 
+  movd        xmm0,eax 
+  movsx       eax,r9w 
+  movdqa      xmm12,[rdi] 
+  movdqa      xmm6, [rsi+rbp] 
+  movdqa      xmm13,[rbx] 
+  punpcklwd   xmm0,xmm0 
+  ; xmm11 = third argument replicated into all eight words
+  pshufd      xmm11,xmm0,0 
+  punpcklbw   xmm13,xmm1 
+  punpcklbw   xmm6,xmm1 
+  movdqa      xmm8,[r8] 
+  movd        xmm0,eax 
+  movdqa      xmm10,xmm11 
+  mov         eax,2 
+  punpcklbw   xmm8,xmm1 
+  punpcklbw   xmm12,xmm1 
+  cwde             
+  punpcklwd   xmm0,xmm0 
+  psraw       xmm10,2 
+  movdqa      xmm1,xmm8 
+  movdqa      [rsp+0F0h],xmm13 
+  movdqa      [rsp+0B0h],xmm8 
+  ; xmm7 = fourth argument replicated into all eight words
+  pshufd      xmm7,xmm0,0 
+  psubw       xmm1,xmm13 
+  movdqa      xmm0,xmm5 
+  movdqa      xmm4,xmm7 
+  movdqa      xmm2,xmm7 
+  psubw       xmm0,xmm8 
+  pabsw       xmm3,xmm0 
+  pabsw       xmm0,xmm1 
+  movdqa      xmm1,xmm5 
+  movdqa      [rsp+40h],xmm7 
+  movdqa      [rsp+60h],xmm6 
+  pcmpgtw     xmm4,xmm0 
+  psubw       xmm1,xmm14 
+  pabsw       xmm0,xmm1 
+  pcmpgtw     xmm2,xmm0 
+  pand        xmm4,xmm2 
+  movdqa      xmm0,xmm11 
+  pcmpgtw     xmm0,xmm3 
+  pand        xmm4,xmm0 
+  movd        xmm0,eax 
+  movdqa      [rsp+20h],xmm4 
+  punpcklwd   xmm0,xmm0 
+  ; xmm2 = constant 2 in every word; xmm10 becomes (third argument >> 2) + 2
+  pshufd      xmm2,xmm0,0 
+  paddw       xmm10,xmm2 
+  movdqa      [rsp+0A0h],xmm2 
+  movdqa      xmm15,xmm7 
+  pxor        xmm4,xmm4 
+  movdqa      xmm0,xmm8 
+  psubw       xmm0,xmm12 
+  mov         eax,4 
+  pabsw       xmm0,xmm0 
+  movdqa      xmm1,xmm10 
+  cwde             
+  pcmpgtw     xmm15,xmm0 
+  pcmpgtw     xmm1,xmm3 
+  movdqa      xmm3,xmm7 
+  movdqa      xmm7,[rdx] 
+  movdqa      xmm0,xmm5 
+  psubw       xmm0,xmm6 
+  pand        xmm15,xmm1 
+  punpcklbw   xmm7,xmm4 
+  movdqa      xmm9,xmm15 
+  pabsw       xmm0,xmm0 
+  psllw       xmm7,1 
+  pandn       xmm9,xmm12 
+  pcmpgtw     xmm3,xmm0 
+  paddw       xmm7,xmm12 
+  movd        xmm0,eax 
+  pand        xmm3,xmm1 
+  paddw       xmm7,xmm12 
+  punpcklwd   xmm0,xmm0 
+  paddw       xmm7,xmm12 
+  ; xmm1 = constant 4 in every word (kept at [rsp+70h])
+  pshufd      xmm1,xmm0,0 
+  paddw       xmm7,xmm13 
+  movdqa      xmm0,xmm3 
+  pandn       xmm0,xmm6 
+  paddw       xmm7,xmm8 
+  movdqa      [rsp+70h],xmm1 
+  paddw       xmm7,xmm5 
+  movdqa      [rsp+120h],xmm0 
+  movdqa      xmm0,[rcx+rbp] 
+  punpcklbw   xmm0,xmm4 
+  paddw       xmm7,xmm1 
+  movdqa      xmm4,xmm15 
+  psllw       xmm0,1 
+  psraw       xmm7,3 
+  paddw       xmm0,xmm6 
+  pand        xmm7,xmm15 
+  paddw       xmm0,xmm6 
+  paddw       xmm0,xmm6 
+  paddw       xmm0,xmm14 
+  movdqa      xmm6,xmm15 
+  paddw       xmm0,xmm5 
+  pandn       xmm6,xmm13 
+  paddw       xmm0,xmm8 
+  paddw       xmm0,xmm1 
+  psraw       xmm0,3 
+  movdqa      xmm1,xmm12 
+  paddw       xmm1,xmm13 
+  pand        xmm0,xmm3 
+  movdqa      [rsp+100h],xmm0 
+  movdqa      xmm0,xmm8 
+  paddw       xmm0,xmm5 
+  paddw       xmm1,xmm0 
+  movdqa      xmm0,xmm3 
+  paddw       xmm1,xmm2 
+  psraw       xmm1,2 
+  pandn       xmm0,xmm14 
+  pand        xmm4,xmm1 
+  movdqa      [rsp+0E0h],xmm0 
+  movdqa      xmm0,xmm5 
+  paddw       xmm0,xmm8 
+  movdqa      xmm1,[rsp+60h] 
+  paddw       xmm1,xmm14 
+  movdqa      xmm14,xmm3 
+  paddw       xmm1,xmm0 
+  movdqa      xmm0,xmm8 
+  paddw       xmm0,[rsp+30h] 
+  paddw       xmm1,xmm2 
+  psraw       xmm1,2 
+  pand        xmm14,xmm1 
+  movdqa      xmm1,xmm13 
+  paddw       xmm1,xmm13 
+  paddw       xmm1,xmm0 
+  paddw       xmm1,xmm2 
+  psraw       xmm1,2 
+  movdqa      xmm0,[rsp+30h] 
+  movdqa      xmm2,xmm13 
+  movdqa      xmm5,xmm15 
+  paddw       xmm0,[rsp+70h] 
+  pandn       xmm5,xmm1 
+  paddw       xmm2,xmm8 
+  movdqa      xmm8,[rsp+90h] 
+  movdqa      xmm1,xmm12 
+  paddw       xmm2,xmm8 
+  psllw       xmm2,1 
+  paddw       xmm2,xmm0 
+  paddw       xmm1,xmm2 
+  movdqa      xmm0,xmm8 
+  movdqa      xmm8,xmm3 
+  movdqa      xmm2,[rsp+30h] 
+  paddw       xmm0,xmm13 
+  psraw       xmm1,3 
+  pand        xmm15,xmm1 
+  movdqa      xmm1,xmm2 
+  paddw       xmm1,xmm2 
+  paddw       xmm2,[rsp+90h] 
+  paddw       xmm2,[rsp+0B0h] 
+  paddw       xmm1,xmm0 
+  movdqa      xmm0,xmm13 
+  movdqa      xmm13,[r8] 
+  paddw       xmm0, [rsp+70h] 
+  paddw       xmm1, [rsp+0A0h] 
+  psllw       xmm2,1 
+  paddw       xmm2,xmm0 
+  psraw       xmm1,2 
+  movdqa      xmm0, [rdi] 
+  pandn       xmm8,xmm1 
+  movdqa      xmm1, [rsp+60h] 
+  paddw       xmm1,xmm2 
+  movdqa      xmm2, [rbx] 
+  psraw       xmm1,3 
+  pand        xmm3,xmm1 
+  movdqa      xmm1, [rbp] 
+  movdqa      [rsp+0D0h],xmm3 
+  ; From here the rows are re-read and their high eight bytes are unpacked to
+  ; words (punpckhbw) so the same processing can be applied to pixels 8..15.
+  pxor        xmm3,xmm3 
+  punpckhbw   xmm0,xmm3 
+  punpckhbw   xmm1,xmm3 
+  punpckhbw   xmm13,xmm3 
+  movdqa      [rsp+0C0h],xmm0 
+  movdqa      xmm0,[r10+rbp] 
+  movdqa      [rsp],xmm1 
+  punpckhbw   xmm0,xmm3 
+  punpckhbw   xmm2,xmm3 
+  movdqa      [rsp+80h],xmm0 
+  movdqa      xmm0,[rsi+rbp] 
+  movdqa      [rsp+10h],xmm13 
+  punpckhbw   xmm0,xmm3 
+  movdqa      [rsp+50h],xmm0 
+  movdqa      xmm0,xmm1 
+  movdqa      xmm1,xmm13 
+  psubw       xmm0,xmm13 
+  psubw       xmm1,xmm2 
+  pabsw       xmm3,xmm0 
+  pabsw       xmm0,xmm1 
+  movdqa      xmm1,[rsp] 
+  movdqa      xmm13,[rsp+40h] 
+  movdqa      [rsp+110h],xmm2 
+  psubw       xmm1, [rsp+80h] 
+  pcmpgtw     xmm13,xmm0 
+  pcmpgtw     xmm11,xmm3 
+  pabsw       xmm0,xmm1 
+  pcmpgtw     xmm10,xmm3 
+  movdqa      xmm1, [rsp+40h] 
+  movdqa      xmm2,xmm1 
+  movdqa      xmm3,xmm1 
+  pcmpgtw     xmm2,xmm0 
+  movdqa      xmm0, [rsp+10h] 
+  pand        xmm13,xmm2 
+  pand        xmm13,xmm11 
+  movdqa      xmm11,[rsp+0C0h] 
+  psubw       xmm0,xmm11 
+  pabsw       xmm0,xmm0 
+  pcmpgtw     xmm3,xmm0 
+  pand        xmm3,xmm10 
+  movdqa      xmm0,[rsp] 
+  psubw       xmm0,[rsp+50h] 
+  movdqa      xmm2,[rdx] 
+  pabsw       xmm0,xmm0 
+  por         xmm7,xmm9 
+  movdqa      xmm9,[rsp+20h] 
+  pcmpgtw     xmm1,xmm0 
+  pand        xmm9,xmm7 
+  movdqa      xmm7,[rsp+20h] 
+  movdqa      xmm0,xmm7 
+  pandn       xmm0,xmm12 
+  movdqa      xmm12,[rsp+110h] 
+  pand        xmm1,xmm10 
+  movdqa      xmm10,[rsp+70h] 
+  movdqa      [rsp+40h],xmm1 
+  movdqa      xmm1,xmm13 
+  por         xmm9,xmm0 
+  pxor        xmm0,xmm0 
+  por         xmm4,xmm6 
+  movdqa      xmm6,xmm7 
+  punpckhbw   xmm2,xmm0 
+  por         xmm15,xmm5 
+  movdqa      xmm5,[rsp+20h] 
+  movdqa      xmm0,xmm3 
+  psllw       xmm2,1 
+  pandn       xmm0,xmm11 
+  pand        xmm6,xmm4 
+  movdqa      xmm4,[rsp] 
+  paddw       xmm2,xmm11 
+  pand        xmm5,xmm15 
+  movdqa      xmm15,[rsp+20h] 
+  paddw       xmm2,xmm11 
+  paddw       xmm2,xmm11 
+  paddw       xmm2,xmm12 
+  paddw       xmm2,[rsp+10h] 
+  paddw       xmm2,[rsp] 
+  paddw       xmm2,xmm10 
+  psraw       xmm2,3 
+  pand        xmm2,xmm3 
+  por         xmm2,xmm0 
+  pand        xmm1,xmm2 
+  movdqa      xmm0,xmm13 
+  movdqa      xmm2,xmm11 
+  pandn       xmm0,xmm11 
+  paddw       xmm2,xmm12 
+  por         xmm1,xmm0 
+  ; xmm9 = low and high halves packed back to bytes; stored to [rdi] below
+  packuswb    xmm9,xmm1 
+  movdqa      xmm0,xmm7 
+  movdqa      xmm7,[rsp+0A0h] 
+  pandn       xmm0,[rsp+0F0h] 
+  movdqa      xmm1,xmm3 
+  por         xmm6,xmm0 
+  movdqa      xmm0,[rsp+10h] 
+  paddw       xmm0,xmm4 
+  paddw       xmm2,xmm0 
+  paddw       xmm2,xmm7 
+  movdqa      xmm0,xmm3 
+  pandn       xmm0,xmm12 
+  psraw       xmm2,2 
+  pand        xmm1,xmm2 
+  por         xmm1,xmm0 
+  movdqa      xmm2,xmm13 
+  movdqa      xmm0,xmm13 
+  pand        xmm2,xmm1 
+  pandn       xmm0,xmm12 
+  movdqa      xmm1,xmm12 
+  paddw       xmm1,[rsp+10h] 
+  por         xmm2,xmm0 
+  movdqa      xmm0,xmm15 
+  pandn       xmm0,[rsp+0B0h] 
+  paddw       xmm1,xmm4 
+  ; xmm6 = packed result stored to [rbx] below
+  packuswb    xmm6,xmm2 
+  movdqa      xmm2,xmm3 
+  psllw       xmm1,1 
+  por         xmm5,xmm0 
+  movdqa      xmm0,[rsp+80h] 
+  paddw       xmm0,xmm10 
+  paddw       xmm1,xmm0 
+  paddw       xmm11,xmm1 
+  psraw       xmm11,3 
+  movdqa      xmm1,xmm12 
+  pand        xmm2,xmm11 
+  paddw       xmm1,xmm12 
+  movdqa      xmm11,[rsp+80h] 
+  movdqa      xmm0, [rsp+10h] 
+  por         xmm14,[rsp+0E0h] 
+  paddw       xmm0,xmm11 
+  movdqa      xmm4,xmm15 
+  paddw       xmm1,xmm0 
+  movdqa      xmm0,xmm13 
+  paddw       xmm1,xmm7 
+  psraw       xmm1,2 
+  pandn       xmm3,xmm1 
+  por         xmm2,xmm3 
+  movdqa      xmm1,xmm13 
+  movdqa      xmm3,[rsp+10h] 
+  pandn       xmm0,xmm3 
+  pand        xmm1,xmm2 
+  movdqa      xmm2,xmm11 
+  paddw       xmm2,[rsp] 
+  por         xmm1,xmm0 
+  movdqa      xmm0,[rsp+0D0h] 
+  por         xmm0,xmm8 
+  paddw       xmm2,xmm3 
+  ; xmm5 = packed result stored to [r8] below
+  packuswb    xmm5,xmm1 
+  movdqa      xmm8,[rsp+40h] 
+  movdqa      xmm1,[rsp+50h] 
+  movdqa      xmm3,xmm8 
+  pand        xmm4,xmm0 
+  psllw       xmm2,1 
+  movdqa      xmm0,xmm15 
+  pandn       xmm0,[rsp+90h] 
+  por         xmm4,xmm0 
+  movdqa      xmm0,xmm12 
+  paddw       xmm0,xmm10 
+  paddw       xmm2,xmm0 
+  paddw       xmm1,xmm2 
+  movdqa      xmm0,[rsp] 
+  movdqa      xmm2,xmm11 
+  paddw       xmm0,xmm12 
+  movdqa      xmm12,[rsp] 
+  paddw       xmm2,xmm11 
+  paddw       xmm2,xmm0 
+  psraw       xmm1,3 
+  movdqa      xmm0,xmm8 
+  pand        xmm3,xmm1 
+  paddw       xmm2,xmm7 
+  movdqa      xmm1,xmm13 
+  psraw       xmm2,2 
+  pandn       xmm0,xmm2 
+  por         xmm3,xmm0 
+  movdqa      xmm2,[rsp+50h] 
+  movdqa      xmm0,xmm13 
+  pandn       xmm0,xmm12 
+  pand        xmm1,xmm3 
+  paddw       xmm2,xmm11 
+  movdqa      xmm3,xmm15 
+  por         xmm1,xmm0 
+  pand        xmm3,xmm14 
+  movdqa      xmm14,[rsp+10h] 
+  movdqa      xmm0,xmm15 
+  pandn       xmm0,[rsp+30h] 
+  ; xmm4 = packed result stored to [rbp] below
+  packuswb    xmm4,xmm1 
+  movdqa      xmm1,xmm8 
+  por         xmm3,xmm0 
+  movdqa      xmm0,xmm12 
+  paddw       xmm0,xmm14 
+  paddw       xmm2,xmm0 
+  paddw       xmm2,xmm7 
+  movdqa      xmm0,xmm8 
+  pandn       xmm0,xmm11 
+  psraw       xmm2,2 
+  pand        xmm1,xmm2 
+  por         xmm1,xmm0 
+  movdqa      xmm2,xmm13 
+  movdqa      xmm0,xmm13 
+  pandn       xmm0,xmm11 
+  pand        xmm2,xmm1 
+  movdqa      xmm1,xmm15 
+  por         xmm2,xmm0 
+  ; xmm3 = packed result stored to [r10+rbp] below
+  packuswb    xmm3,xmm2 
+  movdqa      xmm0,[rsp+100h] 
+  por         xmm0,[rsp+120h] 
+  pand        xmm1,xmm0 
+  movdqa      xmm2,[rcx+rbp] 
+  movdqa      xmm7,[rsp+50h] 
+  pandn       xmm15,[rsp+60h] 
+  ; r11 = rsp as it was right after the four pushes (undoes "sub rsp,1D8h").
+  ; From here on the callee-saved xmm6-xmm15 are reloaded, interleaved with
+  ; the remaining arithmetic and the six row stores.
+  lea         r11,[rsp+1D8h] 
+  pxor        xmm0,xmm0 
+  por         xmm1,xmm15 
+  movaps      xmm15,[r11-0A8h] 
+  movdqa      [rdi],xmm9 
+  movaps      xmm9,[r11-48h] 
+  punpckhbw   xmm2,xmm0 
+  psllw       xmm2,1 
+  paddw       xmm2,xmm7 
+  paddw       xmm2,xmm7 
+  movdqa      [rbx],xmm6 
+  movaps      xmm6,[r11-18h] 
+  paddw       xmm2,xmm7 
+  paddw       xmm2,xmm11 
+  movaps      xmm11,[r11-68h] 
+  paddw       xmm2,xmm12 
+  movaps      xmm12,[r11-78h] 
+  paddw       xmm2,xmm14 
+  paddw       xmm2,xmm10 
+  psraw       xmm2,3 
+  movaps      xmm10,[r11-58h] 
+  movaps      xmm14,[r11-98h] 
+  movdqa      xmm0,xmm13 
+  pand        xmm2,xmm8 
+  pandn       xmm8,xmm7 
+  pandn       xmm13,xmm7 
+  por         xmm2,xmm8 
+  movaps      xmm7,[r11-28h] 
+  movaps      xmm8,[r11-38h] 
+  movdqa      [r8],xmm5 
+  pand        xmm0,xmm2 
+  por         xmm0,xmm13 
+  packuswb    xmm1,xmm0 
+  movaps      xmm13,[r11-88h] 
+  movdqa      [rbp],xmm4 
+  movdqa      [r10+rbp],xmm3 
+  movdqa      [rsi+rbp],xmm1 
+  ; Release the local frame and restore the pushed general-purpose registers.
+  mov         rsp,r11 
+  pop         rdi  
+  pop         rsi  
+  pop         rbp  
+  pop         rbx  
+  ret
+
+
+; DeblockChromaLt4V_sse2 (Win64 register convention)
+;   rcx, rdx : base pointers of two pixel planes (presumably Cb and Cr - confirm
+;              against the C prototype); 8 bytes are read per row from each
+;   r8d      : row stride in bytes (sign-extended)
+;   r9w      : alpha threshold
+;   [entry rsp+28h] : iBeta (16-bit), [entry rsp+30h] : pTC (4 signed bytes)
+; Filters the rows at -stride (p0) and 0 (q0) using the rows at -2*stride (p1)
+; and +stride (q1):
+;   delta = clamp(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc)
+;   p0 += delta ; q0 -= delta
+; applied only where |p0-q0| < alpha, |p1-p0| < beta, |q1-q0| < beta and tc > 0.
+; The low 8 words of each vector belong to the rcx plane, the high 8 words to
+; the rdx plane; both halves use the same tc vector.
+; NOTE: pabsw is an SSSE3 instruction even though the symbol says "sse2".
+; xmm6-xmm15 are callee-saved on Win64; they are stored in the frame slots the
+; epilogue code reloads them from (previously those slots were never written).
+WELS_EXTERN  DeblockChromaLt4V_sse2
+
+ALIGN  16
+DeblockChromaLt4V_sse2:
+  mov         rax,rsp                 ; rax = rsp at entry (for stack args)
+  push        rbx
+  push        rdi
+  sub         rsp,0C8h                ; rsp is 16-byte aligned from here on
+  movaps      [rsp+0B0h],xmm6         ; save callee-saved XMM registers in the
+  movaps      [rsp+0A0h],xmm7         ; slots the restores below read from
+  movaps      [rsp+90h],xmm8
+  movaps      [rsp+80h],xmm9
+  movaps      [rsp+70h],xmm10
+  movaps      [rsp+60h],xmm11
+  movaps      [rsp+50h],xmm12
+  movaps      [rsp+40h],xmm13
+  movaps      [rsp+30h],xmm14
+  movaps      [rsp+20h],xmm15
+  mov         r10,qword [rax + 30h]  ; pTC
+  pxor        xmm1,xmm1               ; xmm1 = 0 (for byte->word unpacking)
+  mov         rbx,rcx                 ; rbx = first plane pointer
+  movsxd      r11,r8d                 ; r11 = stride
+  movsx       ecx,byte [r10]          ; tc[0]
+  movsx       r8d,byte [r10+2]        ; tc[2]
+  mov         rdi,rdx                 ; rdi = second plane pointer
+  movq        xmm2,[rbx]              ; q0 row, first plane
+  movq        xmm9,[r11+rbx]          ; q1 row, first plane
+  movsx       edx,byte [r10+1]        ; tc[1]
+  mov         word [rsp+2],cx         ; build 8 words at [rsp]:
+  mov         word [rsp],cx           ;   tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3
+  movsx       eax,byte [r10+3]        ; tc[3]
+  mov         word [rsp+6],dx
+  mov         word [rsp+4],dx
+  movdqa      xmm11,xmm1
+  mov         word [rsp+0Eh],ax
+  mov         word [rsp+0Ch],ax
+  lea         eax,[r11+r11]
+  movsxd      rcx,eax                 ; rcx = 2*stride
+  mov         rax,rbx
+  mov         rdx,rdi
+  sub         rax,rcx                 ; rax = first plane - 2*stride
+  mov         word [rsp+0Ah],r8w
+  mov         word [rsp+8],r8w
+  movdqa      xmm6,[rsp]              ; xmm6 = tc vector
+  movdqa      xmm7,xmm6
+  movq        xmm13, [rax]            ; p1 row, first plane
+  mov         rax,rdi
+  sub         rax,rcx                 ; rax = second plane - 2*stride
+  mov         rcx,rbx
+  pcmpgtw     xmm7,xmm1               ; xmm7 = mask(tc > 0)
+  psubw       xmm11,xmm6              ; xmm11 = -tc
+  sub         rcx,r11                 ; rcx = first plane - stride  (p0 row)
+  sub         rdx,r11                 ; rdx = second plane - stride (p0 row)
+  movq        xmm0,[rax]              ; p1 row, second plane
+  movsx       eax,r9w                 ; alpha
+  movq        xmm15,[rcx]             ; p0 row, first plane
+  punpcklqdq  xmm13,xmm0              ; xmm13 = p1 (both planes)
+  movq        xmm0, [rdx]
+  movdqa      xmm4,xmm13
+  punpcklqdq  xmm15,xmm0              ; xmm15 = p0 (both planes)
+  movq        xmm0, [rdi]
+  punpcklbw   xmm4,xmm1               ; xmm4 = p1 low half as words
+  movdqa      xmm12,xmm15
+  punpcklqdq  xmm2,xmm0               ; xmm2 = q0 (both planes)
+  movq        xmm0, [r11+rdi]
+  punpcklbw   xmm12,xmm1              ; xmm12 = p0 low half as words
+  movdqa      xmm14,xmm2
+  punpcklqdq  xmm9,xmm0               ; xmm9 = q1 (both planes)
+  punpckhbw   xmm2,xmm1               ; xmm2 = q0 high half as words
+  punpcklbw   xmm14,xmm1              ; xmm14 = q0 low half as words
+  movd        xmm0,eax
+  movsx       eax,word [rsp + 0C8h + 38h] ; iBeta
+  punpckhbw   xmm13,xmm1              ; xmm13 = p1 high half as words
+  punpckhbw   xmm15,xmm1              ; xmm15 = p0 high half as words
+  movdqa      xmm3,xmm9
+  movdqa      [rsp+10h],xmm2          ; spill q0 high half
+  punpcklwd   xmm0,xmm0
+  punpckhbw   xmm9,xmm1               ; xmm9 = q1 high half as words
+  punpcklbw   xmm3,xmm1               ; xmm3 = q1 low half as words
+  movdqa      xmm1,xmm14
+  pshufd      xmm10,xmm0,0            ; xmm10 = alpha in every word
+  movd        xmm0,eax
+  mov         eax,4
+  cwde
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm8,xmm0,0             ; xmm8 = beta in every word
+  movd        xmm0,eax
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm5,xmm0,0             ; xmm5 = rounding constant 4
+  psubw       xmm1,xmm12              ; q0 - p0
+  movdqa      xmm2,xmm10
+  lea         r11,[rsp+0C8h]          ; r11 = frame top (value rsp gets in the epilogue)
+  psllw       xmm1,2                  ; (q0 - p0) * 4
+  movdqa      xmm0,xmm4
+  psubw       xmm4,xmm12              ; p1 - p0
+  psubw       xmm0,xmm3               ; p1 - q1
+  psubw       xmm3,xmm14              ; q1 - q0
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm11
+  psraw       xmm1,3                  ; unclamped delta (low half)
+  pmaxsw      xmm0,xmm1               ; max(delta, -tc)
+  pminsw      xmm6,xmm0               ; xmm6 = clamped delta
+  movdqa      xmm1,xmm8
+  movdqa      xmm0,xmm12
+  psubw       xmm0,xmm14
+  pabsw       xmm0,xmm0               ; |p0 - q0|
+  pcmpgtw     xmm2,xmm0               ; alpha > |p0 - q0|
+  pabsw       xmm0,xmm4               ; |p1 - p0|
+  pcmpgtw     xmm1,xmm0               ; beta > |p1 - p0|
+  pabsw       xmm0,xmm3               ; |q1 - q0|
+  movdqa      xmm3,[rsp]              ; reload tc for the high half
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm1,xmm0               ; beta > |q1 - q0|
+  movdqa      xmm0,xmm13
+  pand        xmm2,xmm1
+  psubw       xmm0,xmm9               ; high half: p1 - q1
+  psubw       xmm13,xmm15             ; high half: p1 - p0
+  pand        xmm2,xmm7               ; ... and tc > 0
+  pand        xmm6,xmm2               ; zero delta where the filter is off
+  paddw       xmm12,xmm6              ; p0 += delta (low half)
+  psubw       xmm14,xmm6              ; q0 -= delta (low half)
+  movdqa      xmm2,[rsp+10h]          ; q0 high half
+  movaps      xmm6,[r11-18h]          ; restore caller's xmm6
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm15              ; q0 - p0
+  psubw       xmm9,xmm2               ; q1 - q0
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm15
+  psubw       xmm0,xmm2
+  psraw       xmm1,3                  ; unclamped delta (high half)
+  pmaxsw      xmm11,xmm1
+  pabsw       xmm0,xmm0               ; |p0 - q0|
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm10,xmm0              ; alpha > |p0 - q0|
+  pabsw       xmm0,xmm13              ; |p1 - p0|
+  pminsw      xmm3,xmm11              ; xmm3 = clamped delta (high half)
+  movaps      xmm11,[r11-68h]         ; restore caller's xmm11
+  movaps      xmm13,[rsp+40h]         ; restore caller's xmm13
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm9               ; |q1 - q0|
+  movaps      xmm9, [r11-48h]         ; restore caller's xmm9
+  pand        xmm10,xmm1
+  pcmpgtw     xmm8,xmm0
+  pand        xmm10,xmm8
+  pand        xmm10,xmm7
+  movaps      xmm8,[r11-38h]          ; restore caller's xmm8
+  movaps      xmm7,[r11-28h]          ; restore caller's xmm7
+  pand        xmm3,xmm10
+  paddw       xmm15,xmm3              ; p0 += delta (high half)
+  psubw       xmm2,xmm3               ; q0 -= delta (high half)
+  movaps      xmm10,[r11-58h]         ; restore caller's xmm10
+  packuswb    xmm12,xmm15             ; repack p0 to bytes with saturation
+  movaps      xmm15,[rsp+20h]         ; restore caller's xmm15
+  packuswb    xmm14,xmm2              ; repack q0 to bytes with saturation
+  movq        [rcx],xmm12             ; p0 row, first plane
+  movq        [rbx],xmm14             ; q0 row, first plane
+  psrldq      xmm12,8
+  psrldq      xmm14,8
+  movq        [rdx],xmm12             ; p0 row, second plane
+  movaps      xmm12,[r11-78h]         ; restore caller's xmm12
+  movq        [rdi],xmm14             ; q0 row, second plane
+  movaps      xmm14,[rsp+30h]         ; restore caller's xmm14
+  mov         rsp,r11
+  pop         rdi
+  pop         rbx
+  ret
+
+
+; DeblockChromaEq4V_sse2 (Win64 register convention)
+;   rcx, rdx : base pointers of two pixel planes (presumably Cb and Cr - confirm
+;              against the C prototype); 8 bytes are read per row from each
+;   r8d      : row stride in bytes
+;   r9w      : alpha threshold
+;   [entry rsp+28h] : iBeta (16-bit)
+; With p1/p0/q0/q1 the rows at -2*stride, -stride, 0 and +stride:
+;   p0' = (2*p1 + p0 + q1 + 2) >> 2
+;   q0' = (2*q1 + q0 + p1 + 2) >> 2
+; written only where |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta.
+; The low 8 words of each vector belong to the rcx plane, the high 8 words to
+; the rdx plane.
+; NOTE: pabsw is an SSSE3 instruction even though the symbol says "sse2".
+; xmm6-xmm14 are callee-saved on Win64; they are stored in the frame slots the
+; epilogue code reloads them from (previously those slots were never written).
+WELS_EXTERN   DeblockChromaEq4V_sse2
+ALIGN 16
+DeblockChromaEq4V_sse2:
+  mov         rax,rsp
+  push        rbx
+  sub         rsp,90h                 ; rsp is 16-byte aligned from here on
+  movaps      [rsp+80h],xmm6          ; save callee-saved XMM registers in the
+  movaps      [rsp+70h],xmm7          ; slots the restores below read from
+  movaps      [rsp+60h],xmm8
+  movaps      [rsp+50h],xmm9
+  movaps      [rsp+40h],xmm10
+  movaps      [rsp+30h],xmm11
+  movaps      [rsp+20h],xmm12
+  movaps      [rsp+10h],xmm13
+  movaps      [rsp],xmm14
+  pxor        xmm1,xmm1               ; xmm1 = 0 (for byte->word unpacking)
+  mov         r11,rcx                 ; r11 = first plane pointer
+  mov         rbx,rdx                 ; rbx = second plane pointer
+  mov         r10d,r9d                ; r10w = alpha
+  movq        xmm13,[r11]             ; q0 row, first plane
+  lea         eax,[r8+r8]
+  movsxd      r9,eax                  ; r9 = 2*stride
+  mov         rax,rcx
+  sub         rax,r9
+  movq        xmm14,[rax]             ; p1 row, first plane
+  mov         rax,rdx
+  sub         rax,r9
+  movq        xmm0,[rax]              ; p1 row, second plane
+  movsxd      rax,r8d                 ; rax = stride
+  sub         rcx,rax                 ; rcx = first plane - stride  (p0 row)
+  sub         rdx,rax                 ; rdx = second plane - stride (p0 row)
+  movq        xmm12,[rax+r11]         ; q1 row, first plane
+  movq        xmm10,[rcx]             ; p0 row, first plane
+  punpcklqdq  xmm14,xmm0              ; xmm14 = p1 (both planes)
+  movdqa      xmm8,xmm14
+  movq        xmm0,[rdx]
+  punpcklbw   xmm8,xmm1               ; xmm8 = p1 low half as words
+  punpckhbw   xmm14,xmm1              ; xmm14 = p1 high half as words
+  punpcklqdq  xmm10,xmm0              ; xmm10 = p0 (both planes)
+  movq        xmm0,[rbx]
+  movdqa      xmm5,xmm10
+  punpcklqdq  xmm13,xmm0              ; xmm13 = q0 (both planes)
+  movq        xmm0, [rax+rbx]
+  punpcklbw   xmm5,xmm1               ; xmm5 = p0 low half as words
+  movsx       eax,r10w                ; alpha
+  movdqa      xmm9,xmm13
+  punpcklqdq  xmm12,xmm0              ; xmm12 = q1 (both planes)
+  punpcklbw   xmm9,xmm1               ; xmm9 = q0 low half as words
+  punpckhbw   xmm10,xmm1              ; xmm10 = p0 high half as words
+  movd        xmm0,eax
+  movsx       eax,word [rsp + 90h + 8h + 28h]   ; iBeta
+  punpckhbw   xmm13,xmm1              ; xmm13 = q0 high half as words
+  movdqa      xmm7,xmm12
+  punpcklwd   xmm0,xmm0
+  punpckhbw   xmm12,xmm1              ; xmm12 = q1 high half as words
+  pshufd      xmm11,xmm0,0            ; xmm11 = alpha in every word
+  punpcklbw   xmm7,xmm1               ; xmm7 = q1 low half as words
+  movd        xmm0,eax
+  movdqa      xmm1,xmm8
+  psubw       xmm1,xmm5               ; p1 - p0
+  punpcklwd   xmm0,xmm0
+  movdqa      xmm6,xmm11
+  pshufd      xmm3,xmm0,0             ; xmm3 = beta in every word
+  movdqa      xmm0,xmm5
+  psubw       xmm0,xmm9               ; p0 - q0
+  movdqa      xmm2,xmm3
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm6,xmm0               ; alpha > |p0 - q0|
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm3
+  pcmpgtw     xmm2,xmm0               ; beta > |p1 - p0|
+  pand        xmm6,xmm2
+  movdqa      xmm0,xmm7
+  movdqa      xmm2,xmm3
+  psubw       xmm0,xmm9               ; q1 - q0
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm1,xmm0               ; beta > |q1 - q0|
+  pand        xmm6,xmm1               ; xmm6 = filter mask, low half
+  movdqa      xmm0,xmm10
+  movdqa      xmm1,xmm14
+  psubw       xmm0,xmm13
+  psubw       xmm1,xmm10
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm11,xmm0
+  pabsw       xmm0,xmm1
+  pcmpgtw     xmm2,xmm0
+  pand        xmm11,xmm2
+  movdqa      xmm0,xmm12
+  movdqa      xmm4,xmm6
+  movdqa      xmm1,xmm8
+  mov         eax,2                   ; rounding constant
+  cwde
+  paddw       xmm1,xmm8               ; 2*p1
+  psubw       xmm0,xmm13
+  paddw       xmm1,xmm5               ; + p0
+  pabsw       xmm0,xmm0
+  movdqa      xmm2,xmm14
+  paddw       xmm1,xmm7               ; + q1
+  pcmpgtw     xmm3,xmm0
+  paddw       xmm2,xmm14
+  movd        xmm0,eax
+  pand        xmm11,xmm3              ; xmm11 = filter mask, high half
+  paddw       xmm7,xmm7               ; 2*q1
+  paddw       xmm2,xmm10
+  punpcklwd   xmm0,xmm0
+  paddw       xmm2,xmm12
+  paddw       xmm12,xmm12
+  pshufd      xmm3,xmm0,0             ; xmm3 = 2 in every word
+  paddw       xmm7,xmm9               ; + q0
+  paddw       xmm12,xmm13
+  movdqa      xmm0,xmm6
+  paddw       xmm1,xmm3
+  pandn       xmm0,xmm5               ; keep original p0 where mask is off
+  paddw       xmm7,xmm8               ; + p1
+  psraw       xmm1,2                  ; new p0, low half
+  paddw       xmm12,xmm14
+  paddw       xmm7,xmm3
+  movaps      xmm14,[rsp]             ; restore caller's xmm14
+  pand        xmm4,xmm1
+  paddw       xmm12,xmm3
+  psraw       xmm7,2                  ; new q0, low half
+  movdqa      xmm1,xmm11
+  por         xmm4,xmm0
+  psraw       xmm12,2                 ; new q0, high half
+  paddw       xmm2,xmm3
+  movdqa      xmm0,xmm11
+  pandn       xmm0,xmm10
+  psraw       xmm2,2                  ; new p0, high half
+  pand        xmm1,xmm2
+  por         xmm1,xmm0
+  packuswb    xmm4,xmm1               ; xmm4 = final p0 bytes (both planes)
+  movdqa      xmm0,xmm11
+  movdqa      xmm1,xmm6
+  pand        xmm1,xmm7
+  movaps      xmm7,[rsp+70h]          ; restore caller's xmm7
+  movq        [rcx],xmm4              ; p0 row, first plane
+  pandn       xmm6,xmm9
+  pandn       xmm11,xmm13
+  pand        xmm0,xmm12
+  por         xmm1,xmm6
+  por         xmm0,xmm11
+  psrldq      xmm4,8
+  packuswb    xmm1,xmm0               ; xmm1 = final q0 bytes (both planes)
+  movq        [r11],xmm1              ; q0 row, first plane
+  psrldq      xmm1,8
+  movq        [rdx],xmm4              ; p0 row, second plane
+  lea         r11,[rsp+90h]           ; r11 = frame top
+  movaps      xmm6,[r11-10h]          ; restore caller's xmm6, xmm8..xmm13
+  movaps      xmm8,[r11-30h]
+  movaps      xmm9,[r11-40h]
+  movq        [rbx],xmm1              ; q0 row, second plane
+  movaps      xmm10,[r11-50h]
+  movaps      xmm11,[r11-60h]
+  movaps      xmm12,[r11-70h]
+  movaps      xmm13,[r11-80h]
+  mov         rsp,r11
+  pop         rbx
+  ret
+
+
+
+
+
+; DeblockChromaEq4H_sse2 (Win64 register convention)
+;   rcx, rdx : base pointers of two pixel planes (presumably Cb and Cr - confirm
+;              against the C prototype)
+;   r8d      : row stride in bytes
+;   r9w      : alpha threshold
+;   [entry rsp+28h] : iBeta (16-bit)
+; For 8 rows of each plane, the 4 bytes starting at (pointer - 2) are gathered
+; and transposed into four 16-byte column vectors (roles p1, p0, q0, q1), then
+;   p0' = (2*p1 + p0 + q1 + 2) >> 2
+;   q0' = (2*q1 + q0 + p1 + 2) >> 2
+; is applied where |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta, and the
+; result is transposed back and stored 4 bytes per row.
+; NOTE: pabsw is an SSSE3 instruction even though the symbol says "sse2".
+; xmm6-xmm15 are callee-saved on Win64 and are all clobbered here; they are
+; saved in the otherwise unused frame area [rsp+90h, rsp+130h) and restored
+; before returning (the original code did neither).
+WELS_EXTERN   DeblockChromaEq4H_sse2
+ALIGN  16
+DeblockChromaEq4H_sse2:
+  mov         rax,rsp
+  mov         [rax+20h],rbx           ; save rbx in the caller-provided home space
+  push        rdi
+  sub         rsp,140h                ; rsp is 16-byte aligned from here on
+  movaps      [rsp+90h],xmm6          ; save callee-saved XMM registers
+  movaps      [rsp+0A0h],xmm7
+  movaps      [rsp+0B0h],xmm8
+  movaps      [rsp+0C0h],xmm9
+  movaps      [rsp+0D0h],xmm10
+  movaps      [rsp+0E0h],xmm11
+  movaps      [rsp+0F0h],xmm12
+  movaps      [rsp+100h],xmm13
+  movaps      [rsp+110h],xmm14
+  movaps      [rsp+120h],xmm15
+  mov         rdi,rdx                 ; rdi = second plane pointer
+  lea         eax,[r8*4]
+  movsxd      r10,eax                 ; r10 = 4*stride (temporarily)
+  mov         eax,[rcx-2]             ; row 0, first plane
+  mov         [rsp+10h],eax
+  lea         rbx,[r10+rdx-2]         ; rbx = second plane, row 4, column -2
+  lea         r11,[r10+rcx-2]         ; r11 = first plane, row 4, column -2
+  movdqa      xmm5,[rsp+10h]          ; only the low dword is meaningful
+  movsxd      r10,r8d                 ; r10 = stride
+  mov         eax,[r10+rcx-2]         ; row 1, first plane
+  lea         rdx,[r10+r10*2]         ; rdx = 3*stride
+  mov         [rsp+20h],eax
+  mov         eax,[rcx+r10*2-2]       ; row 2, first plane
+  mov         [rsp+30h],eax
+  mov         eax,[rdx+rcx-2]         ; row 3, first plane
+  movdqa      xmm2,[rsp+20h]
+  mov         [rsp+40h],eax
+  mov         eax, [rdi-2]            ; row 0, second plane
+  movdqa      xmm4,[rsp+30h]
+  mov         [rsp+50h],eax
+  mov         eax,[r10+rdi-2]         ; row 1, second plane
+  movdqa      xmm3,[rsp+40h]
+  mov         [rsp+60h],eax
+  mov         eax,[rdi+r10*2-2]       ; row 2, second plane
+  punpckldq   xmm5,[rsp+50h]
+  mov         [rsp+70h],eax
+  mov         eax, [rdx+rdi-2]        ; row 3, second plane
+  punpckldq   xmm2, [rsp+60h]
+  mov          [rsp+80h],eax
+  mov         eax,[r11]               ; row 4, first plane
+  punpckldq   xmm4, [rsp+70h]
+  mov         [rsp+50h],eax
+  mov         eax,[rbx]               ; row 4, second plane
+  punpckldq   xmm3,[rsp+80h]
+  mov         [rsp+60h],eax
+  mov         eax,[r10+r11]           ; row 5, first plane
+  movdqa      xmm0, [rsp+50h]
+  punpckldq   xmm0, [rsp+60h]
+  punpcklqdq  xmm5,xmm0               ; xmm5 = rows 0 and 4 of both planes
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax,[r10+rbx]           ; row 5, second plane
+  movdqa      xmm0,[rsp+50h]
+  movdqa      xmm1,xmm5
+  mov         [rsp+60h],eax
+  mov         eax,[r11+r10*2]         ; row 6, first plane
+  punpckldq   xmm0, [rsp+60h]
+  punpcklqdq  xmm2,xmm0               ; xmm2 = rows 1 and 5 of both planes
+  punpcklbw   xmm1,xmm2               ; start of the byte transpose
+  punpckhbw   xmm5,xmm2
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax,[rbx+r10*2]         ; row 6, second plane
+  movdqa      xmm0,[rsp+50h]
+  mov         [rsp+60h],eax
+  mov         eax, [rdx+r11]          ; row 7, first plane
+  movdqa      xmm15,xmm1
+  punpckldq   xmm0,[rsp+60h]
+  punpcklqdq  xmm4,xmm0               ; xmm4 = rows 2 and 6 of both planes
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax, [rdx+rbx]          ; row 7, second plane
+  movdqa      xmm0,[rsp+50h]
+  mov         [rsp+60h],eax
+  punpckldq   xmm0, [rsp+60h]
+  punpcklqdq  xmm3,xmm0               ; xmm3 = rows 3 and 7 of both planes
+  movdqa      xmm0,xmm4
+  punpcklbw   xmm0,xmm3
+  punpckhbw   xmm4,xmm3
+  punpcklwd   xmm15,xmm0
+  punpckhwd   xmm1,xmm0
+  movdqa      xmm0,xmm5
+  movdqa      xmm12,xmm15
+  punpcklwd   xmm0,xmm4
+  punpckhwd   xmm5,xmm4
+  punpckldq   xmm12,xmm0
+  punpckhdq   xmm15,xmm0
+  movdqa      xmm0,xmm1
+  movdqa      xmm11,xmm12
+  punpckldq   xmm0,xmm5
+  punpckhdq   xmm1,xmm5
+  punpcklqdq  xmm11,xmm0              ; xmm11 = column vector used as p1
+  punpckhqdq  xmm12,xmm0              ; xmm12 = column vector used as p0
+  movsx       eax,r9w                 ; alpha
+  movdqa      xmm14,xmm15
+  punpcklqdq  xmm14,xmm1              ; xmm14 = column vector used as q0
+  punpckhqdq  xmm15,xmm1              ; xmm15 = column vector used as q1
+  pxor        xmm1,xmm1
+  movd        xmm0,eax
+  movdqa      xmm4,xmm12
+  movdqa      xmm8,xmm11
+  movsx       eax,word [rsp+170h] ; iBeta
+  punpcklwd   xmm0,xmm0
+  punpcklbw   xmm4,xmm1               ; xmm4 = p0 low half as words
+  punpckhbw   xmm12,xmm1              ; xmm12 = p0 high half as words
+  movdqa      xmm9,xmm14
+  movdqa      xmm7,xmm15
+  movdqa      xmm10,xmm15
+  pshufd      xmm13,xmm0,0            ; xmm13 = alpha in every word
+  punpcklbw   xmm9,xmm1               ; xmm9 = q0 low half as words
+  punpckhbw   xmm14,xmm1              ; xmm14 = q0 high half as words
+  movdqa      xmm6,xmm13
+  movd        xmm0,eax
+  movdqa      [rsp],xmm11             ; keep the untouched p1 bytes for the store
+  mov         eax,2                   ; rounding constant
+  cwde
+  punpckhbw   xmm11,xmm1              ; xmm11 = p1 high half as words
+  punpckhbw   xmm10,xmm1              ; xmm10 = q1 high half as words
+  punpcklbw   xmm7,xmm1               ; xmm7 = q1 low half as words
+  punpcklwd   xmm0,xmm0
+  punpcklbw   xmm8,xmm1               ; xmm8 = p1 low half as words
+  pshufd      xmm3,xmm0,0             ; xmm3 = beta in every word
+  movdqa      xmm1,xmm8
+  movdqa      xmm0,xmm4
+  psubw       xmm0,xmm9               ; p0 - q0
+  psubw       xmm1,xmm4               ; p1 - p0
+  movdqa      xmm2,xmm3
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm6,xmm0               ; alpha > |p0 - q0|
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm3
+  pcmpgtw     xmm2,xmm0               ; beta > |p1 - p0|
+  pand        xmm6,xmm2
+  movdqa      xmm0,xmm7
+  movdqa      xmm2,xmm3
+  psubw       xmm0,xmm9               ; q1 - q0
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm1,xmm0               ; beta > |q1 - q0|
+  pand        xmm6,xmm1               ; xmm6 = filter mask, low half
+  movdqa      xmm0,xmm12
+  movdqa      xmm1,xmm11
+  psubw       xmm0,xmm14
+  psubw       xmm1,xmm12
+  movdqa      xmm5,xmm6
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm13,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm2,xmm0
+  paddw       xmm1,xmm8               ; 2*p1
+  movdqa      xmm0,xmm10
+  pand        xmm13,xmm2
+  psubw       xmm0,xmm14
+  paddw       xmm1,xmm4               ; + p0
+  movdqa      xmm2,xmm11
+  pabsw       xmm0,xmm0
+  paddw       xmm2,xmm11
+  paddw       xmm1,xmm7               ; + q1
+  pcmpgtw     xmm3,xmm0
+  paddw       xmm2,xmm12
+  movd        xmm0,eax
+  pand        xmm13,xmm3              ; xmm13 = filter mask, high half
+  paddw       xmm2,xmm10
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm3,xmm0,0             ; xmm3 = 2 in every word
+  movdqa      xmm0,xmm6
+  paddw       xmm1,xmm3
+  pandn       xmm0,xmm4               ; keep original p0 where mask is off
+  paddw       xmm2,xmm3
+  psraw       xmm1,2                  ; new p0, low half
+  pand        xmm5,xmm1
+  por         xmm5,xmm0
+  paddw       xmm7,xmm7               ; 2*q1 (low half)
+  paddw       xmm10,xmm10             ; 2*q1 (high half)
+  psraw       xmm2,2                  ; new p0, high half
+  movdqa      xmm1,xmm13
+  movdqa      xmm0,xmm13
+  pandn       xmm0,xmm12
+  pand        xmm1,xmm2
+  paddw       xmm7,xmm9               ; + q0
+  por         xmm1,xmm0
+  paddw       xmm10,xmm14
+  paddw       xmm7,xmm8               ; + p1
+  movdqa      xmm0,xmm13
+  packuswb    xmm5,xmm1               ; xmm5 = final p0 bytes
+  paddw       xmm7,xmm3
+  paddw       xmm10,xmm11
+  movdqa      xmm1,xmm6
+  paddw       xmm10,xmm3
+  pandn       xmm6,xmm9
+  psraw       xmm7,2                  ; new q0, low half
+  pand        xmm1,xmm7
+  psraw       xmm10,2                 ; new q0, high half
+  pandn       xmm13,xmm14
+  pand        xmm0,xmm10
+  por         xmm1,xmm6
+  movdqa      xmm6,[rsp]              ; original p1 bytes
+  movdqa      xmm4,xmm6
+  por         xmm0,xmm13
+  punpcklbw   xmm4,xmm5               ; begin transposing back: p1/p0 pairs
+  punpckhbw   xmm6,xmm5
+  movdqa      xmm3,xmm4
+  packuswb    xmm1,xmm0               ; xmm1 = final q0 bytes
+  movdqa      xmm0,xmm1
+  punpckhbw   xmm1,xmm15              ; q0/q1 pairs
+  punpcklbw   xmm0,xmm15
+  punpcklwd   xmm3,xmm0
+  punpckhwd   xmm4,xmm0
+  movdqa      xmm0,xmm6
+  movdqa      xmm2,xmm3
+  punpcklwd   xmm0,xmm1
+  punpckhwd   xmm6,xmm1
+  movdqa      xmm1,xmm4
+  punpckldq   xmm2,xmm0
+  punpckhdq   xmm3,xmm0
+  punpckldq   xmm1,xmm6
+  movdqa      xmm0,xmm2
+  punpcklqdq  xmm0,xmm1
+  punpckhdq   xmm4,xmm6
+  punpckhqdq  xmm2,xmm1
+  movdqa      [rsp+10h],xmm0          ; staged rows; copied out dword by dword below
+  movdqa      [rsp+60h],xmm2
+  movdqa      xmm0,xmm3
+  mov         eax,[rsp+10h]
+  mov         [rcx-2],eax             ; first plane, row 0
+  mov         eax,[rsp+60h]
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm3,xmm4
+  mov         [r10+rcx-2],eax         ; first plane, row 1
+  movdqa      [rsp+20h],xmm0
+  mov         eax, [rsp+20h]
+  movdqa      [rsp+70h],xmm3
+  mov         [rcx+r10*2-2],eax       ; first plane, row 2
+  mov         eax,[rsp+70h]
+  mov         [rdx+rcx-2],eax         ; first plane, row 3
+  mov         eax,[rsp+18h]
+  mov         [r11],eax               ; first plane, row 4
+  mov         eax,[rsp+68h]
+  mov         [r10+r11],eax           ; first plane, row 5
+  mov         eax,[rsp+28h]
+  mov         [r11+r10*2],eax         ; first plane, row 6
+  mov         eax,[rsp+78h]
+  mov         [rdx+r11],eax           ; first plane, row 7
+  mov         eax,[rsp+14h]
+  mov         [rdi-2],eax             ; second plane, row 0
+  mov         eax,[rsp+64h]
+  mov         [r10+rdi-2],eax         ; second plane, row 1
+  mov         eax,[rsp+24h]
+  mov         [rdi+r10*2-2],eax       ; second plane, row 2
+  mov         eax, [rsp+74h]
+  mov         [rdx+rdi-2],eax         ; second plane, row 3
+  mov         eax, [rsp+1Ch]
+  mov         [rbx],eax               ; second plane, row 4
+  mov         eax, [rsp+6Ch]
+  mov         [r10+rbx],eax           ; second plane, row 5
+  mov         eax,[rsp+2Ch]
+  mov         [rbx+r10*2],eax         ; second plane, row 6
+  mov         eax,[rsp+7Ch]
+  mov         [rdx+rbx],eax           ; second plane, row 7
+  movaps      xmm6,[rsp+90h]          ; restore callee-saved XMM registers
+  movaps      xmm7,[rsp+0A0h]
+  movaps      xmm8,[rsp+0B0h]
+  movaps      xmm9,[rsp+0C0h]
+  movaps      xmm10,[rsp+0D0h]
+  movaps      xmm11,[rsp+0E0h]
+  movaps      xmm12,[rsp+0F0h]
+  movaps      xmm13,[rsp+100h]
+  movaps      xmm14,[rsp+110h]
+  movaps      xmm15,[rsp+120h]
+  lea         r11,[rsp+140h]
+  mov         rbx, [r11+28h]          ; reload rbx from the home space
+  mov         rsp,r11
+  pop         rdi
+  ret
+
+
+
+; DeblockChromaLt4H_sse2 (Win64 register convention)
+;   rcx, rdx : base pointers of two pixel planes (presumably Cb and Cr - confirm
+;              against the C prototype)
+;   r8d      : row stride in bytes
+;   r9w      : alpha threshold
+;   [entry rsp+28h] : iBeta (16-bit), [entry rsp+30h] : pTC (4 signed bytes)
+; For 8 rows of each plane, the 4 bytes starting at (pointer - 2) are gathered
+; and transposed into four 16-byte column vectors (roles p1, p0, q0, q1), then
+;   delta = clamp(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc)
+;   p0 += delta ; q0 -= delta
+; is applied where |p0-q0| < alpha, |p1-p0| < beta, |q1-q0| < beta and tc > 0,
+; and the result is transposed back and stored 4 bytes per row.
+; NOTE: pabsw is an SSSE3 instruction even though the symbol says "sse2".
+; xmm6-xmm15 are callee-saved on Win64 and are all clobbered here; they are
+; saved in the otherwise unused frame area [rsp+0C0h, rsp+160h) and restored
+; before returning (the original code did neither).
+WELS_EXTERN DeblockChromaLt4H_sse2
+ALIGN  16
+DeblockChromaLt4H_sse2:
+  mov         rax,rsp
+  push        rbx
+  push        rbp
+  push        rsi
+  push        rdi
+  push        r12
+  sub         rsp,170h                ; rsp is 16-byte aligned from here on
+  movaps      [rsp+0C0h],xmm6         ; save callee-saved XMM registers
+  movaps      [rsp+0D0h],xmm7
+  movaps      [rsp+0E0h],xmm8
+  movaps      [rsp+0F0h],xmm9
+  movaps      [rsp+100h],xmm10
+  movaps      [rsp+110h],xmm11
+  movaps      [rsp+120h],xmm12
+  movaps      [rsp+130h],xmm13
+  movaps      [rsp+140h],xmm14
+  movaps      [rsp+150h],xmm15
+
+  movsxd      rsi,r8d                 ; rsi = stride
+  lea         eax,[r8*4]
+  mov         r11d,r9d                ; r11w = alpha
+  movsxd      r10,eax                 ; r10 = 4*stride (temporarily)
+  mov         eax, [rcx-2]            ; row 0, first plane
+  mov         r12,rdx                 ; r12 = second plane pointer
+  mov         [rsp+40h],eax
+  mov         eax, [rsi+rcx-2]        ; row 1, first plane
+  lea         rbx,[r10+rcx-2]         ; rbx = first plane, row 4, column -2
+  movdqa      xmm5,[rsp+40h]          ; only the low dword is meaningful
+  mov         [rsp+50h],eax
+  mov         eax, [rcx+rsi*2-2]      ; row 2, first plane
+  lea         rbp,[r10+rdx-2]         ; rbp = second plane, row 4, column -2
+  movdqa      xmm2, [rsp+50h]
+  mov         [rsp+60h],eax
+  lea         r10,[rsi+rsi*2]         ; r10 = 3*stride
+  mov         rdi,rcx                 ; rdi = first plane pointer
+  mov         eax,[r10+rcx-2]         ; row 3, first plane
+  movdqa      xmm4,[rsp+60h]
+  mov         [rsp+70h],eax
+  mov         eax,[rdx-2]             ; row 0, second plane
+  mov         [rsp+80h],eax
+  mov         eax, [rsi+rdx-2]        ; row 1, second plane
+  movdqa      xmm3,[rsp+70h]
+  mov         [rsp+90h],eax
+  mov         eax,[rdx+rsi*2-2]       ; row 2, second plane
+  punpckldq   xmm5,[rsp+80h]
+  mov         [rsp+0A0h],eax
+  mov         eax, [r10+rdx-2]        ; row 3, second plane
+  punpckldq   xmm2,[rsp+90h]
+  mov         [rsp+0B0h],eax
+  mov         eax, [rbx]              ; row 4, first plane
+  punpckldq   xmm4,[rsp+0A0h]
+  mov         [rsp+80h],eax
+  mov         eax,[rbp]               ; row 4, second plane
+  punpckldq   xmm3,[rsp+0B0h]
+  mov         [rsp+90h],eax
+  mov         eax,[rsi+rbx]           ; row 5, first plane
+  movdqa      xmm0,[rsp+80h]
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm5,xmm0               ; xmm5 = rows 0 and 4 of both planes
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax,[rsi+rbp]           ; row 5, second plane
+  movdqa      xmm0,[rsp+80h]
+  movdqa      xmm1,xmm5
+  mov         [rsp+90h],eax
+  mov         eax,[rbx+rsi*2]         ; row 6, first plane
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm2,xmm0               ; xmm2 = rows 1 and 5 of both planes
+  punpcklbw   xmm1,xmm2               ; start of the byte transpose
+  punpckhbw   xmm5,xmm2
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax,[rbp+rsi*2]         ; row 6, second plane
+  movdqa      xmm0, [rsp+80h]
+  mov         [rsp+90h],eax
+  mov         eax,[r10+rbx]           ; row 7, first plane
+  movdqa      xmm7,xmm1
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm4,xmm0               ; xmm4 = rows 2 and 6 of both planes
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax, [r10+rbp]          ; row 7, second plane
+  movdqa      xmm0,[rsp+80h]
+  mov         [rsp+90h],eax
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm3,xmm0               ; xmm3 = rows 3 and 7 of both planes
+  movdqa      xmm0,xmm4
+  punpcklbw   xmm0,xmm3
+  punpckhbw   xmm4,xmm3
+  punpcklwd   xmm7,xmm0
+  punpckhwd   xmm1,xmm0
+  movdqa      xmm0,xmm5
+  movdqa      xmm6,xmm7
+  punpcklwd   xmm0,xmm4
+  punpckhwd   xmm5,xmm4
+  punpckldq   xmm6,xmm0
+  punpckhdq   xmm7,xmm0
+  movdqa      xmm0,xmm1
+  punpckldq   xmm0,xmm5
+  mov         rax, [rsp+1C8h]    ; pTC
+  punpckhdq   xmm1,xmm5
+  movdqa      xmm9,xmm6
+  punpckhqdq  xmm6,xmm0               ; xmm6 = column vector used as p0
+  punpcklqdq  xmm9,xmm0               ; xmm9 = column vector used as p1
+  movdqa      xmm2,xmm7
+  movdqa      xmm13,xmm6
+  movdqa      xmm4,xmm9
+  movdqa      [rsp+10h],xmm9          ; keep the untouched p1 bytes for the store
+  punpcklqdq  xmm2,xmm1               ; xmm2 = column vector used as q0
+  punpckhqdq  xmm7,xmm1               ; xmm7 = column vector used as q1
+  pxor        xmm1,xmm1
+  movsx       ecx,byte [rax+3]        ; tc[3]
+  movsx       edx,byte [rax+2]        ; tc[2]
+  movsx       r8d,byte [rax+1]        ; tc[1]
+  movsx       r9d,byte [rax]          ; tc[0]
+  movdqa      xmm10,xmm1
+  movdqa      xmm15,xmm2
+  punpckhbw   xmm2,xmm1               ; xmm2 = q0 high half as words
+  punpckhbw   xmm6,xmm1               ; xmm6 = p0 high half as words
+  punpcklbw   xmm4,xmm1               ; xmm4 = p1 low half as words
+  movsx       eax,r11w                ; alpha
+  mov         word [rsp+0Eh],cx       ; build 8 words at [rsp]:
+  mov         word [rsp+0Ch],cx       ;   tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3
+  movdqa      xmm3,xmm7
+  movdqa      xmm8,xmm7
+  movdqa      [rsp+20h],xmm7          ; keep the untouched q1 bytes for the store
+  punpcklbw   xmm15,xmm1              ; xmm15 = q0 low half as words
+  punpcklbw   xmm13,xmm1              ; xmm13 = p0 low half as words
+  punpcklbw   xmm3,xmm1               ; xmm3 = q1 low half as words
+  mov         word [rsp+0Ah],dx
+  mov         word [rsp+8],dx
+  mov         word [rsp+6],r8w
+  movd        xmm0,eax
+  movdqa      [rsp+30h],xmm6          ; spill p0 high half
+  punpckhbw   xmm9,xmm1               ; xmm9 = p1 high half as words
+  punpckhbw   xmm8,xmm1               ; xmm8 = q1 high half as words
+  punpcklwd   xmm0,xmm0
+  movsx       eax,word [rsp+1C0h]   ; iBeta
+  mov         word [rsp+4],r8w
+  mov         word [rsp+2],r9w
+  pshufd      xmm12,xmm0,0            ; xmm12 = alpha in every word
+  mov         word [rsp],r9w
+  movd        xmm0,eax
+  mov         eax,4                   ; rounding constant
+  cwde
+  movdqa      xmm14, [rsp]            ; xmm14 = tc vector
+  movdqa      [rsp],xmm2              ; reuse the slot: spill q0 high half
+  movdqa      xmm2,xmm12
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm11,xmm0,0            ; xmm11 = beta in every word
+  psubw       xmm10,xmm14             ; xmm10 = -tc
+  movd        xmm0,eax
+  movdqa      xmm7,xmm14
+  movdqa      xmm6,xmm14
+  pcmpgtw     xmm7,xmm1               ; xmm7 = mask(tc > 0)
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm5,xmm0,0             ; xmm5 = 4 in every word
+  movdqa      xmm0,xmm4
+  movdqa      xmm1,xmm15
+  psubw       xmm4,xmm13              ; p1 - p0
+  psubw       xmm0,xmm3               ; p1 - q1
+  psubw       xmm1,xmm13              ; q0 - p0
+  psubw       xmm3,xmm15              ; q1 - q0
+  psllw       xmm1,2                  ; (q0 - p0) * 4
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm10
+  psraw       xmm1,3                  ; unclamped delta (low half)
+  pmaxsw      xmm0,xmm1
+  pminsw      xmm6,xmm0               ; xmm6 = clamped delta
+  movdqa      xmm1,xmm11
+  movdqa      xmm0,xmm13
+  psubw       xmm0,xmm15
+  pabsw       xmm0,xmm0               ; |p0 - q0|
+  pcmpgtw     xmm2,xmm0               ; alpha > |p0 - q0|
+  pabsw       xmm0,xmm4               ; |p1 - p0|
+  pcmpgtw     xmm1,xmm0               ; beta > |p1 - p0|
+  pabsw       xmm0,xmm3               ; |q1 - q0|
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm11
+  movdqa      xmm3,[rsp+30h]          ; p0 high half
+  pcmpgtw     xmm1,xmm0               ; beta > |q1 - q0|
+  movdqa      xmm0,xmm9
+  pand        xmm2,xmm1
+  psubw       xmm0,xmm8               ; high half: p1 - q1
+  psubw       xmm9,xmm3               ; high half: p1 - p0
+  pand        xmm2,xmm7               ; ... and tc > 0
+  pand        xmm6,xmm2               ; zero delta where the filter is off
+  psubw       xmm15,xmm6              ; q0 -= delta (low half)
+  paddw       xmm13,xmm6              ; p0 += delta (low half)
+  movdqa      xmm2,[rsp]              ; q0 high half
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3               ; q0 - p0
+  psubw       xmm8,xmm2               ; q1 - q0
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm3
+  movdqa      xmm5,[rsp+10h]          ; original p1 bytes
+  psubw       xmm0,xmm2
+  psraw       xmm1,3                  ; unclamped delta (high half)
+  movdqa      xmm4,xmm5
+  pabsw       xmm0,xmm0               ; |p0 - q0|
+  pmaxsw      xmm10,xmm1
+  movdqa      xmm1,xmm11
+  pcmpgtw     xmm12,xmm0              ; alpha > |p0 - q0|
+  pabsw       xmm0,xmm9               ; |p1 - p0|
+  pminsw      xmm14,xmm10             ; xmm14 = clamped delta (high half)
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm8               ; |q1 - q0|
+  pcmpgtw     xmm11,xmm0
+  pand        xmm12,xmm1
+  movdqa      xmm1,[rsp+20h]          ; original q1 bytes
+  pand        xmm12,xmm11
+  pand        xmm12,xmm7
+  pand        xmm14,xmm12
+  paddw       xmm3,xmm14              ; p0 += delta (high half)
+  psubw       xmm2,xmm14              ; q0 -= delta (high half)
+  packuswb    xmm13,xmm3              ; xmm13 = final p0 bytes
+  packuswb    xmm15,xmm2              ; xmm15 = final q0 bytes
+  punpcklbw   xmm4,xmm13              ; begin transposing back: p1/p0 pairs
+  punpckhbw   xmm5,xmm13
+  movdqa      xmm0,xmm15
+  punpcklbw   xmm0,xmm1               ; q0/q1 pairs
+  punpckhbw   xmm15,xmm1
+  movdqa      xmm3,xmm4
+  punpcklwd   xmm3,xmm0
+  punpckhwd   xmm4,xmm0
+  movdqa      xmm0,xmm5
+  movdqa      xmm2,xmm3
+  movdqa      xmm1,xmm4
+  punpcklwd   xmm0,xmm15
+  punpckhwd   xmm5,xmm15
+  punpckldq   xmm2,xmm0
+  punpckhdq   xmm3,xmm0
+  punpckldq   xmm1,xmm5
+  movdqa      xmm0,xmm2
+  punpcklqdq  xmm0,xmm1
+  punpckhdq   xmm4,xmm5
+  punpckhqdq  xmm2,xmm1
+  movdqa      [rsp+40h],xmm0          ; staged rows; copied out dword by dword below
+  movdqa      xmm0,xmm3
+  movdqa      [rsp+90h],xmm2
+  mov         eax,[rsp+40h]
+  mov         [rdi-2],eax             ; first plane, row 0
+  mov         eax, [rsp+90h]
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm3,xmm4
+  mov         [rsi+rdi-2],eax         ; first plane, row 1
+  movdqa      [rsp+50h],xmm0
+  mov         eax,[rsp+50h]
+  movdqa      [rsp+0A0h],xmm3
+  mov         [rdi+rsi*2-2],eax       ; first plane, row 2
+  mov         eax,[rsp+0A0h]
+  mov         [r10+rdi-2],eax         ; first plane, row 3
+  mov         eax,[rsp+48h]
+  mov         [rbx],eax               ; first plane, row 4
+  mov         eax,[rsp+98h]
+  mov         [rsi+rbx],eax           ; first plane, row 5
+  mov         eax,[rsp+58h]
+  mov         [rbx+rsi*2],eax         ; first plane, row 6
+  mov         eax, [rsp+0A8h]
+  mov         [r10+rbx],eax           ; first plane, row 7
+  mov         eax, [rsp+44h]
+  mov         [r12-2],eax             ; second plane, row 0
+  mov         eax,[rsp+94h]
+  mov         [rsi+r12-2],eax         ; second plane, row 1
+  mov         eax,[rsp+54h]
+  mov         [r12+rsi*2-2],eax       ; second plane, row 2
+  mov         eax, [rsp+0A4h]
+  mov         [r10+r12-2],eax         ; second plane, row 3
+  mov         eax,[rsp+4Ch]
+  mov         [rbp],eax               ; second plane, row 4
+  mov         eax,[rsp+9Ch]
+  mov         [rsi+rbp],eax           ; second plane, row 5
+  mov         eax, [rsp+5Ch]
+  mov         [rbp+rsi*2],eax         ; second plane, row 6
+  mov         eax,[rsp+0ACh]
+  mov         [r10+rbp],eax           ; second plane, row 7
+  movaps      xmm6,[rsp+0C0h]         ; restore callee-saved XMM registers
+  movaps      xmm7,[rsp+0D0h]
+  movaps      xmm8,[rsp+0E0h]
+  movaps      xmm9,[rsp+0F0h]
+  movaps      xmm10,[rsp+100h]
+  movaps      xmm11,[rsp+110h]
+  movaps      xmm12,[rsp+120h]
+  movaps      xmm13,[rsp+130h]
+  movaps      xmm14,[rsp+140h]
+  movaps      xmm15,[rsp+150h]
+  lea         r11,[rsp+170h]
+  mov         rsp,r11
+  pop         r12
+  pop         rdi
+  pop         rsi
+  pop         rbp
+  pop         rbx
+  ret
+
+
+
+%elifdef  UNIX64
+
+
+WELS_EXTERN   DeblockLumaLt4V_sse2
+
+DeblockLumaLt4V_sse2:
+  push        rbp      
+  mov         r11,r8  ; pTC                                                    
+  sub         rsp,1B0h                                                       
+  lea         rbp,[rsp+20h]                                                  
+  movd        xmm4,edx                                                                                                  
+  movd        xmm2,ecx                                                       
+  mov         qword [rbp+180h],r12                                       
+  mov         r10,rdi                                                        
+  movsxd      r12,esi                                                        
+  add         rsi,rsi
+  movsxd      rdx,esi 
+  sub         r10,r12                                                        
+  movsx       r8d,byte [r11]                                             
+  pxor        xmm3,xmm3                                                      
+  punpcklwd   xmm2,xmm2                                                      
+  movaps      [rbp+50h],xmm14                                    
+  lea         rax,[r12+r12*2]                                                
+  movdqa      xmm14,[rdx+rdi]                                    
+  neg         rax                                                            
+  pshufd      xmm0,xmm2,0                                                    
+  movd        xmm2,r8d                                                       
+  movsx       rsi,byte [r11+1]                                           
+  movsx       r8d,byte [r11+2]                                           
+  movsx       r11d,byte [r11+3]                                          
+  movaps      [rbp+70h],xmm12                                    
+  movd        xmm1,esi                                                      
+  movaps      [rbp+80h],xmm11                                    
+  movd        xmm12,r8d                                                      
+  movd        xmm11,r11d                                                     
+  movdqa      xmm5, [rax+rdi]                                     
+  lea         rax,[r12+r12]                                                  
+  punpcklwd   xmm12,xmm12                                                    
+  neg         rax                                                            
+  punpcklwd   xmm11,xmm11                                                    
+  movaps      [rbp],xmm8                                         
+  movdqa      xmm8, [r10]                                         
+  punpcklwd   xmm2,xmm2                                                      
+  punpcklwd   xmm1,xmm1                                                      
+  punpcklqdq  xmm12,xmm12                                                    
+  punpcklqdq  xmm11,xmm11                                                    
+  punpcklqdq  xmm2,xmm2                                                      
+  punpcklqdq  xmm1,xmm1                                                      
+  shufps      xmm12,xmm11,88h                                                
+  movdqa      xmm11,xmm8                                                     
+  movaps      [rbp+30h],xmm9                                     
+  movdqa      xmm9,[rdi]                                         
+  shufps      xmm2,xmm1,88h                                                  
+  movdqa      xmm1,xmm5                                                      
+  punpcklbw   xmm11,xmm3                                                     
+  movaps      [rbp+20h],xmm6                                     
+  movaps      [rbp+60h],xmm13                                    
+  movdqa      xmm13,xmm11                                                    
+  movaps      [rbp+90h],xmm10                                    
+  movdqa      xmm10,xmm9                                                     
+  movdqa      xmm6,[rax+rdi]                                     
+  punpcklbw   xmm1,xmm3                                                      
+  movaps      [rbp+0A0h],xmm12                                   
+  psubw       xmm13,xmm1                                                     
+  movaps      [rbp+40h],xmm15                                    
+  movdqa      xmm15,xmm14                                                    
+  movaps      [rbp+10h],xmm7                                     
+  movdqa      xmm7,xmm6                                                      
+  punpcklbw   xmm10,xmm3                                                     
+  movdqa      xmm12,[r12+rdi]                                    
+  punpcklbw   xmm7,xmm3                                                      
+  punpcklbw   xmm12,xmm3                                                     
+  punpcklbw   xmm15,xmm3                                                     
+  pabsw       xmm3,xmm13                                                     
+  movdqa      xmm13,xmm10                                                    
+  psubw       xmm13,xmm15                                                    
+  movdqa      [rbp+0F0h],xmm15                                   
+  pabsw       xmm15,xmm13                                                    
+  movdqa      xmm13,xmm11                                                    
+  movdqa      [rbp+0B0h],xmm1                                    
+  movdqa      xmm1,xmm0                                                      
+  pavgw       xmm13,xmm10                                                    
+  pcmpgtw     xmm1,xmm3                                                      
+  movdqa      [rbp+120h],xmm13                                   
+  movaps      xmm13,xmm2                                                     
+  punpcklwd   xmm4,xmm4                                                      
+  movdqa      xmm3,xmm0                                                      
+  movdqa      [rbp+100h],xmm1                                    
+  psubw       xmm13,xmm1                                                     
+  movdqa      xmm1,xmm10                                                     
+  pcmpgtw     xmm3,xmm15                                                     
+  pshufd      xmm4,xmm4,0                                                    
+  psubw       xmm1,xmm11                                                     
+  movdqa      [rbp+0D0h],xmm10                                   
+  psubw       xmm13,xmm3                                                     
+  movdqa      [rbp+110h],xmm3                                    
+  pabsw       xmm15,xmm1                                                     
+  movdqa      xmm3,xmm4                                                      
+  psubw       xmm10,xmm12                                                    
+  pcmpgtw     xmm3,xmm15                                                     
+  pabsw       xmm15,xmm10                                                    
+  movdqa      xmm10,xmm0                                                     
+  psllw       xmm1,2                                                         
+  movdqa      [rbp+0C0h],xmm11                                   
+  psubw       xmm11,xmm7                                                     
+  pcmpgtw     xmm10,xmm15                                                    
+  pabsw       xmm11,xmm11                                                    
+  movdqa      xmm15,xmm0                                                     
+  pand        xmm3,xmm10                                                     
+  pcmpgtw     xmm15,xmm11                                                    
+  movaps      xmm11,xmm2                                                     
+  pxor        xmm10,xmm10                                                    
+  pand        xmm3,xmm15                                                     
+  pcmpgtw     xmm11,xmm10                                                    
+  pcmpeqw     xmm10,xmm2                                                     
+  por         xmm11,xmm10                                                    
+  pand        xmm3,xmm11                                                     
+  movdqa      xmm11,xmm7                                                     
+  psubw       xmm11,xmm12                                                    
+  pxor        xmm15,xmm15                                                    
+  paddw       xmm11,xmm1                                                     
+  psubw       xmm15,xmm13                                                    
+  movdqa      [rbp+0E0h],xmm12                                   
+  paddw       xmm11,[FOUR_16B_SSE2] 
+  pxor        xmm12,xmm12                                                    
+  psraw       xmm11,3                                                        
+  punpckhbw   xmm8,xmm12                                                     
+  pmaxsw      xmm15,xmm11                                                    
+  punpckhbw   xmm5,xmm12                                                     
+  movdqa      xmm11,xmm8                                                     
+  pminsw      xmm13,xmm15                                                    
+  psubw       xmm11,xmm5                                                     
+  punpckhbw   xmm9,xmm12                                                     
+  pand        xmm13,xmm3                                                     
+  movdqa      [rbp+130h],xmm13                                   
+  pabsw       xmm13,xmm11                                                    
+  punpckhbw   xmm14,xmm12                                                    
+  movdqa      xmm11,xmm9                                                     
+  psubw       xmm11,xmm14                                                    
+  movdqa      xmm15,xmm0                                                     
+  movdqa      [rbp+140h],xmm14                                   
+  pabsw       xmm14,xmm11                                                    
+  movdqa      xmm11,xmm8                                                     
+  pcmpgtw     xmm15,xmm14                                                    
+  movdqa      xmm1,[r12+rdi]                                     
+  pavgw       xmm11,xmm9                                                     
+  movdqa      [rbp+170h],xmm11                                   
+  movdqa      xmm10,xmm9                                                     
+  punpckhbw   xmm6,xmm12                                                     
+  psubw       xmm10,xmm8                                                     
+  punpckhbw   xmm1,xmm12                                                     
+  movdqa      xmm12,xmm0                                                     
+  movaps      xmm11,[rbp+0A0h]                                   
+  pcmpgtw     xmm12,xmm13                                                    
+  movaps      xmm13,xmm11                                                    
+  psubw       xmm13,xmm12                                                    
+  movdqa      [rbp+160h],xmm15                                   
+  psubw       xmm13,xmm15                                                    
+  movdqa      xmm15,xmm9                                                     
+  psubw       xmm15,xmm1                                                     
+  movdqa      [rbp+150h],xmm12                                   
+  pabsw       xmm12,xmm10                                                    
+  pabsw       xmm14,xmm15                                                    
+  movdqa      xmm15,xmm8                                                     
+  pcmpgtw     xmm4,xmm12                                                     
+  movdqa      xmm12,xmm0                                                     
+  psubw       xmm15,xmm6                                                     
+  pcmpgtw     xmm12,xmm14                                                    
+  pabsw       xmm14,xmm15                                                    
+  psllw       xmm10,2                                                        
+  pcmpgtw     xmm0,xmm14                                                     
+  movdqa      xmm14,xmm6                                                     
+  psubw       xmm14,xmm1                                                     
+  pand        xmm4,xmm12                                                     
+  paddw       xmm14,xmm10                                                    
+  pand        xmm4,xmm0                                                      
+  paddw       xmm14,[FOUR_16B_SSE2] 
+  pxor        xmm15,xmm15                                                    
+  movaps      xmm12,xmm11                                                    
+  psubw       xmm15,xmm13                                                    
+  pxor        xmm0,xmm0                                                      
+  psraw       xmm14,3                                                        
+  pcmpgtw     xmm12,xmm0                                                     
+  pcmpeqw     xmm0,xmm11                                                     
+  pmaxsw      xmm15,xmm14                                                    
+  por         xmm12,xmm0                                                     
+  movdqa      xmm0,[rbp+120h]                                    
+  pminsw      xmm13,xmm15                                                    
+  movdqa      xmm15,[rbp+0B0h]                                   
+  movdqa      xmm10,xmm7                                                     
+  pand        xmm4,xmm12                                                     
+  paddw       xmm15,xmm0                                                     
+  pxor        xmm12,xmm12                                                    
+  paddw       xmm10,xmm7                                                     
+  movdqa      xmm14,xmm12                                                    
+  psubw       xmm15,xmm10                                                    
+  psubw       xmm14,xmm2                                                     
+  psraw       xmm15,1                                                        
+  pmaxsw      xmm15,xmm14                                                    
+  movdqa      xmm10,xmm6                                                     
+  pminsw      xmm15,xmm2                                                     
+  paddw       xmm10,xmm6                                                     
+  pand        xmm15,xmm3                                                     
+  psubw       xmm12,xmm11                                                    
+  pand        xmm15,[rbp+100h]                                   
+  pand        xmm13,xmm4                                                     
+  paddw       xmm7,xmm15                                                     
+  paddw       xmm8,xmm13                                                     
+  movdqa      xmm15,[rbp+170h]                                   
+  psubw       xmm9,xmm13                                                     
+  paddw       xmm5,xmm15                                                     
+  psubw       xmm5,xmm10                                                     
+  psraw       xmm5,1                                                         
+  pmaxsw      xmm5,xmm12                                                     
+  pminsw      xmm5,xmm11                                                     
+  pand        xmm5,xmm4                                                      
+  pand        xmm5,[rbp+150h]                                    
+  paddw       xmm6,xmm5                                                      
+  movdqa      xmm5,[rbp+0C0h]                                    
+  packuswb    xmm7,xmm6                                                      
+  movdqa      xmm6,[rbp+130h]                                    
+  paddw       xmm5,xmm6                                                      
+  packuswb    xmm5,xmm8                                                      
+  movdqa      xmm8,[rbp+0D0h]                                    
+  psubw       xmm8,xmm6                                                      
+  movdqa      xmm6,[rbp+0F0h]                                    
+  paddw       xmm6,xmm0                                                      
+  movdqa      xmm0,[rbp+0E0h]                                    
+  packuswb    xmm8,xmm9                                                      
+  movdqa      xmm9,xmm0                                                      
+  paddw       xmm9,xmm0                                                      
+  psubw       xmm6,xmm9                                                      
+  psraw       xmm6,1                                                         
+  pmaxsw      xmm14,xmm6                                                     
+  pminsw      xmm2,xmm14                                                     
+  pand        xmm2,xmm3                                                      
+  pand        xmm2,[rbp+110h]                                    
+  paddw       xmm0,xmm2                                                      
+  movdqa      xmm2,[rbp+140h]                                    
+  paddw       xmm2,xmm15                                                     
+  movdqa      xmm15,xmm1                                                     
+  paddw       xmm15,xmm1                                                     
+  psubw       xmm2,xmm15                                                     
+  psraw       xmm2,1                                                         
+  pmaxsw      xmm12,xmm2                                                     
+  pminsw      xmm11,xmm12                                                    
+  pand        xmm11,xmm4                                                     
+  pand        xmm11,[rbp+160h]                                   
+  paddw       xmm1,xmm11                                                     
+  movdqa      [rax+rdi],xmm7                                     
+  movdqa      [r10],xmm5                                         
+  packuswb    xmm0,xmm1                                                      
+  movdqa      [rdi],xmm8                                         
+  movdqa      [r12+rdi],xmm0                                                                        
+  mov         r12,qword [rbp+180h]                                       
+  lea         rsp,[rbp+190h]                                                 
+  pop         rbp                                                            
+  ret 
+
+
+WELS_EXTERN DeblockLumaEq4V_sse2
+
+ALIGN  16
+DeblockLumaEq4V_sse2:                ; reads eight 16-byte rows around a pixel pointer and rewrites six of them; rows are named rN = [base + N*stride] below
+  mov         rax,rsp                ; rax = rsp at entry, base address for the xmm spill slots below
+  push        rbx                    ; rbx and rbp are used as scratch and popped in the epilogue
+  push        rbp   
+  mov         r8,   rdx              ; r8  = 3rd argument, a 16-bit threshold ("arg3"; presumably iAlpha - confirm against the C prototype)
+  mov         r9,   rcx              ; r9  = 4th argument, a 16-bit threshold ("arg4"; presumably iBeta - confirm)
+  mov         rcx,  rdi              ; rcx = 1st argument, the pixel base pointer
+  mov         rdx,  rsi              ; rdx = 2nd argument, the row stride
+  sub         rsp,1D8h               ; reserve locals; rsp is now (entry rsp - 1E8h)
+  movaps      [rax-38h],xmm6         ; xmm6..xmm15 are spilled to [entry rsp - 38h] .. [entry rsp - 0C8h]
+  movaps      [rax-48h],xmm7 
+  movaps      [rax-58h],xmm8 
+  pxor        xmm1,xmm1              ; zero register used to widen bytes to words
+  movsxd      r10,edx                ; r10 = stride, sign-extended from 32 bits
+  mov         rbp,rcx                ; rbp = base pointer
+  mov         r11d,r8d               ; r11d = arg3
+  mov         rdx,rcx 
+  mov         rdi,rbp 
+  mov         rbx,rbp 
+  movdqa      xmm5,[rbp]             ; xmm5 = r0 (movdqa: the address must be 16-byte aligned)
+  movaps      [rax-68h],xmm9 
+  movaps      [rax-78h],xmm10 
+  punpcklbw   xmm5,xmm1              ; low 8 bytes of r0 widened to words
+  movaps      [rax-88h],xmm11 
+  movaps      [rax-98h],xmm12 
+  movaps      [rax-0A8h],xmm13 
+  movaps      [rax-0B8h],xmm14 
+  movdqa      xmm14,[r10+rbp]        ; xmm14 = r+1
+  movaps      [rax-0C8h],xmm15 
+  lea         eax,[r10*4]            ; eax = 4*stride (32-bit)
+  movsxd      r8,eax 
+  lea         eax,[r10+r10*2]        ; eax = 3*stride
+  movsxd      rcx,eax                ; rcx = 3*stride
+  lea         eax,[r10+r10]          ; eax = 2*stride
+  sub         rdx,r8                 ; rdx = address of r-4
+  punpcklbw   xmm14,xmm1             ; low 8 bytes of r+1 widened to words
+  movdqa      [rsp+90h],xmm5         ; [rsp+90h] = r0, low half as words
+  movdqa      [rsp+30h],xmm14        ; [rsp+30h] = r+1, low half as words
+  movsxd      rsi,eax                ; rsi = 2*stride
+  movsx       eax,r11w               ; eax = arg3 as a signed 16-bit value
+  sub         rdi,rcx                ; rdi = address of r-3
+  sub         rbx,rsi                ; rbx = address of r-2
+  mov         r8,rbp 
+  sub         r8,r10                 ; r8 = address of r-1
+  movd        xmm0,eax 
+  movsx       eax,r9w                ; eax = arg4 as a signed 16-bit value
+  movdqa      xmm12,[rdi]            ; xmm12 = r-3
+  movdqa      xmm6, [rsi+rbp]        ; xmm6 = r+2
+  movdqa      xmm13,[rbx]            ; xmm13 = r-2
+  punpcklwd   xmm0,xmm0 
+  pshufd      xmm11,xmm0,0           ; xmm11 = arg3 in all 8 words
+  punpcklbw   xmm13,xmm1 
+  punpcklbw   xmm6,xmm1 
+  movdqa      xmm8,[r8]              ; xmm8 = r-1
+  movd        xmm0,eax 
+  movdqa      xmm10,xmm11 
+  mov         eax,2 
+  punpcklbw   xmm8,xmm1 
+  punpcklbw   xmm12,xmm1 
+  cwde             
+  punpcklwd   xmm0,xmm0 
+  psraw       xmm10,2                ; xmm10 = arg3 >> 2 in every word
+  movdqa      xmm1,xmm8 
+  movdqa      [rsp+0F0h],xmm13       ; [rsp+0F0h] = r-2, low half as words
+  movdqa      [rsp+0B0h],xmm8        ; [rsp+0B0h] = r-1, low half as words
+  pshufd      xmm7,xmm0,0            ; xmm7 = arg4 in all 8 words
+  psubw       xmm1,xmm13 
+  movdqa      xmm0,xmm5 
+  movdqa      xmm4,xmm7 
+  movdqa      xmm2,xmm7 
+  psubw       xmm0,xmm8 
+  pabsw       xmm3,xmm0              ; xmm3 = |r0 - r-1|. NOTE(review): pabsw is an SSSE3 instruction although the routine is named _sse2 - confirm the caller checks for SSSE3
+  pabsw       xmm0,xmm1              ; xmm0 = |r-1 - r-2|
+  movdqa      xmm1,xmm5 
+  movdqa      [rsp+40h],xmm7         ; [rsp+40h] = arg4 broadcast
+  movdqa      [rsp+60h],xmm6         ; [rsp+60h] = r+2, low half as words
+  pcmpgtw     xmm4,xmm0              ; xmm4 = arg4 > |r-1 - r-2|
+  psubw       xmm1,xmm14 
+  pabsw       xmm0,xmm1              ; xmm0 = |r0 - r+1|
+  pcmpgtw     xmm2,xmm0              ; xmm2 = arg4 > |r0 - r+1|
+  pand        xmm4,xmm2 
+  movdqa      xmm0,xmm11 
+  pcmpgtw     xmm0,xmm3              ; xmm0 = arg3 > |r0 - r-1|
+  pand        xmm4,xmm0 
+  movd        xmm0,eax 
+  movdqa      [rsp+20h],xmm4         ; [rsp+20h] = AND of the three comparisons above (low half)
+  punpcklwd   xmm0,xmm0 
+  pshufd      xmm2,xmm0,0            ; xmm2 = 2 in every word
+  paddw       xmm10,xmm2             ; xmm10 = (arg3 >> 2) + 2
+  movdqa      [rsp+0A0h],xmm2        ; [rsp+0A0h] = vector of 2s
+  movdqa      xmm15,xmm7 
+  pxor        xmm4,xmm4 
+  movdqa      xmm0,xmm8 
+  psubw       xmm0,xmm12 
+  mov         eax,4 
+  pabsw       xmm0,xmm0              ; xmm0 = |r-1 - r-3|
+  movdqa      xmm1,xmm10 
+  cwde             
+  pcmpgtw     xmm15,xmm0 
+  pcmpgtw     xmm1,xmm3              ; xmm1 = ((arg3 >> 2) + 2) > |r0 - r-1|
+  movdqa      xmm3,xmm7 
+  movdqa      xmm7,[rdx]             ; xmm7 = r-4
+  movdqa      xmm0,xmm5 
+  psubw       xmm0,xmm6 
+  pand        xmm15,xmm1             ; xmm15 = (arg4 > |r-1 - r-3|) & xmm1
+  punpcklbw   xmm7,xmm4 
+  movdqa      xmm9,xmm15 
+  pabsw       xmm0,xmm0              ; xmm0 = |r0 - r+2|
+  psllw       xmm7,1 
+  pandn       xmm9,xmm12 
+  pcmpgtw     xmm3,xmm0 
+  paddw       xmm7,xmm12 
+  movd        xmm0,eax 
+  pand        xmm3,xmm1              ; xmm3 = (arg4 > |r0 - r+2|) & xmm1
+  paddw       xmm7,xmm12 
+  punpcklwd   xmm0,xmm0 
+  paddw       xmm7,xmm12 
+  pshufd      xmm1,xmm0,0            ; xmm1 = 4 in every word
+  paddw       xmm7,xmm13 
+  movdqa      xmm0,xmm3 
+  pandn       xmm0,xmm6 
+  paddw       xmm7,xmm8 
+  movdqa      [rsp+70h],xmm1         ; [rsp+70h] = vector of 4s
+  paddw       xmm7,xmm5 
+  movdqa      [rsp+120h],xmm0        ; NOTE(review): [rsp+120h] is (entry rsp - 0C8h), the same address xmm15 was spilled to above
+  movdqa      xmm0,[rcx+rbp]         ; xmm0 = r+3
+  punpcklbw   xmm0,xmm4 
+  paddw       xmm7,xmm1 
+  movdqa      xmm4,xmm15 
+  psllw       xmm0,1 
+  psraw       xmm7,3                 ; xmm7 = (2*r-4 + 3*r-3 + r-2 + r-1 + r0 + 4) >> 3, low half
+  paddw       xmm0,xmm6 
+  pand        xmm7,xmm15 
+  paddw       xmm0,xmm6 
+  paddw       xmm0,xmm6 
+  paddw       xmm0,xmm14 
+  movdqa      xmm6,xmm15 
+  paddw       xmm0,xmm5 
+  pandn       xmm6,xmm13 
+  paddw       xmm0,xmm8 
+  paddw       xmm0,xmm1 
+  psraw       xmm0,3                 ; xmm0 = (2*r+3 + 3*r+2 + r+1 + r0 + r-1 + 4) >> 3, low half
+  movdqa      xmm1,xmm12 
+  paddw       xmm1,xmm13 
+  pand        xmm0,xmm3 
+  movdqa      [rsp+100h],xmm0 
+  movdqa      xmm0,xmm8 
+  paddw       xmm0,xmm5 
+  paddw       xmm1,xmm0 
+  movdqa      xmm0,xmm3 
+  paddw       xmm1,xmm2 
+  psraw       xmm1,2                 ; xmm1 = (r-3 + r-2 + r-1 + r0 + 2) >> 2, low half
+  pandn       xmm0,xmm14 
+  pand        xmm4,xmm1 
+  movdqa      [rsp+0E0h],xmm0 
+  movdqa      xmm0,xmm5 
+  paddw       xmm0,xmm8 
+  movdqa      xmm1,[rsp+60h] 
+  paddw       xmm1,xmm14 
+  movdqa      xmm14,xmm3 
+  paddw       xmm1,xmm0 
+  movdqa      xmm0,xmm8 
+  paddw       xmm0,[rsp+30h] 
+  paddw       xmm1,xmm2 
+  psraw       xmm1,2 
+  pand        xmm14,xmm1 
+  movdqa      xmm1,xmm13 
+  paddw       xmm1,xmm13 
+  paddw       xmm1,xmm0 
+  paddw       xmm1,xmm2 
+  psraw       xmm1,2 
+  movdqa      xmm0,[rsp+30h] 
+  movdqa      xmm2,xmm13 
+  movdqa      xmm5,xmm15 
+  paddw       xmm0,[rsp+70h] 
+  pandn       xmm5,xmm1 
+  paddw       xmm2,xmm8 
+  movdqa      xmm8,[rsp+90h] 
+  movdqa      xmm1,xmm12 
+  paddw       xmm2,xmm8 
+  psllw       xmm2,1 
+  paddw       xmm2,xmm0 
+  paddw       xmm1,xmm2 
+  movdqa      xmm0,xmm8 
+  movdqa      xmm8,xmm3 
+  movdqa      xmm2,[rsp+30h] 
+  paddw       xmm0,xmm13 
+  psraw       xmm1,3 
+  pand        xmm15,xmm1 
+  movdqa      xmm1,xmm2 
+  paddw       xmm1,xmm2 
+  paddw       xmm2,[rsp+90h] 
+  paddw       xmm2,[rsp+0B0h] 
+  paddw       xmm1,xmm0 
+  movdqa      xmm0,xmm13 
+  movdqa      xmm13,[r8]             ; reload r-1 for the high 8 bytes
+  paddw       xmm0, [rsp+70h] 
+  paddw       xmm1, [rsp+0A0h] 
+  psllw       xmm2,1 
+  paddw       xmm2,xmm0 
+  psraw       xmm1,2 
+  movdqa      xmm0, [rdi]            ; reload r-3 for the high 8 bytes
+  pandn       xmm8,xmm1 
+  movdqa      xmm1, [rsp+60h] 
+  paddw       xmm1,xmm2 
+  movdqa      xmm2, [rbx]            ; reload r-2 for the high 8 bytes
+  psraw       xmm1,3 
+  pand        xmm3,xmm1 
+  movdqa      xmm1, [rbp]            ; reload r0 for the high 8 bytes
+  movdqa      [rsp+0D0h],xmm3 
+  pxor        xmm3,xmm3 
+  punpckhbw   xmm0,xmm3              ; high 8 bytes widened to words (likewise for the punpckhbw lines that follow)
+  punpckhbw   xmm1,xmm3 
+  punpckhbw   xmm13,xmm3 
+  movdqa      [rsp+0C0h],xmm0        ; [rsp+0C0h] = r-3, high half as words
+  movdqa      xmm0,[r10+rbp] 
+  movdqa      [rsp],xmm1             ; [rsp] = r0, high half as words
+  punpckhbw   xmm0,xmm3 
+  punpckhbw   xmm2,xmm3 
+  movdqa      [rsp+80h],xmm0         ; [rsp+80h] = r+1, high half as words
+  movdqa      xmm0,[rsi+rbp] 
+  movdqa      [rsp+10h],xmm13        ; [rsp+10h] = r-1, high half as words
+  punpckhbw   xmm0,xmm3 
+  movdqa      [rsp+50h],xmm0         ; [rsp+50h] = r+2, high half as words
+  movdqa      xmm0,xmm1 
+  movdqa      xmm1,xmm13 
+  psubw       xmm0,xmm13 
+  psubw       xmm1,xmm2 
+  pabsw       xmm3,xmm0              ; xmm3 = |r0 - r-1|, high half
+  pabsw       xmm0,xmm1 
+  movdqa      xmm1,[rsp] 
+  movdqa      xmm13,[rsp+40h] 
+  movdqa      [rsp+110h],xmm2        ; [rsp+110h] = r-2, high half as words
+  psubw       xmm1, [rsp+80h] 
+  pcmpgtw     xmm13,xmm0 
+  pcmpgtw     xmm11,xmm3 
+  pabsw       xmm0,xmm1 
+  pcmpgtw     xmm10,xmm3             ; xmm10 = ((arg3 >> 2) + 2) > |r0 - r-1|, high half
+  movdqa      xmm1, [rsp+40h] 
+  movdqa      xmm2,xmm1 
+  movdqa      xmm3,xmm1 
+  pcmpgtw     xmm2,xmm0 
+  movdqa      xmm0, [rsp+10h] 
+  pand        xmm13,xmm2 
+  pand        xmm13,xmm11            ; xmm13 = high-half counterpart of the mask kept at [rsp+20h]
+  movdqa      xmm11,[rsp+0C0h] 
+  psubw       xmm0,xmm11 
+  pabsw       xmm0,xmm0 
+  pcmpgtw     xmm3,xmm0 
+  pand        xmm3,xmm10 
+  movdqa      xmm0,[rsp] 
+  psubw       xmm0,[rsp+50h] 
+  movdqa      xmm2,[rdx]             ; xmm2 = r-4 (for the high half)
+  pabsw       xmm0,xmm0 
+  por         xmm7,xmm9 
+  movdqa      xmm9,[rsp+20h] 
+  pcmpgtw     xmm1,xmm0 
+  pand        xmm9,xmm7 
+  movdqa      xmm7,[rsp+20h] 
+  movdqa      xmm0,xmm7 
+  pandn       xmm0,xmm12 
+  movdqa      xmm12,[rsp+110h] 
+  pand        xmm1,xmm10 
+  movdqa      xmm10,[rsp+70h] 
+  movdqa      [rsp+40h],xmm1         ; [rsp+40h] is reused: it now holds a high-half mask instead of the arg4 broadcast
+  movdqa      xmm1,xmm13 
+  por         xmm9,xmm0 
+  pxor        xmm0,xmm0 
+  por         xmm4,xmm6 
+  movdqa      xmm6,xmm7 
+  punpckhbw   xmm2,xmm0 
+  por         xmm15,xmm5 
+  movdqa      xmm5,[rsp+20h] 
+  movdqa      xmm0,xmm3 
+  psllw       xmm2,1 
+  pandn       xmm0,xmm11 
+  pand        xmm6,xmm4 
+  movdqa      xmm4,[rsp] 
+  paddw       xmm2,xmm11 
+  pand        xmm5,xmm15 
+  movdqa      xmm15,[rsp+20h] 
+  paddw       xmm2,xmm11 
+  paddw       xmm2,xmm11 
+  paddw       xmm2,xmm12 
+  paddw       xmm2,[rsp+10h] 
+  paddw       xmm2,[rsp] 
+  paddw       xmm2,xmm10 
+  psraw       xmm2,3 
+  pand        xmm2,xmm3 
+  por         xmm2,xmm0 
+  pand        xmm1,xmm2 
+  movdqa      xmm0,xmm13 
+  movdqa      xmm2,xmm11 
+  pandn       xmm0,xmm11 
+  paddw       xmm2,xmm12 
+  por         xmm1,xmm0 
+  packuswb    xmm9,xmm1              ; xmm9 = new r-3: low-half and high-half words packed to 16 bytes with unsigned saturation
+  movdqa      xmm0,xmm7 
+  movdqa      xmm7,[rsp+0A0h] 
+  pandn       xmm0,[rsp+0F0h] 
+  movdqa      xmm1,xmm3 
+  por         xmm6,xmm0 
+  movdqa      xmm0,[rsp+10h] 
+  paddw       xmm0,xmm4 
+  paddw       xmm2,xmm0 
+  paddw       xmm2,xmm7 
+  movdqa      xmm0,xmm3 
+  pandn       xmm0,xmm12 
+  psraw       xmm2,2 
+  pand        xmm1,xmm2 
+  por         xmm1,xmm0 
+  movdqa      xmm2,xmm13 
+  movdqa      xmm0,xmm13 
+  pand        xmm2,xmm1 
+  pandn       xmm0,xmm12 
+  movdqa      xmm1,xmm12 
+  paddw       xmm1,[rsp+10h] 
+  por         xmm2,xmm0 
+  movdqa      xmm0,xmm15 
+  pandn       xmm0,[rsp+0B0h] 
+  paddw       xmm1,xmm4 
+  packuswb    xmm6,xmm2              ; xmm6 = new r-2, packed to bytes
+  movdqa      xmm2,xmm3 
+  psllw       xmm1,1 
+  por         xmm5,xmm0 
+  movdqa      xmm0,[rsp+80h] 
+  paddw       xmm0,xmm10 
+  paddw       xmm1,xmm0 
+  paddw       xmm11,xmm1 
+  psraw       xmm11,3 
+  movdqa      xmm1,xmm12 
+  pand        xmm2,xmm11 
+  paddw       xmm1,xmm12 
+  movdqa      xmm11,[rsp+80h] 
+  movdqa      xmm0, [rsp+10h] 
+  por         xmm14,[rsp+0E0h] 
+  paddw       xmm0,xmm11 
+  movdqa      xmm4,xmm15 
+  paddw       xmm1,xmm0 
+  movdqa      xmm0,xmm13 
+  paddw       xmm1,xmm7 
+  psraw       xmm1,2 
+  pandn       xmm3,xmm1 
+  por         xmm2,xmm3 
+  movdqa      xmm1,xmm13 
+  movdqa      xmm3,[rsp+10h] 
+  pandn       xmm0,xmm3 
+  pand        xmm1,xmm2 
+  movdqa      xmm2,xmm11 
+  paddw       xmm2,[rsp] 
+  por         xmm1,xmm0 
+  movdqa      xmm0,[rsp+0D0h] 
+  por         xmm0,xmm8 
+  paddw       xmm2,xmm3 
+  packuswb    xmm5,xmm1              ; xmm5 = new r-1, packed to bytes
+  movdqa      xmm8,[rsp+40h] 
+  movdqa      xmm1,[rsp+50h] 
+  movdqa      xmm3,xmm8 
+  pand        xmm4,xmm0 
+  psllw       xmm2,1 
+  movdqa      xmm0,xmm15 
+  pandn       xmm0,[rsp+90h] 
+  por         xmm4,xmm0 
+  movdqa      xmm0,xmm12 
+  paddw       xmm0,xmm10 
+  paddw       xmm2,xmm0 
+  paddw       xmm1,xmm2 
+  movdqa      xmm0,[rsp] 
+  movdqa      xmm2,xmm11 
+  paddw       xmm0,xmm12 
+  movdqa      xmm12,[rsp] 
+  paddw       xmm2,xmm11 
+  paddw       xmm2,xmm0 
+  psraw       xmm1,3 
+  movdqa      xmm0,xmm8 
+  pand        xmm3,xmm1 
+  paddw       xmm2,xmm7 
+  movdqa      xmm1,xmm13 
+  psraw       xmm2,2 
+  pandn       xmm0,xmm2 
+  por         xmm3,xmm0 
+  movdqa      xmm2,[rsp+50h] 
+  movdqa      xmm0,xmm13 
+  pandn       xmm0,xmm12 
+  pand        xmm1,xmm3 
+  paddw       xmm2,xmm11 
+  movdqa      xmm3,xmm15 
+  por         xmm1,xmm0 
+  pand        xmm3,xmm14 
+  movdqa      xmm14,[rsp+10h] 
+  movdqa      xmm0,xmm15 
+  pandn       xmm0,[rsp+30h] 
+  packuswb    xmm4,xmm1              ; xmm4 = new r0, packed to bytes
+  movdqa      xmm1,xmm8 
+  por         xmm3,xmm0 
+  movdqa      xmm0,xmm12 
+  paddw       xmm0,xmm14 
+  paddw       xmm2,xmm0 
+  paddw       xmm2,xmm7 
+  movdqa      xmm0,xmm8 
+  pandn       xmm0,xmm11 
+  psraw       xmm2,2 
+  pand        xmm1,xmm2 
+  por         xmm1,xmm0 
+  movdqa      xmm2,xmm13 
+  movdqa      xmm0,xmm13 
+  pandn       xmm0,xmm11 
+  pand        xmm2,xmm1 
+  movdqa      xmm1,xmm15 
+  por         xmm2,xmm0 
+  packuswb    xmm3,xmm2              ; xmm3 = new r+1, packed to bytes
+  movdqa      xmm0,[rsp+100h] 
+  por         xmm0,[rsp+120h] 
+  pand        xmm1,xmm0 
+  movdqa      xmm2,[rcx+rbp]         ; xmm2 = r+3 (for the high half)
+  movdqa      xmm7,[rsp+50h] 
+  pandn       xmm15,[rsp+60h] 
+  lea         r11,[rsp+1D8h]         ; r11 = entry rsp - 10h, i.e. the address of the pushed rbp
+  pxor        xmm0,xmm0 
+  por         xmm1,xmm15 
+  movaps      xmm15,[r11-0A8h]       ; NOTE(review): every reload from [r11-..] below reads 10h above the slot that register was spilled to ([r11-0A8h] = entry rsp - 0B8h, but xmm15 was stored at entry rsp - 0C8h); the reloaded registers are not read again, and this appears harmless if xmm6-xmm15 are caller-saved on the target ABI - confirm
+  movdqa      [rdi],xmm9             ; store new r-3
+  movaps      xmm9,[r11-48h] 
+  punpckhbw   xmm2,xmm0 
+  psllw       xmm2,1 
+  paddw       xmm2,xmm7 
+  paddw       xmm2,xmm7 
+  movdqa      [rbx],xmm6             ; store new r-2
+  movaps      xmm6,[r11-18h]         ; reads [entry rsp - 28h], which this routine never writes (see the note on the xmm15 reload above)
+  paddw       xmm2,xmm7 
+  paddw       xmm2,xmm11 
+  movaps      xmm11,[r11-68h] 
+  paddw       xmm2,xmm12 
+  movaps      xmm12,[r11-78h] 
+  paddw       xmm2,xmm14 
+  paddw       xmm2,xmm10 
+  psraw       xmm2,3 
+  movaps      xmm10,[r11-58h] 
+  movaps      xmm14,[r11-98h] 
+  movdqa      xmm0,xmm13 
+  pand        xmm2,xmm8 
+  pandn       xmm8,xmm7 
+  pandn       xmm13,xmm7 
+  por         xmm2,xmm8 
+  movaps      xmm7,[r11-28h] 
+  movaps      xmm8,[r11-38h] 
+  movdqa      [r8],xmm5              ; store new r-1
+  pand        xmm0,xmm2 
+  por         xmm0,xmm13 
+  packuswb    xmm1,xmm0              ; xmm1 = new r+2, packed to bytes
+  movaps      xmm13,[r11-88h] 
+  movdqa      [rbp],xmm4             ; store new r0
+  movdqa      [r10+rbp],xmm3         ; store new r+1
+  movdqa      [rsi+rbp],xmm1         ; store new r+2
+  mov         rsp,r11                ; release the locals; rsp now points at the pushed rbp
+  pop         rbp  
+  pop         rbx  
+  ret
+
+WELS_EXTERN  DeblockChromaLt4V_sse2
+ALIGN  16 
+DeblockChromaLt4V_sse2:              ; adjusts rows r-1 and r0 of two 8-byte-wide planes by a clamped delta; rN = [plane pointer + N*stride]
+  mov         rax,rsp 
+  push        rbx                    ; rbx and rbp are used as scratch and popped in the epilogue
+  push        rbp    
+  mov         r10,  rdx              ; the moves below leave: rcx = 1st argument (pointer A), rdx = 2nd argument (pointer B),
+  mov         r11,  rcx              ;   r8 = 3rd argument (stride), r9 = 4th argument (16-bit threshold, presumably iAlpha - confirm),
+  mov         rcx,  rdi              ;   rbp = 5th argument (iBeta), r10 = 6th argument (pointer to 4 signed bytes, presumably pTC - confirm)
+  mov         rdx,  rsi  
+  mov         rsi,  r10
+  mov         r10,  r9
+  mov         rbp,  r8
+  mov         r8,   rsi
+  mov         r9,   r11
+  sub         rsp,0C8h               ; reserve locals; rsp is now (entry rsp - 0D8h)
+  pxor        xmm1,xmm1              ; zero register used to widen bytes to words
+  mov         rbx,rcx                ; rbx = pointer A
+  movsxd      r11,r8d                ; r11 = stride, sign-extended from 32 bits
+  movsx       ecx,byte [r10]         ; ecx = signed byte 0 of the 6th argument
+  movsx       r8d,byte [r10+2]       ; r8d = signed byte 2
+  mov         rdi,rdx                ; rdi = pointer B
+  movq        xmm2,[rbx]             ; xmm2 = 8 bytes of r0 from plane A
+  movq        xmm9,[r11+rbx]         ; xmm9 = 8 bytes of r+1 from plane A
+  movsx       edx,byte [r10+1]       ; edx = signed byte 1
+  mov         word [rsp+2],cx        ; each of the 4 bytes is written to two adjacent words of [rsp]
+  mov         word [rsp],cx 
+  movsx       eax,byte [r10+3]       ; eax = signed byte 3
+  mov         word [rsp+6],dx 
+  mov         word [rsp+4],dx 
+  movdqa      xmm11,xmm1 
+  mov         word [rsp+0Eh],ax 
+  mov         word [rsp+0Ch],ax 
+  lea         eax,[r11+r11]          ; eax = 2*stride
+  movsxd      rcx,eax 
+  mov         rax,rbx 
+  mov         rdx,rdi 
+  sub         rax,rcx                ; rax = address of r-2 in plane A
+  mov         word [rsp+0Ah],r8w 
+  mov         word [rsp+8],r8w 
+  movdqa      xmm6,[rsp]             ; xmm6 = the 8 limit words built above ("tc")
+  movdqa      xmm7,xmm6 
+  movq        xmm13, [rax]           ; xmm13 = r-2 from plane A
+  mov         rax,rdi 
+  sub         rax,rcx                ; rax = address of r-2 in plane B
+  mov         rcx,rbx 
+  pcmpgtw     xmm7,xmm1              ; xmm7 = mask of limit words that are > 0
+  psubw       xmm11,xmm6             ; xmm11 = negated limit words
+  sub         rcx,r11                ; rcx = address of r-1 in plane A
+  sub         rdx,r11                ; rdx = address of r-1 in plane B
+  movq        xmm0,[rax] 
+  movsx       eax,r9w                ; eax = 4th argument as a signed 16-bit value
+  movq        xmm15,[rcx] 
+  punpcklqdq  xmm13,xmm0             ; xmm13 = r-2: plane A in the low 8 bytes, plane B in the high 8 bytes
+  movq        xmm0, [rdx] 
+  movdqa      xmm4,xmm13 
+  punpcklqdq  xmm15,xmm0             ; xmm15 = r-1: plane A | plane B
+  movq        xmm0, [rdi] 
+  punpcklbw   xmm4,xmm1              ; xmm4 = r-2 of plane A as words
+  movdqa      xmm12,xmm15 
+  punpcklqdq  xmm2,xmm0              ; xmm2 = r0: plane A | plane B
+  movq        xmm0, [r11+rdi] 
+  punpcklbw   xmm12,xmm1             ; xmm12 = r-1 of plane A as words
+  movdqa      xmm14,xmm2 
+  punpcklqdq  xmm9,xmm0              ; xmm9 = r+1: plane A | plane B
+  punpckhbw   xmm2,xmm1              ; xmm2 = r0 of plane B as words
+  punpcklbw   xmm14,xmm1             ; xmm14 = r0 of plane A as words
+  movd        xmm0,eax 
+  mov         eax, ebp ; iBeta
+  punpckhbw   xmm13,xmm1             ; xmm13 = r-2 of plane B as words
+  punpckhbw   xmm15,xmm1             ; xmm15 = r-1 of plane B as words
+  movdqa      xmm3,xmm9 
+  movdqa      [rsp+10h],xmm2         ; [rsp+10h] = r0 of plane B as words
+  punpcklwd   xmm0,xmm0 
+  punpckhbw   xmm9,xmm1              ; xmm9 = r+1 of plane B as words
+  punpcklbw   xmm3,xmm1              ; xmm3 = r+1 of plane A as words
+  movdqa      xmm1,xmm14 
+  pshufd      xmm10,xmm0,0           ; xmm10 = 4th argument in all 8 words
+  movd        xmm0,eax 
+  mov         eax,4 
+  cwde             
+  punpcklwd   xmm0,xmm0 
+  pshufd      xmm8,xmm0,0            ; xmm8 = iBeta in all 8 words
+  movd        xmm0,eax 
+  punpcklwd   xmm0,xmm0 
+  pshufd      xmm5,xmm0,0            ; xmm5 = 4 in every word
+  psubw       xmm1,xmm12 
+  movdqa      xmm2,xmm10 
+  lea         r11,[rsp+0C8h]         ; r11 = entry rsp - 10h (the address of the pushed rbp); the stride in r11 is not needed past this point
+  psllw       xmm1,2 
+  movdqa      xmm0,xmm4 
+  psubw       xmm4,xmm12 
+  psubw       xmm0,xmm3 
+  psubw       xmm3,xmm14 
+  paddw       xmm1,xmm0 
+  paddw       xmm1,xmm5 
+  movdqa      xmm0,xmm11 
+  psraw       xmm1,3                 ; xmm1 = (4*(r0 - r-1) + (r-2 - r+1) + 4) >> 3 for plane A
+  pmaxsw      xmm0,xmm1 
+  pminsw      xmm6,xmm0              ; xmm6 = that delta clamped to [-tc, tc]
+  movdqa      xmm1,xmm8 
+  movdqa      xmm0,xmm12 
+  psubw       xmm0,xmm14 
+  pabsw       xmm0,xmm0              ; xmm0 = |r-1 - r0|. NOTE(review): pabsw is an SSSE3 instruction although the routine is named _sse2 - confirm the caller checks for SSSE3
+  pcmpgtw     xmm2,xmm0              ; xmm2 = 4th argument > |r-1 - r0|
+  pabsw       xmm0,xmm4              ; xmm0 = |r-2 - r-1|
+  pcmpgtw     xmm1,xmm0 
+  pabsw       xmm0,xmm3              ; xmm0 = |r+1 - r0|
+  movdqa      xmm3,[rsp]             ; xmm3 = limit words again, for plane B
+  pand        xmm2,xmm1 
+  movdqa      xmm1,xmm8 
+  pcmpgtw     xmm1,xmm0 
+  movdqa      xmm0,xmm13 
+  pand        xmm2,xmm1 
+  psubw       xmm0,xmm9 
+  psubw       xmm13,xmm15 
+  pand        xmm2,xmm7              ; also require the limit word to be > 0
+  pand        xmm6,xmm2 
+  paddw       xmm12,xmm6             ; plane A: r-1 += masked delta
+  psubw       xmm14,xmm6             ; plane A: r0  -= masked delta
+  movdqa      xmm2,[rsp+10h] 
+  movaps      xmm6,[r11-18h]         ; NOTE(review): this and the later movaps loads from [r11-..], [rsp+20h], [rsp+30h] and [rsp+40h] read stack bytes this routine never writes; the loaded registers are not read again - confirm the loads can be dropped
+  movdqa      xmm1,xmm2 
+  psubw       xmm1,xmm15 
+  psubw       xmm9,xmm2 
+  psllw       xmm1,2 
+  paddw       xmm1,xmm0 
+  paddw       xmm1,xmm5 
+  movdqa      xmm0,xmm15 
+  psubw       xmm0,xmm2 
+  psraw       xmm1,3                 ; same delta formula, for plane B
+  pmaxsw      xmm11,xmm1 
+  pabsw       xmm0,xmm0 
+  movdqa      xmm1,xmm8 
+  pcmpgtw     xmm10,xmm0 
+  pabsw       xmm0,xmm13 
+  pminsw      xmm3,xmm11             ; xmm3 = plane B delta clamped to [-tc, tc]
+  movaps      xmm11,[r11-68h] 
+  movaps      xmm13,[rsp+40h] 
+  pcmpgtw     xmm1,xmm0 
+  pabsw       xmm0,xmm9 
+  movaps      xmm9, [r11-48h] 
+  pand        xmm10,xmm1 
+  pcmpgtw     xmm8,xmm0 
+  pand        xmm10,xmm8 
+  pand        xmm10,xmm7 
+  movaps      xmm8,[r11-38h] 
+  movaps      xmm7,[r11-28h] 
+  pand        xmm3,xmm10 
+  paddw       xmm15,xmm3             ; plane B: r-1 += masked delta
+  psubw       xmm2,xmm3              ; plane B: r0  -= masked delta
+  movaps      xmm10,[r11-58h] 
+  packuswb    xmm12,xmm15            ; xmm12 = new r-1: plane A | plane B, packed to bytes with unsigned saturation
+  movaps      xmm15,[rsp+20h] 
+  packuswb    xmm14,xmm2             ; xmm14 = new r0: plane A | plane B
+  movq        [rcx],xmm12            ; store r-1 of plane A
+  movq        [rbx],xmm14            ; store r0 of plane A
+  psrldq      xmm12,8 
+  psrldq      xmm14,8 
+  movq        [rdx],xmm12            ; store r-1 of plane B
+  movaps      xmm12,[r11-78h] 
+  movq        [rdi],xmm14            ; store r0 of plane B
+  movaps      xmm14,[rsp+30h] 
+  mov         rsp,r11                ; release the locals; rsp now points at the pushed rbp
+  pop         rbp  
+  pop         rbx  
+  ret
+
+WELS_EXTERN   DeblockChromaEq4V_sse2
+ALIGN 16
+DeblockChromaEq4V_sse2:              ; replaces rows r-1 and r0 of two 8-byte-wide planes with masked averages; rN = [plane pointer + N*stride]
+  mov         rax,rsp 
+  push        rbx                    ; rbx and rbp are used as scratch and popped in the epilogue
+  push        rbp
+
+  mov         rbp, r8                ; rbp = 5th argument (iBeta)
+  mov         r8, rdx                ; r8  = 3rd argument (stride)
+  mov         r9, rcx                ; r9  = 4th argument (16-bit threshold, presumably iAlpha - confirm against the C prototype)
+  mov         rcx, rdi               ; rcx = 1st argument (pointer A)
+  mov         rdx, rsi               ; rdx = 2nd argument (pointer B)
+  
+  sub         rsp,90h                ; frame is reserved, but no stack slot is read or written in this routine
+  pxor        xmm1,xmm1              ; zero register used to widen bytes to words
+  mov         r11,rcx                ; r11 = pointer A
+  mov         rbx,rdx                ; rbx = pointer B
+  mov         r10d,r9d               ; r10d = 4th argument
+  movq        xmm13,[r11]            ; xmm13 = 8 bytes of r0 from plane A
+  lea         eax,[r8+r8]            ; eax = 2*stride
+  movsxd      r9,eax 
+  mov         rax,rcx 
+  sub         rax,r9                 ; rax = address of r-2 in plane A
+  movq        xmm14,[rax] 
+  mov         rax,rdx 
+  sub         rax,r9                 ; rax = address of r-2 in plane B
+  movq        xmm0,[rax] 
+  movsxd      rax,r8d                ; rax = stride, sign-extended from 32 bits
+  sub         rcx,rax                ; rcx = address of r-1 in plane A
+  sub         rdx,rax                ; rdx = address of r-1 in plane B
+  movq        xmm12,[rax+r11]        ; xmm12 = r+1 from plane A
+  movq        xmm10,[rcx]            ; xmm10 = r-1 from plane A
+  punpcklqdq  xmm14,xmm0             ; xmm14 = r-2: plane A in the low 8 bytes, plane B in the high 8 bytes
+  movdqa      xmm8,xmm14 
+  movq        xmm0,[rdx] 
+  punpcklbw   xmm8,xmm1              ; xmm8 = r-2 of plane A as words
+  punpckhbw   xmm14,xmm1             ; xmm14 = r-2 of plane B as words
+  punpcklqdq  xmm10,xmm0             ; xmm10 = r-1: plane A | plane B
+  movq        xmm0,[rbx] 
+  movdqa      xmm5,xmm10 
+  punpcklqdq  xmm13,xmm0             ; xmm13 = r0: plane A | plane B
+  movq        xmm0, [rax+rbx] 
+  punpcklbw   xmm5,xmm1              ; xmm5 = r-1 of plane A as words
+  movsx       eax,r10w               ; eax = 4th argument as a signed 16-bit value
+  movdqa      xmm9,xmm13 
+  punpcklqdq  xmm12,xmm0             ; xmm12 = r+1: plane A | plane B
+  punpcklbw   xmm9,xmm1              ; xmm9 = r0 of plane A as words
+  punpckhbw   xmm10,xmm1             ; xmm10 = r-1 of plane B as words
+  movd        xmm0,eax 
+  mov         eax, ebp   ; iBeta
+  punpckhbw   xmm13,xmm1             ; xmm13 = r0 of plane B as words
+  movdqa      xmm7,xmm12 
+  punpcklwd   xmm0,xmm0 
+  punpckhbw   xmm12,xmm1             ; xmm12 = r+1 of plane B as words
+  pshufd      xmm11,xmm0,0           ; xmm11 = 4th argument in all 8 words
+  punpcklbw   xmm7,xmm1              ; xmm7 = r+1 of plane A as words
+  movd        xmm0,eax 
+  movdqa      xmm1,xmm8 
+  psubw       xmm1,xmm5 
+  punpcklwd   xmm0,xmm0 
+  movdqa      xmm6,xmm11 
+  pshufd      xmm3,xmm0,0            ; xmm3 = iBeta in all 8 words
+  movdqa      xmm0,xmm5 
+  psubw       xmm0,xmm9 
+  movdqa      xmm2,xmm3 
+  pabsw       xmm0,xmm0              ; xmm0 = |r-1 - r0|, plane A. NOTE(review): pabsw is an SSSE3 instruction although the routine is named _sse2 - confirm the caller checks for SSSE3
+  pcmpgtw     xmm6,xmm0              ; xmm6 = 4th argument > |r-1 - r0|
+  pabsw       xmm0,xmm1              ; xmm0 = |r-2 - r-1|
+  movdqa      xmm1,xmm3 
+  pcmpgtw     xmm2,xmm0 
+  pand        xmm6,xmm2 
+  movdqa      xmm0,xmm7 
+  movdqa      xmm2,xmm3 
+  psubw       xmm0,xmm9 
+  pabsw       xmm0,xmm0              ; xmm0 = |r+1 - r0|
+  pcmpgtw     xmm1,xmm0 
+  pand        xmm6,xmm1              ; xmm6 = plane A filter mask (all three comparisons true)
+  movdqa      xmm0,xmm10 
+  movdqa      xmm1,xmm14 
+  psubw       xmm0,xmm13 
+  psubw       xmm1,xmm10 
+  pabsw       xmm0,xmm0 
+  pcmpgtw     xmm11,xmm0 
+  pabsw       xmm0,xmm1 
+  pcmpgtw     xmm2,xmm0 
+  pand        xmm11,xmm2 
+  movdqa      xmm0,xmm12 
+  movdqa      xmm4,xmm6 
+  movdqa      xmm1,xmm8 
+  mov         eax,2 
+  cwde             
+  paddw       xmm1,xmm8 
+  psubw       xmm0,xmm13 
+  paddw       xmm1,xmm5 
+  pabsw       xmm0,xmm0 
+  movdqa      xmm2,xmm14 
+  paddw       xmm1,xmm7 
+  pcmpgtw     xmm3,xmm0 
+  paddw       xmm2,xmm14 
+  movd        xmm0,eax 
+  pand        xmm11,xmm3             ; xmm11 = plane B filter mask
+  paddw       xmm7,xmm7 
+  paddw       xmm2,xmm10 
+  punpcklwd   xmm0,xmm0 
+  paddw       xmm2,xmm12 
+  paddw       xmm12,xmm12 
+  pshufd      xmm3,xmm0,0            ; xmm3 = 2 in every word (rounding term)
+  paddw       xmm7,xmm9 
+  paddw       xmm12,xmm13 
+  movdqa      xmm0,xmm6 
+  paddw       xmm1,xmm3 
+  pandn       xmm0,xmm5 
+  paddw       xmm7,xmm8 
+  psraw       xmm1,2                 ; xmm1 = (2*r-2 + r-1 + r+1 + 2) >> 2, plane A
+  paddw       xmm12,xmm14 
+  paddw       xmm7,xmm3 
+  ;movaps      xmm14,[rsp] 
+  pand        xmm4,xmm1 
+  paddw       xmm12,xmm3 
+  psraw       xmm7,2                 ; xmm7 = (2*r+1 + r0 + r-2 + 2) >> 2, plane A
+  movdqa      xmm1,xmm11 
+  por         xmm4,xmm0              ; keep the original r-1 where the mask is clear
+  psraw       xmm12,2                ; same formula as xmm7, plane B
+  paddw       xmm2,xmm3 
+  movdqa      xmm0,xmm11 
+  pandn       xmm0,xmm10 
+  psraw       xmm2,2                 ; same formula as xmm1, plane B
+  pand        xmm1,xmm2 
+  por         xmm1,xmm0 
+  packuswb    xmm4,xmm1              ; xmm4 = new r-1: plane A | plane B, packed to bytes with unsigned saturation
+  movdqa      xmm0,xmm11 
+  movdqa      xmm1,xmm6 
+  pand        xmm1,xmm7 
+  movq        [rcx],xmm4             ; store r-1 of plane A
+  pandn       xmm6,xmm9 
+  pandn       xmm11,xmm13 
+  pand        xmm0,xmm12 
+  por         xmm1,xmm6 
+  por         xmm0,xmm11 
+  psrldq      xmm4,8 
+  packuswb    xmm1,xmm0              ; xmm1 = new r0: plane A | plane B
+  movq        [r11],xmm1             ; store r0 of plane A
+  psrldq      xmm1,8 
+  movq        [rdx],xmm4             ; store r-1 of plane B
+  lea         r11,[rsp+90h]          ; r11 = address of the pushed rbp
+  movq        [rbx],xmm1             ; store r0 of plane B
+  mov         rsp,r11                ; release the frame
+  pop         rbp
+  pop         rbx  
+  ret
+
+
+;***************************************************************************
+;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;          int32_t iAlpha, int32_t iBeta)
+;
+;  64-bit variant: arguments are taken from rdi, rsi, rdx, rcx, r8 in the
+;  order above (presumably the System V AMD64 convention -- confirm against
+;  the enclosing %if branch, which is outside this view).
+;
+;  For each of the two pointers, 8 rows of 4 bytes starting at ptr-2 are
+;  gathered; call the four bytes of one row p1 p0 q0 q1.  Where
+;      iAlpha > |p0-q0|  and  iBeta > |p1-p0|  and  iBeta > |q1-q0|
+;  the two middle bytes are replaced by
+;      p0' = (2*p1 + p0 + q1 + 2) >> 2
+;      q0' = (2*q1 + q0 + p1 + 2) >> 2
+;  and all 4 bytes of every row are stored back.
+;
+;  NOTE(review): pabsw is an SSSE3 instruction, so despite the _sse2 suffix
+;  this routine requires SSSE3 -- confirm the caller's CPU-feature check.
+;***************************************************************************
+WELS_EXTERN   DeblockChromaEq4H_sse2
+ALIGN  16
+DeblockChromaEq4H_sse2:
+  ; rax is overwritten (lea eax,...) before it is ever read.
+  mov         rax,rsp 
+  push        rbx 
+  push        rbp 
+  ; r12 is not referenced by the body; with this third push rsp is 16-byte
+  ; aligned for the movdqa stack accesses below (assuming rsp = 8 mod 16
+  ; at entry, i.e. directly after a call).
+  push        r12
+  
+  ; Re-map the incoming arguments onto the registers used by the body:
+  ;   rbp = iBeta, r8 = iStride, r9 = iAlpha,
+  ;   rcx = pPixCb, rdx = rdi = pPixCr
+  mov         rbp,   r8  
+  mov         r8,    rdx
+  mov         r9,    rcx
+  mov         rcx,   rdi
+  mov         rdx,   rsi  
+  mov         rdi,   rdx
+
+  sub         rsp,140h     
+  ; r10 = 4*iStride (sign-extended); rbx/r11 address row 4 of each plane.
+  lea         eax,[r8*4] 
+  movsxd      r10,eax 
+  mov         eax,[rcx-2] 
+  mov         [rsp+10h],eax 
+  lea         rbx,[r10+rdx-2] 
+  lea         r11,[r10+rcx-2] 
+
+  ; Gather: each 4-byte row is bounced through a 16-byte stack slot so it can
+  ; be picked up with movdqa/punpckldq; only the low dword of a slot is
+  ; meaningful.  From here r10 = iStride and rdx = 3*iStride.
+  ; xmm5/xmm2/xmm4/xmm3 collect rows 0/1/2/3 as dwords
+  ;   { [rcx-2], [rdi-2], [r11], [rbx] }  (each at the matching row offset).
+  movdqa      xmm5,[rsp+10h] 
+  movsxd      r10,r8d 
+  mov         eax,[r10+rcx-2] 
+  lea         rdx,[r10+r10*2] 
+  mov         [rsp+20h],eax 
+  mov         eax,[rcx+r10*2-2] 
+  mov         [rsp+30h],eax 
+  mov         eax,[rdx+rcx-2]
+  movdqa      xmm2,[rsp+20h] 
+  mov         [rsp+40h],eax 
+  mov         eax, [rdi-2] 
+  movdqa      xmm4,[rsp+30h] 
+  mov         [rsp+50h],eax 
+  mov         eax,[r10+rdi-2] 
+  movdqa      xmm3,[rsp+40h] 
+  mov         [rsp+60h],eax 
+  mov         eax,[rdi+r10*2-2] 
+  punpckldq   xmm5,[rsp+50h] 
+  mov         [rsp+70h],eax 
+  mov         eax, [rdx+rdi-2] 
+  punpckldq   xmm2, [rsp+60h] 
+  mov          [rsp+80h],eax 
+  mov         eax,[r11] 
+  punpckldq   xmm4, [rsp+70h] 
+  mov         [rsp+50h],eax 
+  mov         eax,[rbx] 
+  punpckldq   xmm3,[rsp+80h] 
+  mov         [rsp+60h],eax 
+  mov         eax,[r10+r11] 
+  movdqa      xmm0, [rsp+50h] 
+  punpckldq   xmm0, [rsp+60h] 
+  punpcklqdq  xmm5,xmm0 
+  movdqa      [rsp+50h],xmm0 
+  mov         [rsp+50h],eax 
+  mov         eax,[r10+rbx] 
+  movdqa      xmm0,[rsp+50h] 
+  movdqa      xmm1,xmm5 
+  mov         [rsp+60h],eax 
+  mov         eax,[r11+r10*2] 
+  punpckldq   xmm0, [rsp+60h] 
+  punpcklqdq  xmm2,xmm0 
+  ; Transpose (byte -> word -> dword -> qword interleaves) so that each
+  ; register holds one byte column for all 16 rows:
+  ;   xmm11 = p1, xmm12 = p0, xmm14 = q0, xmm15 = q1
+  punpcklbw   xmm1,xmm2 
+  punpckhbw   xmm5,xmm2 
+  movdqa      [rsp+50h],xmm0 
+  mov         [rsp+50h],eax 
+  mov         eax,[rbx+r10*2] 
+  movdqa      xmm0,[rsp+50h] 
+  mov         [rsp+60h],eax 
+  mov         eax, [rdx+r11] 
+  movdqa      xmm15,xmm1 
+  punpckldq   xmm0,[rsp+60h] 
+  punpcklqdq  xmm4,xmm0 
+  movdqa      [rsp+50h],xmm0 
+  mov         [rsp+50h],eax 
+  mov         eax, [rdx+rbx] 
+  movdqa      xmm0,[rsp+50h] 
+  mov         [rsp+60h],eax 
+  punpckldq   xmm0, [rsp+60h] 
+  punpcklqdq  xmm3,xmm0 
+  movdqa      xmm0,xmm4 
+  punpcklbw   xmm0,xmm3 
+  punpckhbw   xmm4,xmm3 
+  punpcklwd   xmm15,xmm0 
+  punpckhwd   xmm1,xmm0 
+  movdqa      xmm0,xmm5 
+  movdqa      xmm12,xmm15 
+  punpcklwd   xmm0,xmm4 
+  punpckhwd   xmm5,xmm4 
+  punpckldq   xmm12,xmm0 
+  punpckhdq   xmm15,xmm0 
+  movdqa      xmm0,xmm1 
+  movdqa      xmm11,xmm12 
+  punpckldq   xmm0,xmm5 
+  punpckhdq   xmm1,xmm5 
+  punpcklqdq  xmm11,xmm0 
+  punpckhqdq  xmm12,xmm0 
+  ; Low 16 bits of iAlpha, broadcast to 8 words in xmm13 below.
+  movsx       eax,r9w 
+  movdqa      xmm14,xmm15 
+  punpcklqdq  xmm14,xmm1 
+  punpckhqdq  xmm15,xmm1 
+  pxor        xmm1,xmm1 
+  movd        xmm0,eax 
+  movdqa      xmm4,xmm12 
+  movdqa      xmm8,xmm11 
+  mov         eax, ebp ; iBeta
+  punpcklwd   xmm0,xmm0 
+  ; Widen bytes to words against zero (xmm1):
+  ;   low  halves: xmm8 = p1, xmm4  = p0, xmm9  = q0, xmm7  = q1
+  ;   high halves: xmm11 = p1, xmm12 = p0, xmm14 = q0, xmm10 = q1
+  ; The packed p1 column is kept at [rsp]; packed q1 stays in xmm15.
+  punpcklbw   xmm4,xmm1 
+  punpckhbw   xmm12,xmm1 
+  movdqa      xmm9,xmm14 
+  movdqa      xmm7,xmm15 
+  movdqa      xmm10,xmm15 
+  pshufd      xmm13,xmm0,0 
+  punpcklbw   xmm9,xmm1 
+  punpckhbw   xmm14,xmm1 
+  movdqa      xmm6,xmm13 
+  movd        xmm0,eax 
+  movdqa      [rsp],xmm11 
+  ; Rounding constant 2 (broadcast into xmm3 further down).
+  mov         eax,2 
+  cwde             
+  punpckhbw   xmm11,xmm1 
+  punpckhbw   xmm10,xmm1 
+  punpcklbw   xmm7,xmm1 
+  punpcklwd   xmm0,xmm0 
+  punpcklbw   xmm8,xmm1 
+  pshufd      xmm3,xmm0,0 
+  ; Masks: xmm6 (low half) and xmm13 (high half) =
+  ;   (alpha > |p0-q0|) & (beta > |p1-p0|) & (beta > |q1-q0|)
+  movdqa      xmm1,xmm8 
+  movdqa      xmm0,xmm4 
+  psubw       xmm0,xmm9 
+  psubw       xmm1,xmm4 
+  movdqa      xmm2,xmm3 
+  pabsw       xmm0,xmm0 
+  pcmpgtw     xmm6,xmm0 
+  pabsw       xmm0,xmm1 
+  movdqa      xmm1,xmm3 
+  pcmpgtw     xmm2,xmm0 
+  pand        xmm6,xmm2 
+  movdqa      xmm0,xmm7 
+  movdqa      xmm2,xmm3 
+  psubw       xmm0,xmm9 
+  pabsw       xmm0,xmm0 
+  pcmpgtw     xmm1,xmm0 
+  pand        xmm6,xmm1 
+  movdqa      xmm0,xmm12 
+  movdqa      xmm1,xmm11 
+  psubw       xmm0,xmm14 
+  psubw       xmm1,xmm12 
+  movdqa      xmm5,xmm6 
+  pabsw       xmm0,xmm0 
+  pcmpgtw     xmm13,xmm0 
+  pabsw       xmm0,xmm1 
+  movdqa      xmm1,xmm8 
+  pcmpgtw     xmm2,xmm0 
+  ; Filtered p0 = (2*p1 + p0 + q1 + 2) >> 2, accumulated in xmm1 (low) and
+  ; xmm2 (high), interleaved with the remaining mask work.
+  paddw       xmm1,xmm8 
+  movdqa      xmm0,xmm10 
+  pand        xmm13,xmm2 
+  psubw       xmm0,xmm14 
+  paddw       xmm1,xmm4 
+  movdqa      xmm2,xmm11 
+  pabsw       xmm0,xmm0 
+  paddw       xmm2,xmm11 
+  paddw       xmm1,xmm7 
+  pcmpgtw     xmm3,xmm0 
+  paddw       xmm2,xmm12 
+  movd        xmm0,eax 
+  pand        xmm13,xmm3 
+  paddw       xmm2,xmm10 
+  punpcklwd   xmm0,xmm0 
+  pshufd      xmm3,xmm0,0 
+  movdqa      xmm0,xmm6 
+  paddw       xmm1,xmm3 
+  pandn       xmm0,xmm4 
+  paddw       xmm2,xmm3 
+  psraw       xmm1,2 
+  ; Blend: mask ? filtered : original, then pack back to bytes (xmm5 = p0').
+  pand        xmm5,xmm1 
+  por         xmm5,xmm0 
+  ; Filtered q0 = (2*q1 + q0 + p1 + 2) >> 2 in xmm7 (low) / xmm10 (high).
+  paddw       xmm7,xmm7 
+  paddw       xmm10,xmm10 
+  psraw       xmm2,2 
+  movdqa      xmm1,xmm13 
+  movdqa      xmm0,xmm13 
+  pandn       xmm0,xmm12 
+  pand        xmm1,xmm2 
+  paddw       xmm7,xmm9 
+  por         xmm1,xmm0 
+  paddw       xmm10,xmm14 
+  paddw       xmm7,xmm8 
+  movdqa      xmm0,xmm13 
+  packuswb    xmm5,xmm1 
+  paddw       xmm7,xmm3 
+  paddw       xmm10,xmm11 
+  movdqa      xmm1,xmm6 
+  paddw       xmm10,xmm3 
+  pandn       xmm6,xmm9 
+  psraw       xmm7,2 
+  pand        xmm1,xmm7 
+  psraw       xmm10,2 
+  pandn       xmm13,xmm14 
+  pand        xmm0,xmm10 
+  por         xmm1,xmm6 
+  ; Transpose back: columns are [rsp] (p1), xmm5 (p0'), xmm1 (q0'), xmm15 (q1).
+  movdqa      xmm6,[rsp] 
+  movdqa      xmm4,xmm6 
+  por         xmm0,xmm13 
+  punpcklbw   xmm4,xmm5 
+  punpckhbw   xmm6,xmm5 
+  movdqa      xmm3,xmm4 
+  packuswb    xmm1,xmm0 
+  movdqa      xmm0,xmm1 
+  punpckhbw   xmm1,xmm15 
+  punpcklbw   xmm0,xmm15 
+  punpcklwd   xmm3,xmm0 
+  punpckhwd   xmm4,xmm0 
+  movdqa      xmm0,xmm6 
+  movdqa      xmm2,xmm3 
+  punpcklwd   xmm0,xmm1 
+  punpckhwd   xmm6,xmm1 
+  movdqa      xmm1,xmm4 
+  punpckldq   xmm2,xmm0 
+  punpckhdq   xmm3,xmm0 
+  punpckldq   xmm1,xmm6 
+  movdqa      xmm0,xmm2 
+  punpcklqdq  xmm0,xmm1 
+  punpckhdq   xmm4,xmm6 
+  punpckhqdq  xmm2,xmm1 
+  ; Scatter: rows 0..3 are spilled to [rsp+10h], [rsp+60h], [rsp+20h],
+  ; [rsp+70h]; within each slot dword +0 goes to the rcx rows, +4 to the rdi
+  ; rows, +8 to the r11 rows and +0Ch to the rbx rows.
+  movdqa      [rsp+10h],xmm0 
+  movdqa      [rsp+60h],xmm2 
+  movdqa      xmm0,xmm3 
+  mov         eax,[rsp+10h] 
+  mov         [rcx-2],eax 
+  mov         eax,[rsp+60h] 
+  punpcklqdq  xmm0,xmm4 
+  punpckhqdq  xmm3,xmm4 
+  mov         [r10+rcx-2],eax 
+  movdqa      [rsp+20h],xmm0 
+  mov         eax, [rsp+20h] 
+  movdqa      [rsp+70h],xmm3 
+  mov         [rcx+r10*2-2],eax 
+  mov         eax,[rsp+70h] 
+  mov         [rdx+rcx-2],eax 
+  mov         eax,[rsp+18h] 
+  mov         [r11],eax 
+  mov         eax,[rsp+68h] 
+  mov         [r10+r11],eax 
+  mov         eax,[rsp+28h] 
+  mov         [r11+r10*2],eax 
+  mov         eax,[rsp+78h] 
+  mov         [rdx+r11],eax 
+  mov         eax,[rsp+14h] 
+  mov         [rdi-2],eax 
+  mov         eax,[rsp+64h] 
+  mov         [r10+rdi-2],eax 
+  mov         eax,[rsp+24h] 
+  mov         [rdi+r10*2-2],eax 
+  mov         eax, [rsp+74h] 
+  mov         [rdx+rdi-2],eax 
+  mov         eax, [rsp+1Ch] 
+  mov         [rbx],eax 
+  mov         eax, [rsp+6Ch] 
+  mov         [r10+rbx],eax 
+  mov         eax,[rsp+2Ch] 
+  mov         [rbx+r10*2],eax 
+  mov         eax,[rsp+7Ch] 
+  mov         [rdx+rbx],eax  
+  ; Epilogue.  rbx/rbp/r12 were saved with push, so they are restored with
+  ; pop only; no reload of rbx from a fixed stack offset is needed.
+  lea         r11,[rsp+140h] 
+  mov         rsp,r11
+  pop         r12
+  pop         rbp
+  pop         rbx
+  ret
+
+
+;***************************************************************************
+;  DeblockChromaLt4H_sse2, 64-bit variant.
+;
+;  Six arguments are read from rdi, rsi, rdx, rcx, r8, r9.  As used below
+;  these are: two plane pointers, the row stride, alpha, iBeta and pTC
+;  (presumably the same parameter list as the 32-bit
+;  DeblockChromaLt4V_sse2 prototype -- TODO confirm against the header).
+;
+;  For each pointer, 8 rows of 4 bytes starting at ptr-2 are gathered; call
+;  the four bytes of one row p1 p0 q0 q1.  With tc taken from pTC[0..3]
+;  (each value covering two consecutive rows):
+;      delta = clamp(((q0-p0)*4 + (p1-q1) + 4) >> 3, -tc, tc)
+;  applied as p0 += delta, q0 -= delta only where
+;      alpha > |p0-q0|, iBeta > |p1-p0|, iBeta > |q1-q0| and tc > 0.
+;  All 4 bytes of every row are stored back.
+;
+;  NOTE(review): pabsw is an SSSE3 instruction, so despite the _sse2 suffix
+;  this routine requires SSSE3 -- confirm the caller's CPU-feature check.
+;***************************************************************************
+WELS_EXTERN DeblockChromaLt4H_sse2
+ALIGN  16
+DeblockChromaLt4H_sse2:
+  ; rax is overwritten (lea eax,...) before it is ever read.
+  mov         rax,rsp 
+  push        rbx  
+  push        rbp  
+  push        r12  
+  push        r13
+  push        r14
+  ; Five pushes + 170h keep rsp 16-byte aligned for the movdqa stack
+  ; accesses (assuming rsp = 8 mod 16 at entry, i.e. directly after a call).
+  sub         rsp,170h  
+  
+  ; Re-map the incoming arguments:
+  ;   r13 = iBeta, r14 = pTC, r8 = stride, r9 = alpha,
+  ;   rdx = first pointer argument, rcx = second pointer argument.
+  ; Both planes go through the same alpha/beta/tc processing below.
+  mov         r13,   r8
+  mov         r14,   r9
+  mov         r8,    rdx
+  mov         r9,    rcx
+  mov         rdx,   rdi
+  mov         rcx,   rsi
+
+  ; rsi = stride (sign-extended), r10 = 4*stride for now, r11d = alpha,
+  ; r12 / rdi keep copies of the rdx / rcx plane pointers for the stores.
+  movsxd      rsi,r8d 
+  lea         eax,[r8*4] 
+  mov         r11d,r9d 
+  movsxd      r10,eax 
+  mov         eax, [rcx-2] 
+  mov         r12,rdx 
+  ; Gather: each 4-byte row is bounced through a 16-byte stack slot so it can
+  ; be picked up with movdqa/punpckldq; only the low dword of a slot is
+  ; meaningful.  rbx / rbp address row 4 of the rcx / rdx planes.
+  mov         [rsp+40h],eax 
+  mov         eax, [rsi+rcx-2] 
+  lea         rbx,[r10+rcx-2] 
+  movdqa      xmm5,[rsp+40h] 
+  mov         [rsp+50h],eax 
+  mov         eax, [rcx+rsi*2-2] 
+  lea         rbp,[r10+rdx-2] 
+  movdqa      xmm2, [rsp+50h] 
+  mov         [rsp+60h],eax 
+  ; From here r10 = 3*stride.
+  lea         r10,[rsi+rsi*2] 
+  mov         rdi,rcx 
+  mov         eax,[r10+rcx-2] 
+  movdqa      xmm4,[rsp+60h] 
+  mov         [rsp+70h],eax 
+  mov         eax,[rdx-2] 
+  mov         [rsp+80h],eax 
+  mov         eax, [rsi+rdx-2] 
+  movdqa      xmm3,[rsp+70h] 
+  mov         [rsp+90h],eax 
+  mov         eax,[rdx+rsi*2-2] 
+  punpckldq   xmm5,[rsp+80h] 
+  mov         [rsp+0A0h],eax 
+  mov         eax, [r10+rdx-2] 
+  punpckldq   xmm2,[rsp+90h] 
+  mov         [rsp+0B0h],eax 
+  mov         eax, [rbx] 
+  punpckldq   xmm4,[rsp+0A0h] 
+  mov         [rsp+80h],eax 
+  mov         eax,[rbp] 
+  punpckldq   xmm3,[rsp+0B0h] 
+  mov         [rsp+90h],eax 
+  mov         eax,[rsi+rbx] 
+  movdqa      xmm0,[rsp+80h] 
+  punpckldq   xmm0,[rsp+90h] 
+  punpcklqdq  xmm5,xmm0 
+  movdqa      [rsp+80h],xmm0 
+  mov         [rsp+80h],eax 
+  mov         eax,[rsi+rbp] 
+  movdqa      xmm0,[rsp+80h] 
+  movdqa      xmm1,xmm5 
+  mov         [rsp+90h],eax 
+  mov         eax,[rbx+rsi*2] 
+  punpckldq   xmm0,[rsp+90h] 
+  punpcklqdq  xmm2,xmm0 
+  ; Transpose (byte -> word -> dword -> qword interleaves) so that each
+  ; register holds one byte column for all 16 rows:
+  ;   xmm9 = p1, xmm6 = p0, xmm2 = q0, xmm7 = q1
+  punpcklbw   xmm1,xmm2 
+  punpckhbw   xmm5,xmm2 
+  movdqa      [rsp+80h],xmm0 
+  mov         [rsp+80h],eax 
+  mov         eax,[rbp+rsi*2] 
+  movdqa      xmm0, [rsp+80h] 
+  mov         [rsp+90h],eax 
+  mov         eax,[r10+rbx] 
+  movdqa      xmm7,xmm1 
+  punpckldq   xmm0,[rsp+90h] 
+  punpcklqdq  xmm4,xmm0 
+  movdqa      [rsp+80h],xmm0 
+  mov         [rsp+80h],eax 
+  mov         eax, [r10+rbp] 
+  movdqa      xmm0,[rsp+80h] 
+  mov         [rsp+90h],eax 
+  punpckldq   xmm0,[rsp+90h] 
+  punpcklqdq  xmm3,xmm0 
+  movdqa      xmm0,xmm4 
+  punpcklbw   xmm0,xmm3 
+  punpckhbw   xmm4,xmm3 
+  punpcklwd   xmm7,xmm0 
+  punpckhwd   xmm1,xmm0 
+  movdqa      xmm0,xmm5 
+  movdqa      xmm6,xmm7 
+  punpcklwd   xmm0,xmm4 
+  punpckhwd   xmm5,xmm4 
+  punpckldq   xmm6,xmm0 
+  punpckhdq   xmm7,xmm0 
+  movdqa      xmm0,xmm1 
+  punpckldq   xmm0,xmm5 
+  mov         rax, r14    ; pTC
+  punpckhdq   xmm1,xmm5 
+  movdqa      xmm9,xmm6 
+  punpckhqdq  xmm6,xmm0 
+  punpcklqdq  xmm9,xmm0 
+  movdqa      xmm2,xmm7 
+  movdqa      xmm13,xmm6 
+  movdqa      xmm4,xmm9 
+  ; Packed p1 column is kept at [rsp+10h] for the transpose back.
+  movdqa      [rsp+10h],xmm9 
+  punpcklqdq  xmm2,xmm1 
+  punpckhqdq  xmm7,xmm1 
+  pxor        xmm1,xmm1 
+  ; Sign-extended tc bytes: ecx = pTC[3], edx = pTC[2], r8d = pTC[1],
+  ; r9d = pTC[0].
+  movsx       ecx,byte [rax+3] 
+  movsx       edx,byte [rax+2] 
+  movsx       r8d,byte [rax+1] 
+  movsx       r9d,byte [rax] 
+  movdqa      xmm10,xmm1 
+  movdqa      xmm15,xmm2 
+  ; Widen bytes to words against zero (xmm1):
+  ;   low  halves: xmm4 = p1, xmm13 = p0, xmm15 = q0, xmm3 = q1
+  ;   high halves: xmm9 = p1, xmm6  = p0, xmm2  = q0, xmm8 = q1
+  punpckhbw   xmm2,xmm1 
+  punpckhbw   xmm6,xmm1 
+  punpcklbw   xmm4,xmm1 
+  ; Low 16 bits of alpha, broadcast to 8 words in xmm12 below.
+  movsx       eax,r11w 
+  ; Build the word vector {tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3} at [rsp].
+  mov         word [rsp+0Eh],cx 
+  mov         word [rsp+0Ch],cx 
+  movdqa      xmm3,xmm7 
+  movdqa      xmm8,xmm7 
+  ; Packed q1 column is kept at [rsp+20h] for the transpose back.
+  movdqa      [rsp+20h],xmm7 
+  punpcklbw   xmm15,xmm1 
+  punpcklbw   xmm13,xmm1 
+  punpcklbw   xmm3,xmm1 
+  mov         word [rsp+0Ah],dx 
+  mov         word [rsp+8],dx 
+  mov         word [rsp+6],r8w 
+  movd        xmm0,eax 
+  ; High-half p0 words are parked at [rsp+30h].
+  movdqa      [rsp+30h],xmm6 
+  punpckhbw   xmm9,xmm1 
+  punpckhbw   xmm8,xmm1 
+  punpcklwd   xmm0,xmm0 
+  mov         eax, r13d   ; iBeta
+  mov         word [rsp+4],r8w 
+  mov         word [rsp+2],r9w 
+  pshufd      xmm12,xmm0,0 
+  mov         word [rsp],r9w 
+  movd        xmm0,eax 
+  ; Rounding constant 4 (broadcast into xmm5 below).
+  mov         eax,4 
+  cwde             
+  ; xmm14 = tc vector; the [rsp] slot is then reused for high-half q0 words.
+  movdqa      xmm14, [rsp] 
+  movdqa      [rsp],xmm2 
+  movdqa      xmm2,xmm12 
+  punpcklwd   xmm0,xmm0 
+  ; xmm11 = iBeta broadcast; xmm10 = 0 - tc.
+  pshufd      xmm11,xmm0,0 
+  psubw       xmm10,xmm14 
+  movd        xmm0,eax 
+  movdqa      xmm7,xmm14 
+  movdqa      xmm6,xmm14 
+  ; xmm7 = (tc > 0) mask, shared by both halves.
+  pcmpgtw     xmm7,xmm1 
+  punpcklwd   xmm0,xmm0 
+  pshufd      xmm5,xmm0,0 
+  ; ---- low half: delta in xmm6, mask in xmm2 ----
+  movdqa      xmm0,xmm4 
+  movdqa      xmm1,xmm15 
+  psubw       xmm4,xmm13 
+  psubw       xmm0,xmm3 
+  psubw       xmm1,xmm13 
+  psubw       xmm3,xmm15 
+  psllw       xmm1,2 
+  paddw       xmm1,xmm0 
+  paddw       xmm1,xmm5 
+  movdqa      xmm0,xmm10 
+  psraw       xmm1,3 
+  ; clamp to [-tc, tc]
+  pmaxsw      xmm0,xmm1 
+  pminsw      xmm6,xmm0 
+  movdqa      xmm1,xmm11 
+  movdqa      xmm0,xmm13 
+  psubw       xmm0,xmm15 
+  pabsw       xmm0,xmm0 
+  pcmpgtw     xmm2,xmm0 
+  pabsw       xmm0,xmm4 
+  pcmpgtw     xmm1,xmm0 
+  pabsw       xmm0,xmm3 
+  pand        xmm2,xmm1 
+  movdqa      xmm1,xmm11 
+  movdqa      xmm3,[rsp+30h] 
+  pcmpgtw     xmm1,xmm0 
+  movdqa      xmm0,xmm9 
+  pand        xmm2,xmm1 
+  psubw       xmm0,xmm8 
+  psubw       xmm9,xmm3 
+  pand        xmm2,xmm7 
+  pand        xmm6,xmm2 
+  ; q0 -= delta, p0 += delta (low half)
+  psubw       xmm15,xmm6 
+  paddw       xmm13,xmm6 
+  ; ---- high half: xmm3 = p0, xmm2 = q0, delta in xmm14, mask in xmm12 ----
+  movdqa      xmm2,[rsp] 
+  movdqa      xmm1,xmm2 
+  psubw       xmm1,xmm3 
+  psubw       xmm8,xmm2 
+  psllw       xmm1,2 
+  paddw       xmm1,xmm0 
+  paddw       xmm1,xmm5 
+  movdqa      xmm0,xmm3 
+  movdqa      xmm5,[rsp+10h] 
+  psubw       xmm0,xmm2 
+  psraw       xmm1,3 
+  movdqa      xmm4,xmm5 
+  pabsw       xmm0,xmm0 
+  pmaxsw      xmm10,xmm1 
+  movdqa      xmm1,xmm11 
+  pcmpgtw     xmm12,xmm0 
+  pabsw       xmm0,xmm9 
+  pminsw      xmm14,xmm10 
+  pcmpgtw     xmm1,xmm0 
+  pabsw       xmm0,xmm8 
+  pcmpgtw     xmm11,xmm0 
+  pand        xmm12,xmm1 
+  movdqa      xmm1,[rsp+20h] 
+  pand        xmm12,xmm11 
+  pand        xmm12,xmm7 
+  pand        xmm14,xmm12 
+  paddw       xmm3,xmm14 
+  psubw       xmm2,xmm14 
+  ; Pack to bytes with unsigned saturation: xmm13 = p0', xmm15 = q0'.
+  packuswb    xmm13,xmm3 
+  packuswb    xmm15,xmm2 
+  ; Transpose back: columns are xmm4/xmm5 (p1), xmm13 (p0'), xmm15 (q0'),
+  ; xmm1 (q1).
+  punpcklbw   xmm4,xmm13 
+  punpckhbw   xmm5,xmm13 
+  movdqa      xmm0,xmm15 
+  punpcklbw   xmm0,xmm1 
+  punpckhbw   xmm15,xmm1 
+  movdqa      xmm3,xmm4 
+  punpcklwd   xmm3,xmm0 
+  punpckhwd   xmm4,xmm0 
+  movdqa      xmm0,xmm5 
+  movdqa      xmm2,xmm3 
+  movdqa      xmm1,xmm4 
+  punpcklwd   xmm0,xmm15 
+  punpckhwd   xmm5,xmm15 
+  punpckldq   xmm2,xmm0 
+  punpckhdq   xmm3,xmm0 
+  punpckldq   xmm1,xmm5 
+  movdqa      xmm0,xmm2 
+  punpcklqdq  xmm0,xmm1 
+  punpckhdq   xmm4,xmm5 
+  punpckhqdq  xmm2,xmm1 
+  ; Scatter: rows 0..3 are spilled to [rsp+40h], [rsp+90h], [rsp+50h],
+  ; [rsp+0A0h]; within each slot dword +0 goes to the rdi rows, +4 to the
+  ; r12 rows, +8 to the rbx rows and +0Ch to the rbp rows.
+  movdqa      [rsp+40h],xmm0 
+  movdqa      xmm0,xmm3 
+  movdqa      [rsp+90h],xmm2 
+  mov         eax,[rsp+40h] 
+  mov         [rdi-2],eax 
+  mov         eax, [rsp+90h] 
+  punpcklqdq  xmm0,xmm4 
+  punpckhqdq  xmm3,xmm4 
+  mov         [rsi+rdi-2],eax 
+  movdqa      [rsp+50h],xmm0 
+  mov         eax,[rsp+50h] 
+  movdqa      [rsp+0A0h],xmm3 
+  mov         [rdi+rsi*2-2],eax 
+  mov         eax,[rsp+0A0h] 
+  mov         [r10+rdi-2],eax 
+  mov         eax,[rsp+48h] 
+  mov         [rbx],eax 
+  mov         eax,[rsp+98h] 
+  mov         [rsi+rbx],eax 
+  mov         eax,[rsp+58h] 
+  mov         [rbx+rsi*2],eax 
+  mov         eax, [rsp+0A8h] 
+  mov         [r10+rbx],eax 
+  mov         eax, [rsp+44h] 
+  mov         [r12-2],eax 
+  mov         eax,[rsp+94h] 
+  mov         [rsi+r12-2],eax 
+  mov         eax,[rsp+54h] 
+  mov         [r12+rsi*2-2],eax 
+  mov         eax, [rsp+0A4h] 
+  mov         [r10+r12-2],eax 
+  mov         eax,[rsp+4Ch] 
+  mov         [rbp],eax 
+  mov         eax,[rsp+9Ch] 
+  mov         [rsi+rbp],eax 
+  mov         eax, [rsp+5Ch] 
+  mov         [rbp+rsi*2],eax 
+  mov         eax,[rsp+0ACh] 
+  mov         [r10+rbp],eax   
+  ; Epilogue: release the frame and restore the pushed registers.
+  lea         r11,[rsp+170h]    
+  mov         rsp,r11 
+  pop         r14
+  pop         r13
+  pop         r12  
+  pop         rbp  
+  pop         rbx  
+  ret 
+
+
+
+%elifdef  X86_32
+
+;********************************************************************************
+;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                             int32_t iAlpha, int32_t iBeta)
+;
+;  32-bit variant, arguments read from the stack at [ebp+8] .. [ebp+18h].
+;  For each plane, 8 bytes are read from the rows at -2*iStride, -iStride,
+;  0 and +iStride relative to the pointer; call them p1, p0, q0, q1.  Where
+;      iAlpha > |p0-q0|  and  iBeta > |p1-p0|  and  iBeta > |q1-q0|
+;  the two middle rows are replaced by
+;      p0' = (2*p1 + p0 + q1 + 2) >> 2
+;      q0' = (2*q1 + q0 + p1 + 2) >> 2
+;  Only the p0 and q0 rows are written.
+;
+;  NOTE(review): pabsw is an SSSE3 instruction, so despite the _sse2 suffix
+;  this routine requires SSSE3 -- confirm the caller's CPU-feature check.
+;********************************************************************************
+WELS_EXTERN   DeblockChromaEq4V_sse2
+
+ALIGN  16
+DeblockChromaEq4V_sse2:
+  push        ebp
+  mov         ebp,esp
+  ; Align the frame; 68h plus the two pushes below (8 bytes) leaves esp
+  ; 16-byte aligned for the movdqa spills.
+  and         esp,0FFFFFFF0h
+  sub         esp,68h
+  mov         edx,[ebp+10h]      ;  iStride
+  mov         eax,[ebp+8]        ;  pPixCb
+  mov         ecx,[ebp+0Ch]      ;  pPixCr
+  movq        xmm4,[ecx]
+  movq        xmm5,[edx+ecx]
+  push        esi
+  push        edi
+  ; Load the four rows; each register is { Cb row (low 8 bytes), Cr row (high) }:
+  ;   xmm1 = p1, xmm2 = p0, xmm3 = q0, xmm4 = q1
+  lea         esi,[edx+edx]
+  mov         edi,eax
+  sub         edi,esi
+  movq        xmm1,[edi]
+  mov         edi,ecx
+  sub         edi,esi
+  movq        xmm2,[edi]
+  punpcklqdq  xmm1,xmm2
+  ; esi = pPixCb - iStride and edi = pPixCr - iStride stay live until the
+  ; p0 stores at the end.
+  mov         esi,eax
+  sub         esi,edx
+  movq        xmm2,[esi]
+  mov         edi,ecx
+  sub         edi,edx
+  movq        xmm3,[edi]
+  punpcklqdq  xmm2,xmm3
+  movq        xmm3,[eax]
+  punpcklqdq  xmm3,xmm4
+  movq        xmm4,[edx+eax]
+  ; edx = iAlpha, then iBeta; the low 16 bits of each are broadcast to
+  ; 8 words (xmm5 = alpha, xmm6 = beta).
+  mov       edx, [ebp + 14h]
+  punpcklqdq  xmm4,xmm5
+  movd        xmm5,edx
+  mov       edx, [ebp + 18h]
+  pxor        xmm0,xmm0
+  movdqa      xmm6,xmm5
+  punpcklwd   xmm6,xmm5
+  pshufd      xmm5,xmm6,0
+  movd        xmm6,edx
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  ; Widen bytes to words against zero (xmm0) and spill:
+  ;   low  (Cb): [esp+60h] = p1, [esp+10h] = p0, [esp+50h] = q0, xmm7 = q1
+  ;   high (Cr): [esp+40h] = p1, xmm2 = p0, [esp+30h] = q0, [esp+20h] = q1
+  movdqa      xmm7,xmm1
+  punpckhbw   xmm1,xmm0
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+40h],xmm1
+  movdqa      [esp+60h],xmm7
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+10h],xmm7
+  movdqa      xmm7,xmm3
+  punpcklbw   xmm7,xmm0
+  punpckhbw   xmm3,xmm0
+  movdqa      [esp+50h],xmm7
+  movdqa      xmm7,xmm4
+  punpckhbw   xmm4,xmm0
+  punpckhbw   xmm2,xmm0
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+30h],xmm3
+  ; Low-half mask in xmm0 (xmm3 = p0 low from here on).
+  movdqa      xmm3,[esp+10h]
+  movdqa      xmm1,xmm3
+  psubw       xmm1,[esp+50h]
+  pabsw       xmm1,xmm1
+  movdqa      [esp+20h],xmm4
+  movdqa      xmm0,xmm5
+  pcmpgtw     xmm0,xmm1
+  movdqa      xmm1,[esp+60h]
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  pand        xmm0,xmm4
+  movdqa      xmm1,xmm7
+  psubw       xmm1,[esp+50h]
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  ; High-half mask in xmm5 (consumes the alpha and beta vectors).
+  movdqa      xmm1,xmm2
+  psubw       xmm1,[esp+30h]
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm5,xmm1
+  movdqa      xmm1,[esp+40h]
+  pand        xmm0,xmm4
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  movdqa      xmm1,[esp+20h]
+  psubw       xmm1,[esp+30h]
+  pand        xmm5,xmm4
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm6,xmm1
+  pand        xmm5,xmm6
+  ; Rounding constant 2, broadcast to 8 words; it replaces p0-low in the
+  ; [esp+10h] slot (p0 low is still held in xmm3).
+  mov         edx,2
+  movsx       edx,dx
+  movd        xmm1,edx
+  movdqa      xmm4,xmm1
+  punpcklwd   xmm4,xmm1
+  pshufd      xmm1,xmm4,0
+  ; p0' low = (2*p1 + p0 + q1 + 2) >> 2, blended with the original via xmm0.
+  movdqa      xmm4,[esp+60h]
+  movdqa      xmm6,xmm4
+  paddw       xmm6,xmm4
+  paddw       xmm6,xmm3
+  paddw       xmm6,xmm7
+  movdqa      [esp+10h],xmm1
+  paddw       xmm6,[esp+10h]
+  psraw       xmm6,2
+  movdqa      xmm4,xmm0
+  pandn       xmm4,xmm3
+  movdqa      xmm3,[esp+40h]
+  movdqa      xmm1,xmm0
+  pand        xmm1,xmm6
+  por         xmm1,xmm4
+  ; p0' high, blended via xmm5; xmm3 becomes the constant 2 from here on.
+  movdqa      xmm6,xmm3
+  paddw       xmm6,xmm3
+  movdqa      xmm3,[esp+10h]
+  paddw       xmm6,xmm2
+  paddw       xmm6,[esp+20h]
+  paddw       xmm6,xmm3
+  psraw       xmm6,2
+  movdqa      xmm4,xmm5
+  pand        xmm4,xmm6
+  movdqa      xmm6,xmm5
+  pandn       xmm6,xmm2
+  por         xmm4,xmm6
+  ; xmm1 = packed p0' { Cb, Cr }.
+  packuswb    xmm1,xmm4
+  ; q0' low = (2*q1 + q0 + p1 + 2) >> 2.
+  movdqa      xmm4,[esp+50h]
+  movdqa      xmm6,xmm7
+  paddw       xmm6,xmm7
+  paddw       xmm6,xmm4
+  paddw       xmm6,[esp+60h]
+  paddw       xmm6,xmm3
+  psraw       xmm6,2
+  movdqa      xmm2,xmm0
+  pand        xmm2,xmm6
+  pandn       xmm0,xmm4
+  por         xmm2,xmm0
+  ; q0' high.
+  movdqa      xmm0,[esp+20h]
+  movdqa      xmm6,xmm0
+  paddw       xmm6,xmm0
+  movdqa      xmm0,[esp+30h]
+  paddw       xmm6,xmm0
+  paddw       xmm6,[esp+40h]
+  movdqa      xmm4,xmm5
+  paddw       xmm6,xmm3
+  ; Store p0' for Cb at pPixCb - iStride.
+  movq        [esi],xmm1
+  psraw       xmm6,2
+  pand        xmm4,xmm6
+  pandn       xmm5,xmm0
+  por         xmm4,xmm5
+  ; xmm2 = packed q0' { Cb, Cr }; store the remaining three rows.
+  packuswb    xmm2,xmm4
+  movq        [eax],xmm2
+  psrldq      xmm1,8
+  movq        [edi],xmm1
+  pop         edi
+  psrldq      xmm2,8
+  movq        [ecx],xmm2
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+;******************************************************************************
+; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;
+;  32-bit variant, arguments read from the stack at [ebp+8] .. [ebp+1Ch].
+;  For each plane, 8 bytes are read from the rows at -2*iStride, -iStride,
+;  0 and +iStride relative to the pointer; call them p1, p0, q0, q1.  With
+;  tc taken from pTC[0..3] (each value covering two adjacent pixels):
+;      delta = clamp(((q0-p0)*4 + (p1-q1) + 4) >> 3, -tc, tc)
+;  applied as p0 += delta, q0 -= delta only where
+;      iAlpha > |p0-q0|, iBeta > |p1-p0|, iBeta > |q1-q0| and tc > 0.
+;  Only the p0 and q0 rows are written.
+;
+;  NOTE(review): pabsw is an SSSE3 instruction, so despite the _sse2 suffix
+;  this routine requires SSSE3 -- confirm the caller's CPU-feature check.
+;*******************************************************************************
+
+WELS_EXTERN  DeblockChromaLt4V_sse2
+
+ALIGN  16
+DeblockChromaLt4V_sse2:
+  push        ebp
+  mov         ebp,esp
+  ; Align the frame; 0E4h plus the three pushes below (12 bytes) leaves esp
+  ; 16-byte aligned for the movdqa spills.
+  and         esp,0FFFFFFF0h
+  sub         esp,0E4h
+  push        ebx
+  push        esi
+  mov         esi, [ebp+1Ch]      ;  pTC
+  ; Sign-extend the four tc bytes: di = pTC[3], [esp+0Ch] = pTC[2],
+  ; bx = pTC[1], [esp+0Eh] = pTC[0].
+  movsx       ebx, byte [esi+2]
+  push        edi
+  movsx       di,byte [esi+3]
+  mov         word [esp+0Ch],bx
+  movsx       bx,byte  [esi+1]
+  movsx       esi,byte  [esi]
+  mov         word  [esp+0Eh],si
+  ; Each tc word is moved into two xmm registers; the punpcklwd ladder below
+  ; interleaves them into xmm0 = {tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3}.
+  movzx       esi,di
+  movd        xmm1,esi
+  movzx       esi,di
+  movd        xmm2,esi
+  mov         si,word  [esp+0Ch]
+  mov         edx, [ebp + 10h]
+  mov         eax, [ebp + 08h]
+  movzx       edi,si
+  movzx       esi,si
+  mov         ecx, [ebp + 0Ch]
+  movd        xmm4,esi
+  movzx       esi,bx
+  movd        xmm5,esi
+  movd        xmm3,edi
+  movzx       esi,bx
+  movd        xmm6,esi
+  mov         si,word [esp+0Eh]
+  movzx       edi,si
+  movzx       esi,si
+  punpcklwd   xmm6,xmm2
+  pxor        xmm0,xmm0
+  ; [esp+40h] = zero vector (reloaded into xmm1 below).
+  movdqa      [esp+40h],xmm0
+  movd        xmm7,edi
+  movd        xmm0,esi
+  ; eax = pPixCb, ecx = pPixCr, edx = iStride, esi = 2*iStride.
+  lea         esi,[edx+edx]
+  mov         edi,eax
+  sub         edi,esi
+  punpcklwd   xmm5,xmm1
+  movdqa      xmm1,[esp+40h]
+  punpcklwd   xmm0,xmm4
+  movq        xmm4,[edx+ecx]
+  punpcklwd   xmm7,xmm3
+  movq        xmm3,[eax]
+  punpcklwd   xmm0,xmm6
+  movq        xmm6,[edi]
+  punpcklwd   xmm7,xmm5
+  punpcklwd   xmm0,xmm7
+  mov         edi,ecx
+  sub         edi,esi
+  ; [esp+60h] = 0 - tc (lower clamp bound).
+  movdqa      xmm2,xmm1
+  psubw       xmm2,xmm0
+  movdqa      [esp+60h],xmm2
+  ; Load the rows as { Cb (low 8 bytes), Cr (high) }:
+  ;   xmm6 = p1, xmm7 = p0, xmm3 = q0, [esp+0E0h] = q1
+  movq        xmm2, [edi]
+  punpcklqdq  xmm6,xmm2
+  ; esi = pPixCb - iStride and edi = pPixCr - iStride stay live until the
+  ; p0 stores at the end.
+  mov         esi,eax
+  sub         esi,edx
+  movq        xmm7,[esi]
+  mov         edi,ecx
+  sub         edi,edx
+  movq        xmm2,[edi]
+  punpcklqdq  xmm7,xmm2
+  movq        xmm2,[ecx]
+  punpcklqdq  xmm3,xmm2
+  movq        xmm2,[edx+eax]
+  ; Low 16 bits of iAlpha / iBeta, broadcast to 8 words:
+  ;   xmm4 = alpha, [esp+50h] = beta.
+  movsx       edx,word [ebp + 14h]
+  punpcklqdq  xmm2,xmm4
+  movdqa      [esp+0E0h],xmm2
+  movd        xmm2,edx
+  movsx       edx,word [ebp + 18h]
+  movdqa      xmm4,xmm2
+  punpcklwd   xmm4,xmm2
+  movd        xmm2,edx
+  movdqa      xmm5,xmm2
+  punpcklwd   xmm5,xmm2
+  pshufd      xmm2,xmm5,0
+  movdqa      [esp+50h],xmm2
+  ; Widen bytes to words against zero (xmm1):
+  ;   low  (Cb): [esp+30h] = p1, xmm2 = p0, xmm3 = q0, xmm5 = q1
+  ;   high (Cr): [esp+80h] = p1, [esp+0A0h] = p0, [esp+70h] = q0, [esp+90h] = q1
+  movdqa      xmm2,xmm6
+  punpcklbw   xmm2,xmm1
+  movdqa      [esp+0D0h],xmm3
+  pshufd      xmm4,xmm4,0
+  movdqa      [esp+30h],xmm2
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+80h],xmm6
+  movdqa      xmm6,[esp+0D0h]
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+70h],xmm6
+  movdqa      xmm6, [esp+0E0h]
+  punpckhbw   xmm6,xmm1
+  movdqa     [esp+90h],xmm6
+  movdqa      xmm5, [esp+0E0h]
+  movdqa      xmm2,xmm7
+  punpckhbw   xmm7,xmm1
+  punpcklbw   xmm5,xmm1
+  movdqa       [esp+0A0h],xmm7
+  punpcklbw   xmm3,xmm1
+  ; Rounding constant 4, broadcast to 8 words and kept at [esp+20h].
+  mov         edx,4
+  punpcklbw   xmm2,xmm1
+  movsx       edx,dx
+  movd        xmm6,edx
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      xmm7,[esp+30h]
+  movdqa      [esp+20h],xmm6
+  psubw       xmm7,xmm5
+  ; [esp+40h] = (tc > 0) mask, shared by both halves.
+  movdqa      xmm6,xmm0
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1,[esp+60h]
+  movdqa      [esp+40h],xmm6
+  ; ---- low half: delta = ((q0-p0)*4 + (p1-q1) + 4) >> 3 ----
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm2
+  psllw       xmm6,2
+  paddw       xmm6,xmm7
+  paddw       xmm6, [esp+20h]
+  movdqa      xmm7, [esp+50h]
+  psraw       xmm6,3
+  ; clamp to [-tc, tc]; the clamped delta is kept at [esp+10h].
+  pmaxsw      xmm1,xmm6
+  movdqa      [esp+10h],xmm0
+  movdqa      xmm6, [esp+10h]
+  pminsw      xmm6,xmm1
+  movdqa      [esp+10h],xmm6
+  ; Low-half mask in xmm6.
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  movdqa      xmm6,xmm4
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1, [esp+30h]
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm7,xmm1
+  movdqa      xmm1,[esp+50h]
+  pand        xmm6,xmm7
+  movdqa      xmm7,[esp+50h]
+  psubw       xmm5,xmm3
+  pabsw       xmm5,xmm5
+  pcmpgtw     xmm1,xmm5
+  movdqa      xmm5,[esp+80h]
+  psubw       xmm5,[esp+90h]
+  pand        xmm6,xmm1
+  pand        xmm6,[esp+40h]
+  movdqa      xmm1,[esp+10h]
+  pand        xmm1,xmm6
+  ; Masked low-half delta replaces p1-low in the [esp+30h] slot.
+  movdqa      xmm6,[esp+70h]
+  movdqa      [esp+30h],xmm1
+  ; ---- high half: xmm1 = p0, xmm5 = q0 (after reload), delta in xmm0 ----
+  movdqa      xmm1,[esp+0A0h]
+  psubw       xmm6,xmm1
+  psllw       xmm6,2
+  paddw       xmm6,xmm5
+  paddw       xmm6,[esp+20h]
+  movdqa      xmm5,[esp+60h]
+  psraw       xmm6,3
+  pmaxsw      xmm5,xmm6
+  pminsw      xmm0,xmm5
+  movdqa      xmm5,[esp+70h]
+  movdqa      xmm6,xmm1
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm4,xmm6
+  movdqa      xmm6,[esp+80h]
+  psubw       xmm6,xmm1
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+90h]
+  pand        xmm4,xmm7
+  movdqa      xmm7,[esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  pand        xmm4,xmm7
+  pand        xmm4,[esp+40h]
+  pand        xmm0,xmm4
+  ; p0 += delta, pack { Cb, Cr } and store the Cb half.
+  movdqa      xmm4,[esp+30h]
+  paddw       xmm2,xmm4
+  paddw       xmm1,xmm0
+  packuswb    xmm2,xmm1
+  movq        [esi],xmm2
+  ; q0 -= delta, pack and store; then the Cr halves of both rows.
+  psubw       xmm3,xmm4
+  psubw       xmm5,xmm0
+  packuswb    xmm3,xmm5
+  movq        [eax],xmm3
+  psrldq      xmm2,8
+  movq        [edi],xmm2
+  pop         edi
+  pop         esi
+  psrldq      xmm3,8
+  movq        [ecx],xmm3
+  pop         ebx
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+;***************************************************************************
+;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;          int32_t iAlpha, int32_t iBeta)
+;***************************************************************************
+
+WELS_EXTERN     DeblockChromaEq4H_sse2
+
+ALIGN  16
+
+DeblockChromaEq4H_sse2:
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,0C8h
+  mov         ecx,dword [ebp+8]
+  mov         edx,dword [ebp+0Ch]
+  mov         eax,dword [ebp+10h]
+  sub         ecx,2
+  sub         edx,2
+  push        esi
+  lea         esi,[eax+eax*2]
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+4],edx
+  lea         ecx,[ecx+eax*4]
+  lea         edx,[edx+eax*4]
+  lea         eax,[esp+7Ch]
+  push        edi
+  mov         dword [esp+14h],esi
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+0Ch],edx
+  mov         dword [esp+10h],eax
+  mov         esi,dword [esp+1Ch]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+14h]
+  movd        xmm0,dword [esi]
+  movd        xmm1,dword [esi+ecx]
+  movd        xmm2,dword [esi+ecx*2]
+  movd        xmm3,dword [esi+edx]
+  mov         esi,dword  [esp+8]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [esi+ecx]
+  movd        xmm6,dword [esi+ecx*2]
+  movd        xmm7,dword [esi+edx]
+  punpckldq   xmm0,xmm4
+  punpckldq   xmm1,xmm5
+  punpckldq   xmm2,xmm6
+  punpckldq   xmm3,xmm7
+  mov         esi,dword [esp+18h]
+  mov         edi,dword [esp+0Ch]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [edi]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm0,xmm4
+  movd        xmm4,dword [esi+ecx]
+  movd        xmm5,dword [edi+ecx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm1,xmm4
+  movd        xmm4,dword [esi+ecx*2]
+  movd        xmm5,dword [edi+ecx*2]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm2,xmm4
+  movd        xmm4,dword [esi+edx]
+  movd        xmm5,dword [edi+edx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm3,xmm4
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         edi,dword [esp+10h]
+  movdqa      [edi],xmm0
+  movdqa      [edi+10h],xmm5
+  movdqa      [edi+20h],xmm1
+  movdqa      [edi+30h],xmm6
+  movsx       ecx,word [ebp+14h]
+  movsx       edx,word [ebp+18h]
+  movdqa      xmm6,[esp+80h]
+  movdqa      xmm4,[esp+90h]
+  movdqa      xmm5,[esp+0A0h]
+  movdqa      xmm7,[esp+0B0h]
+  pxor        xmm0,xmm0
+  movd        xmm1,ecx
+  movdqa      xmm2,xmm1
+  punpcklwd   xmm2,xmm1
+  pshufd      xmm1,xmm2,0
+  movd        xmm2,edx
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm2,xmm3,0
+  movdqa      xmm3,xmm6
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+60h],xmm6
+  movdqa      xmm6,[esp+90h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+30h],xmm6
+  movdqa      xmm6,[esp+0A0h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+40h],xmm6
+  movdqa      xmm6,[esp+0B0h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+70h],xmm6
+  punpcklbw   xmm7,xmm0
+  punpcklbw   xmm4,xmm0
+  punpcklbw   xmm5,xmm0
+  punpcklbw   xmm3,xmm0
+  movdqa      [esp+50h],xmm7
+  movdqa      xmm6,xmm4
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  movdqa      xmm0,xmm1
+  pcmpgtw     xmm0,xmm6
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm4
+  pabsw       xmm6,xmm6
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pand        xmm0,xmm7
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+30h]
+  psubw       xmm6,[esp+40h]
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm1,xmm6
+  movdqa      xmm6,[esp+60h]
+  psubw       xmm6,[esp+30h]
+  pabsw       xmm6,xmm6
+  pand        xmm0,xmm7
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+70h]
+  psubw       xmm6,[esp+40h]
+  pabsw       xmm6,xmm6
+  pand        xmm1,xmm7
+  pcmpgtw     xmm2,xmm6
+  pand        xmm1,xmm2
+  mov         eax,2
+  movsx       ecx,ax
+  movd        xmm2,ecx
+  movdqa      xmm6,xmm2
+  punpcklwd   xmm6,xmm2
+  pshufd      xmm2,xmm6,0
+  movdqa      [esp+20h],xmm2
+  movdqa      xmm2,xmm3
+  paddw       xmm2,xmm3
+  paddw       xmm2,xmm4
+  paddw       xmm2,[esp+50h]
+  paddw       xmm2,[esp+20h]
+  psraw       xmm2,2
+  movdqa      xmm6,xmm0
+  pand        xmm6,xmm2
+  movdqa      xmm2,xmm0
+  pandn       xmm2,xmm4
+  por         xmm6,xmm2
+  movdqa      xmm2,[esp+60h]
+  movdqa      xmm7,xmm2
+  paddw       xmm7,xmm2
+  paddw       xmm7,[esp+30h]
+  paddw       xmm7,[esp+70h]
+  paddw       xmm7,[esp+20h]
+  movdqa      xmm4,xmm1
+  movdqa      xmm2,xmm1
+  pandn       xmm2,[esp+30h]
+  psraw       xmm7,2
+  pand        xmm4,xmm7
+  por         xmm4,xmm2
+  movdqa      xmm2,[esp+50h]
+  packuswb    xmm6,xmm4
+  movdqa      [esp+90h],xmm6
+  movdqa      xmm6,xmm2
+  paddw       xmm6,xmm2
+  movdqa      xmm2,[esp+20h]
+  paddw       xmm6,xmm5
+  paddw       xmm6,xmm3
+  movdqa      xmm4,xmm0
+  pandn       xmm0,xmm5
+  paddw       xmm6,xmm2
+  psraw       xmm6,2
+  pand        xmm4,xmm6
+  por         xmm4,xmm0
+  movdqa      xmm0,[esp+70h]
+  movdqa      xmm5,xmm0
+  paddw       xmm5,xmm0
+  movdqa      xmm0,[esp+40h]
+  paddw       xmm5,xmm0
+  paddw       xmm5,[esp+60h]
+  movdqa      xmm3,xmm1
+  paddw       xmm5,xmm2
+  psraw       xmm5,2
+  pand        xmm3,xmm5
+  pandn       xmm1,xmm0
+  por         xmm3,xmm1
+  packuswb    xmm4,xmm3
+  movdqa      [esp+0A0h],xmm4
+  mov         esi,dword [esp+10h]
+  movdqa      xmm0,[esi]
+  movdqa      xmm1,[esi+10h]
+  movdqa      xmm2,[esi+20h]
+  movdqa      xmm3,[esi+30h]
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         esi,dword [esp+1Ch]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+14h]
+  mov         edi,dword [esp+8]
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         esi,dword [esp+18h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         edi,dword [esp+0Ch]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  pop         edi
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+;*******************************************************************************
+;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
+; 32-bit, stack-passed arguments. Filters the 4 pixels straddling a vertical edge (columns x-2..x+1) in 8 rows of Cb and 8 rows of Cr.
+WELS_EXTERN  DeblockChromaLt4H_sse2
+; Only columns x-1 (p0) and x (q0) change: p0 += d, q0 -= d, d = clip(-tc, tc, (4*(q0-p0) + (p1-q1) + 4) >> 3), gated by the alpha/beta/tc masks.
+ALIGN  16
+; NOTE: the body uses pabsw, which is an SSSE3 instruction, so this routine needs SSSE3 despite the _sse2 suffix.
+DeblockChromaLt4H_sse2:
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h              ; align the frame to 16 bytes so movdqa can address stack slots
+  sub         esp,108h
+  mov         ecx,dword [ebp+8]           ; ecx = pPixCb
+  mov         edx,dword [ebp+0Ch]         ; edx = pPixCr
+  mov         eax,dword [ebp+10h]         ; eax = iStride
+  sub         ecx,2                       ; start two pixels left of the edge
+  sub         edx,2
+  push        esi
+  lea         esi,[eax+eax*2]             ; esi = 3*iStride
+  mov         dword [esp+10h],ecx         ; pPixCb-2 (read back as [esp+14h] after the next push)
+  mov         dword [esp+4],edx           ; pPixCr-2 (read back as [esp+8] after the next push)
+  lea         ecx,[ecx+eax*4]             ; Cb, rows 4..7
+  lea         edx,[edx+eax*4]             ; Cr, rows 4..7
+  lea         eax,[esp+6Ch]               ; 4 x 16-byte column buffer (== [esp+70h] after the next push)
+  push        edi
+  mov         dword [esp+0Ch],esi         ; [esp+0Ch] = 3*iStride
+  mov         dword [esp+18h],ecx         ; [esp+18h] = Cb rows 4..7 base
+  mov         dword [esp+10h],edx         ; [esp+10h] = Cr rows 4..7 base
+  mov         dword [esp+1Ch],eax         ; [esp+1Ch] = column buffer address
+  mov         esi,dword [esp+14h]         ; esi = pPixCb-2
+  mov         ecx,dword [ebp+10h]         ; ecx = iStride
+  mov         edx,dword [esp+0Ch]         ; edx = 3*iStride
+  movd        xmm0,dword [esi]            ; 4 bytes from each of Cb rows 0..3
+  movd        xmm1,dword [esi+ecx]
+  movd        xmm2,dword [esi+ecx*2]
+  movd        xmm3,dword [esi+edx]
+  mov         esi,dword [esp+8]           ; esi = pPixCr-2
+  movd        xmm4,dword [esi]            ; 4 bytes from each of Cr rows 0..3
+  movd        xmm5,dword [esi+ecx]
+  movd        xmm6,dword [esi+ecx*2]
+  movd        xmm7,dword [esi+edx]
+  punpckldq   xmm0,xmm4                   ; dword0 = Cb row, dword1 = Cr row
+  punpckldq   xmm1,xmm5
+  punpckldq   xmm2,xmm6
+  punpckldq   xmm3,xmm7
+  mov         esi,dword [esp+18h]         ; esi = Cb rows 4..7
+  mov         edi,dword [esp+10h]         ; edi = Cr rows 4..7
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [edi]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm0,xmm4                   ; xmm0 = {Cb r0, Cr r0, Cb r4, Cr r4}
+  movd        xmm4,dword [esi+ecx]
+  movd        xmm5,dword [edi+ecx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm1,xmm4                   ; xmm1 = {Cb r1, Cr r1, Cb r5, Cr r5}
+  movd        xmm4,dword [esi+ecx*2]
+  movd        xmm5,dword [edi+ecx*2]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm2,xmm4                   ; xmm2 = {Cb r2, Cr r2, Cb r6, Cr r6}
+  movd        xmm4,dword [esi+edx]
+  movd        xmm5,dword [edi+edx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm3,xmm4                   ; xmm3 = {Cb r3, Cr r3, Cb r7, Cr r7}
+  movdqa      xmm6,xmm0                   ; transpose 16 rows x 4 bytes into 4 columns x 16 bytes
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4                   ; xmm0 = column x-2: Cb rows 0..7 | Cr rows 0..7
+  punpckhqdq  xmm5,xmm4                   ; xmm5 = column x-1
+  punpcklqdq  xmm1,xmm2                   ; xmm1 = column x
+  punpckhqdq  xmm6,xmm2                   ; xmm6 = column x+1
+  mov         edi,dword [esp+1Ch]         ; edi = column buffer (== esp+70h)
+  movdqa      [edi],xmm0                  ; [esp+70h]  = p1 (column x-2)
+  movdqa      [edi+10h],xmm5              ; [esp+80h]  = p0 (column x-1)
+  movdqa      [edi+20h],xmm1              ; [esp+90h]  = q0 (column x)
+  movdqa      [edi+30h],xmm6              ; [esp+0A0h] = q1 (column x+1)
+  mov         eax,dword [ebp+1Ch]         ; eax = pTC
+  movsx       cx,byte [eax+3]             ; cx = pTC[3]
+  movsx       dx,byte [eax+2]             ; dx = pTC[2]
+  movsx       si,byte [eax+1]             ; si = pTC[1]
+  movsx       ax,byte [eax]               ; ax = pTC[0]
+  movzx       edi,cx                      ; the tc words are copied into several xmm registers ...
+  movzx       ecx,cx
+  movd        xmm2,ecx
+  movzx       ecx,dx
+  movzx       edx,dx
+  movd        xmm3,ecx
+  movd        xmm4,edx
+  movzx       ecx,si
+  movzx       edx,si
+  movd        xmm5,ecx
+  pxor        xmm0,xmm0
+  movd        xmm6,edx
+  movzx       ecx,ax
+  movdqa      [esp+60h],xmm0              ; [esp+60h] = zero vector
+  movzx       edx,ax
+  movsx       eax,word [ebp+14h]          ; eax = low 16 bits of iAlpha, sign-extended
+  punpcklwd   xmm6,xmm2                   ; ... and interleaved pairwise
+  movd        xmm1,edi
+  movd        xmm7,ecx
+  movsx       ecx,word [ebp+18h]          ; ecx = low 16 bits of iBeta, sign-extended
+  movd        xmm0,edx
+  punpcklwd   xmm7,xmm3
+  punpcklwd   xmm5,xmm1
+  movdqa      xmm1,[esp+60h]              ; xmm1 = 0 (used for byte->word unpacking)
+  punpcklwd   xmm7,xmm5
+  movdqa      xmm5,[esp+0A0h]             ; xmm5 = q1 bytes
+  punpcklwd   xmm0,xmm4
+  punpcklwd   xmm0,xmm6
+  movdqa      xmm6, [esp+70h]             ; xmm6 = p1 bytes
+  punpcklwd   xmm0,xmm7                   ; xmm0 = tc words {tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3}
+  movdqa      xmm7,[esp+80h]              ; xmm7 = p0 bytes
+  movdqa      xmm2,xmm1
+  psubw       xmm2,xmm0
+  movdqa      [esp+0D0h],xmm2             ; [esp+0D0h] = -tc
+  movd        xmm2,eax
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm4,xmm3,0                 ; xmm4 = alpha in all 8 words
+  movd        xmm2,ecx
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm2,xmm3,0                 ; xmm2 = beta in all 8 words
+  movdqa      xmm3, [esp+90h]             ; xmm3 = q0 bytes
+  movdqa      [esp+50h],xmm2              ; [esp+50h] = beta
+  movdqa      xmm2,xmm6
+  punpcklbw   xmm2,xmm1                   ; low 8 bytes (Cb rows) -> words
+  punpckhbw   xmm6,xmm1                   ; high 8 bytes (Cr rows) -> words
+  movdqa      [esp+40h],xmm2              ; [esp+40h]  = p1, Cb
+  movdqa      [esp+0B0h],xmm6             ; [esp+0B0h] = p1, Cr
+  movdqa      xmm6,[esp+90h]
+  movdqa      xmm2,xmm7
+  punpckhbw   xmm7,xmm1                   ; xmm7 = p0, Cr
+  punpckhbw   xmm6,xmm1                   ; xmm6 = q0, Cr
+  punpcklbw   xmm2,xmm1                   ; xmm2 = p0, Cb
+  punpcklbw   xmm3,xmm1                   ; xmm3 = q0, Cb
+  punpcklbw   xmm5,xmm1                   ; xmm5 = q1, Cb
+  movdqa      [esp+0F0h],xmm7             ; [esp+0F0h] = p0, Cr
+  movdqa      [esp+0C0h],xmm6             ; [esp+0C0h] = q0, Cr
+  movdqa      xmm6, [esp+0A0h]
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+0E0h],xmm6             ; [esp+0E0h] = q1, Cr
+  mov         edx,4
+  movsx       eax,dx
+  movd        xmm6,eax
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      [esp+30h],xmm6              ; [esp+30h] = rounding constant 4 in all words
+  movdqa      xmm7, [esp+40h]
+  psubw       xmm7,xmm5                   ; xmm7 = p1 - q1 (Cb)
+  movdqa      xmm6,xmm0
+  pcmpgtw     xmm6,xmm1
+  movdqa      [esp+60h],xmm6              ; [esp+60h] = mask (tc > 0); the zero slot is reused
+  movdqa      xmm1, [esp+0D0h]            ; xmm1 = -tc
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm2                   ; q0 - p0
+  psllw       xmm6,2                      ; 4*(q0 - p0)
+  paddw       xmm6,xmm7                   ; + (p1 - q1)
+  paddw       xmm6,[esp+30h]              ; + 4
+  psraw       xmm6,3                      ; >> 3
+  pmaxsw      xmm1,xmm6                   ; clip from below at -tc
+  movdqa      xmm7,[esp+50h]              ; xmm7 = beta
+  movdqa      [esp+20h],xmm0
+  movdqa      xmm6, [esp+20h]             ; xmm6 = tc
+  pminsw      xmm6,xmm1                   ; clip from above at tc
+  movdqa      [esp+20h],xmm6              ; [esp+20h] = clipped delta (Cb)
+  movdqa      xmm6,xmm4
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1                   ; SSSE3 instruction
+  pcmpgtw     xmm6,xmm1                   ; alpha > |p0 - q0|
+  movdqa      xmm1, [esp+40h]
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm7,xmm1                   ; beta > |p1 - p0|
+  movdqa      xmm1, [esp+50h]
+  pand        xmm6,xmm7
+  movdqa      xmm7, [esp+50h]             ; beta again, for the Cr half below
+  psubw       xmm5,xmm3
+  pabsw       xmm5,xmm5
+  pcmpgtw     xmm1,xmm5                   ; beta > |q1 - q0|
+  movdqa      xmm5, [esp+0B0h]
+  psubw       xmm5,[esp+0E0h]             ; xmm5 = p1 - q1 (Cr)
+  pand        xmm6,xmm1
+  pand        xmm6, [esp+60h]             ; ... and tc > 0
+  movdqa      xmm1, [esp+20h]
+  pand        xmm1,xmm6                   ; delta is zeroed where the edge must not be filtered
+  movdqa      xmm6, [esp+0C0h]
+  movdqa      [esp+40h],xmm1              ; [esp+40h] = masked delta (Cb)
+  movdqa      xmm1, [esp+0F0h]            ; xmm1 = p0 (Cr)
+  psubw       xmm6,xmm1                   ; same delta computation for the Cr half
+  psllw       xmm6,2
+  paddw       xmm6,xmm5
+  paddw       xmm6, [esp+30h]
+  movdqa      xmm5, [esp+0D0h]
+  psraw       xmm6,3
+  pmaxsw      xmm5,xmm6
+  pminsw      xmm0,xmm5                   ; xmm0 = clipped delta (Cr); tc is consumed here
+  movdqa      xmm5,[esp+0C0h]             ; xmm5 = q0 (Cr)
+  movdqa      xmm6,xmm1
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm4,xmm6                   ; alpha > |p0 - q0|
+  movdqa      xmm6,[esp+0B0h]
+  psubw       xmm6,xmm1
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6                   ; beta > |p1 - p0|
+  movdqa      xmm6, [esp+0E0h]
+  pand        xmm4,xmm7
+  movdqa      xmm7, [esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6                   ; beta > |q1 - q0|
+  pand        xmm4,xmm7
+  pand        xmm4,[esp+60h]
+  pand        xmm0,xmm4                   ; xmm0 = masked delta (Cr)
+  movdqa      xmm4, [esp+40h]
+  paddw       xmm2,xmm4                   ; p0 += delta (Cb)
+  paddw       xmm1,xmm0                   ; p0 += delta (Cr)
+  psubw       xmm3,xmm4                   ; q0 -= delta (Cb)
+  psubw       xmm5,xmm0                   ; q0 -= delta (Cr)
+  packuswb    xmm2,xmm1                   ; saturate back to bytes
+  packuswb    xmm3,xmm5
+  movdqa      [esp+80h],xmm2              ; overwrite the p0 column in the buffer
+  movdqa      [esp+90h],xmm3              ; overwrite the q0 column in the buffer
+  mov         esi,dword [esp+1Ch]
+  movdqa      xmm0, [esi]                 ; reload the 4 columns and transpose back to rows
+  movdqa      xmm1, [esi+10h]
+  movdqa      xmm2, [esi+20h]
+  movdqa      xmm3, [esi+30h]
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4                   ; xmm0 = {Cb r0, Cr r0, Cb r4, Cr r4}
+  punpckhqdq  xmm5,xmm4                   ; xmm5 = {Cb r1, Cr r1, Cb r5, Cr r5}
+  punpcklqdq  xmm1,xmm2                   ; xmm1 = {Cb r2, Cr r2, Cb r6, Cr r6}
+  punpckhqdq  xmm6,xmm2                   ; xmm6 = {Cb r3, Cr r3, Cb r7, Cr r7}
+  mov         esi,dword [esp+14h]         ; esi = pPixCb-2
+  mov         ecx,dword [ebp+10h]         ; ecx = iStride
+  mov         edx,dword [esp+0Ch]         ; edx = 3*iStride
+  mov         edi,dword [esp+8]           ; edi = pPixCr-2
+  movd        dword [esi],xmm0            ; write Cb rows 0..3
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4                      ; shift the next dword down
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         esi,dword [esp+18h]         ; esi = Cb rows 4..7
+  movd        dword [edi],xmm0            ; write Cr rows 0..3
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  movd        dword [esi],xmm0            ; write Cb rows 4..7
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         edi,dword [esp+10h]         ; edi = Cr rows 4..7
+  movd        dword [edi],xmm0            ; write Cr rows 4..7
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  pop         edi
+  pop         esi
+  mov         esp,ebp                     ; drop the aligned frame
+  pop         ebp
+  ret
+
+
+
+;*******************************************************************************
+;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+;                                 int32_t iBeta, int8_t * pTC)
+;*******************************************************************************
+; 32-bit, stack-passed arguments. Filters 16 pixels across a horizontal edge; reads rows pPix-3*iStride .. pPix+2*iStride (p2,p1,p0,q0,q1,q2) and rewrites p1,p0,q0,q1.
+; All row accesses are movdqa, so every row address must be 16-byte aligned. pTC[k] applies to pixels 4k..4k+3.
+WELS_EXTERN  DeblockLumaLt4V_sse2
+; NOTE: the body uses pabsw, which is an SSSE3 instruction, so this routine needs SSSE3 despite the _sse2 suffix.
+ALIGN  16
+
+DeblockLumaLt4V_sse2:
+    push	ebp
+	mov	ebp, esp
+	and	esp, -16				; fffffff0H: align the frame so movdqa can address stack slots
+	sub	esp, 420				; 000001a4H
+	mov	eax, dword [ebp+8]	; eax = pPix (row q0)
+	mov	ecx, dword [ebp+12]	; ecx = iStride
+
+	pxor	xmm0, xmm0
+	push	ebx
+	mov	edx, dword [ebp+24]	; edx = pTC
+	movdqa	[esp+424-384], xmm0	; zero vector; same slot as [esp+432-384] once esi/edi are pushed
+	push	esi
+
+	lea	esi, [ecx+ecx*2]	; 3*iStride
+	push	edi
+	mov	edi, eax
+	sub	edi, esi
+	movdqa	xmm0, [edi]	; row p2
+
+	lea	esi, [ecx+ecx]	; esi = 2*iStride
+	movdqa	[esp+432-208], xmm0	; [432-208] = p2 bytes
+	mov	edi, eax
+	sub	edi, esi	; edi = &p1 row; kept until the final store
+	movdqa	xmm0, [edi]
+	movdqa	[esp+448-208], xmm0	; [448-208] = p1 bytes
+
+	mov	ebx, eax
+	sub	ebx, ecx	; ebx = &p0 row
+	movdqa	xmm0, [ebx]
+	movdqa	[esp+464-208], xmm0	; [464-208] = p0 bytes
+
+	movdqa	xmm0, [eax]
+
+	add	ecx, eax	; ecx = &q1 row
+	movdqa	[esp+480-208], xmm0	; [480-208] = q0 bytes
+	movdqa	xmm0, [ecx]
+	mov	dword [esp+432-404], ecx	; save &q1
+
+	movsx	ecx, word [ebp+16]	; low 16 bits of iAlpha, sign-extended
+	movdqa	[esp+496-208], xmm0	; [496-208] = q1 bytes
+	movdqa	xmm0, [esi+eax]	; row q2 (pPix + 2*iStride)
+
+	movsx	si, byte [edx]	; si = pTC[0]
+	movdqa	[esp+512-208], xmm0	; [512-208] = q2 bytes
+	movd	xmm0, ecx
+	movsx	ecx, word [ebp+20]	; low 16 bits of iBeta, sign-extended
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	pshufd	xmm0, xmm1, 0
+	movdqa	[esp+432-112], xmm0	; [432-112] = alpha in all 8 words
+	movd	xmm0, ecx
+	movsx	cx, byte [edx+1]	; cx = pTC[1]
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	mov	dword [esp+432-408], ebx	; save &p0
+	movzx	ebx, cx
+	pshufd	xmm0, xmm1, 0	; xmm0 = beta in all 8 words
+	movd	xmm1, ebx
+	movzx	ebx, cx
+	movd	xmm2, ebx
+	movzx	ebx, cx
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, si
+	movd	xmm5, ecx
+	movzx	ecx, si
+	movd	xmm6, ecx
+	movzx	ecx, si
+	movd	xmm7, ecx
+	movzx	ecx, si
+	movdqa	[esp+432-336], xmm0	; [432-336] = beta
+	movd	xmm0, ecx
+
+	movsx	cx, byte [edx+3]	; cx = pTC[3]
+	movsx	dx, byte [edx+2]	; dx = pTC[2]
+	movd	xmm3, ebx
+	punpcklwd xmm0, xmm4
+	movzx	esi, cx
+	punpcklwd xmm6, xmm2
+	punpcklwd xmm5, xmm1
+	punpcklwd xmm0, xmm6
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	punpcklwd xmm0, xmm7	; xmm0 = {tc0 x4, tc1 x4}
+	movdqa	[esp+432-400], xmm0	; [432-400] = tc for pixels 0..7
+	movd	xmm0, esi
+	movzx	esi, cx
+	movd	xmm2, esi
+	movzx	esi, cx
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, dx
+	movd	xmm3, esi
+	movd	xmm5, ecx
+	punpcklwd xmm5, xmm0
+
+	movdqa	xmm0, [esp+432-384]	; xmm0 = 0; used for byte->word unpacking until it is consumed further down
+	movzx	ecx, dx
+	movd	xmm6, ecx
+	movzx	ecx, dx
+	movzx	edx, dx
+	punpcklwd xmm6, xmm2
+	movd	xmm7, ecx
+	movd	xmm1, edx
+
+	movdqa	xmm2, [esp+448-208]
+	punpcklbw xmm2, xmm0	; xmm2 = p1, pixels 0..7 as words
+
+	mov	ecx, 4
+	movsx	edx, cx	; edx = 4 (rounding constant, broadcast later)
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	movdqa	xmm5, [esp+496-208]
+	movdqa	xmm3, [esp+464-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-240], xmm5	; [432-240] = q1 lo
+	movdqa	xmm5, [esp+512-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-352], xmm5	; [432-352] = q2 lo
+	punpcklwd xmm1, xmm4
+	movdqa	xmm4, [esp+432-208]
+	punpcklwd xmm1, xmm6
+	movdqa	xmm6, [esp+480-208]
+	punpcklwd xmm1, xmm7	; xmm1 = {tc2 x4, tc3 x4}: tc for pixels 8..15
+	punpcklbw xmm6, xmm0	; xmm6 = q0 lo
+	punpcklbw xmm3, xmm0	; xmm3 = p0 lo
+	punpcklbw xmm4, xmm0	; xmm4 = p2 lo
+	movdqa	xmm7, xmm3
+	psubw	xmm7, xmm4
+	pabsw	xmm7, xmm7	; |p0 - p2| (SSSE3 instruction)
+	movdqa	[esp+432-272], xmm4	; [432-272] = p2 lo
+	movdqa	xmm4, [esp+432-336]	; xmm4 = beta
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5	; [432-288] = mask beta > |p2 - p0|
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-352]
+	pabsw	xmm7, xmm7
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5	; [432-256] = mask beta > |q2 - q0|
+	movdqa	xmm5, xmm3
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5	; [432-304] = (p0 + q0 + 1) >> 1
+	movdqa	xmm5, [esp+432-400]
+	psubw	xmm5, [esp+432-288]	; a set mask is -1, so subtracting it adds 1
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5	; [432-224] = tc + (both side masks) = clip bound for p0/q0
+	movdqa	xmm5, xmm6
+	psubw	xmm5, xmm3	; q0 - p0
+	movdqa	[esp+432-32], xmm6	; [432-32] = q0 lo
+	psubw	xmm6, [esp+432-240]	; q0 - q1
+	movdqa	xmm7, xmm5
+	movdqa	[esp+432-384], xmm5	; [432-384] = q0 - p0 (the zero slot is reused)
+	movdqa	xmm5, [esp+432-112]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm5, xmm7	; alpha > |q0 - p0|
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6	; beta > |q0 - q1|
+
+	pand	xmm5, xmm7
+	movdqa	xmm6, xmm3
+	psubw	xmm6, xmm2
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6	; beta > |p0 - p1|
+	movdqa	xmm6, [esp+432-400]
+	pand	xmm5, xmm7
+	movdqa	xmm7, xmm6
+	pcmpeqw	xmm6, xmm0
+	pcmpgtw	xmm7, xmm0
+	por	xmm7, xmm6	; tc >= 0
+	pand	xmm5, xmm7
+	movdqa	[esp+432-320], xmm5	; [432-320] = filter-enable mask, pixels 0..7
+	movd	xmm5, edx
+	movdqa	xmm6, xmm5
+	punpcklwd xmm6, xmm5
+	pshufd	xmm5, xmm6, 0
+	movdqa	[esp+432-336], xmm5	; [432-336] = constant 4 (beta stays live in xmm4)
+	movdqa	xmm5, [esp+432-224]
+	movdqa	[esp+432-368], xmm5
+	movdqa	xmm6, xmm0
+	psubw	xmm6, xmm5	; negative clip bound
+	movdqa	xmm5, [esp+432-384]
+	psllw	xmm5, 2	; 4*(q0 - p0)
+	movdqa	xmm7, xmm2
+	psubw	xmm7, [esp+432-240]	; p1 - q1
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+432-336]	; + 4
+	movdqa	xmm5, [esp+432-368]
+	psraw	xmm7, 3
+	pmaxsw	xmm6, xmm7
+	pminsw	xmm5, xmm6	; delta clipped to +/- clip bound
+
+	pand	xmm5, [esp+432-320]
+	movdqa	xmm6, [esp+432-400]
+	movdqa	[esp+432-64], xmm5	; [432-64] = p0/q0 delta, pixels 0..7
+	movdqa	[esp+432-384], xmm6	; [432-384] = tc lo
+	movdqa	xmm5, xmm0
+	psubw	xmm5, xmm6
+	movdqa	[esp+432-368], xmm5	; [432-368] = -tc lo
+	movdqa	xmm6, xmm5
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm5, [esp+432-304]	; p2 + avg(p0,q0)
+	movdqa	xmm7, xmm2
+	paddw	xmm7, xmm2
+	psubw	xmm5, xmm7	; - 2*p1
+	psraw	xmm5, 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-384]
+	pminsw	xmm5, xmm6	; clipped to [-tc, tc]
+
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-288]	; p1 only changes where beta > |p2 - p0|
+	movdqa	xmm6, [esp+432-240]
+	movdqa	[esp+432-96], xmm5	; [432-96] = p1 adjustment, pixels 0..7
+	movdqa	xmm5, [esp+432-352]
+	paddw	xmm5, [esp+432-304]	; q2 + avg(p0,q0)
+	movdqa	xmm7, xmm6
+	paddw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-368]
+	psubw	xmm5, xmm7	; - 2*q1
+
+	movdqa	xmm7, [esp+496-208]
+	psraw	xmm5, 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-400]
+	pminsw	xmm5, xmm6
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-256]	; q1 only changes where beta > |q2 - q0|
+	movdqa	xmm6, [esp+448-208]
+	punpckhbw xmm7, xmm0
+	movdqa	[esp+432-352], xmm7	; from here the slots hold pixels 8..15: [432-352] = q1 hi
+
+	movdqa	xmm7, [esp+512-208]
+	punpckhbw xmm6, xmm0
+	movdqa	[esp+432-48], xmm5	; [432-48] = q1 adjustment, pixels 0..7
+	movdqa	xmm5, [esp+432-208]
+	movdqa	[esp+432-368], xmm6	; [432-368] = p1 hi
+	movdqa	xmm6, [esp+464-208]
+	punpckhbw xmm7, xmm0
+	punpckhbw xmm5, xmm0
+	movdqa	[esp+432-384], xmm7	; [432-384] = q2 hi
+	punpckhbw xmm6, xmm0
+	movdqa	[esp+432-400], xmm6	; [432-400] = p0 hi
+
+	movdqa	xmm7, [esp+432-400]
+	movdqa	xmm6, [esp+480-208]
+	psubw	xmm7, xmm5
+	movdqa	[esp+432-16], xmm5	; [432-16] = p2 hi
+	pabsw	xmm7, xmm7
+	punpckhbw xmm6, xmm0	; xmm6 = q0 hi
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5	; beta > |p2 - p0| (hi)
+
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-384]
+	pabsw	xmm7, xmm7
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5	; beta > |q2 - q0| (hi)
+
+	movdqa	xmm5, [esp+432-400]
+	movdqa	[esp+432-80], xmm6	; [432-80] = q0 hi
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5	; (p0 + q0 + 1) >> 1 (hi)
+
+	movdqa	xmm5, xmm1
+	psubw	xmm5, [esp+432-288]
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5	; clip bound for p0/q0 (hi)
+	movdqa	xmm5, xmm6
+	psubw	xmm5, [esp+432-400]	; q0 - p0
+	psubw	xmm6, [esp+432-352]	; q0 - q1
+	movdqa	[esp+432-272], xmm5
+	movdqa	xmm7, xmm5
+	movdqa	xmm5, [esp+432-112]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm5, xmm7	; alpha > |q0 - p0|
+	movdqa	xmm7, xmm4
+	pabsw	xmm6, xmm6
+	pcmpgtw	xmm7, xmm6	; beta > |q0 - q1|
+	movdqa	xmm6, [esp+432-368]
+
+	pand	xmm5, xmm7
+	movdqa	xmm7, [esp+432-400]
+	psubw	xmm7, xmm6
+	psubw	xmm6, [esp+432-352]	; xmm6 = p1 - q1 (hi)
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7	; beta > |p0 - p1|
+	pand	xmm5, xmm4
+
+	paddw	xmm2, [esp+432-96]	; xmm2 = new p1, pixels 0..7
+	movdqa	xmm4, xmm1
+	pcmpgtw	xmm4, xmm0
+	movdqa	xmm7, xmm1
+	pcmpeqw	xmm7, xmm0
+	por	xmm4, xmm7	; tc >= 0 (hi)
+	pand	xmm5, xmm4
+	movdqa	xmm4, [esp+432-224]
+	movdqa	[esp+432-320], xmm5	; filter-enable mask, pixels 8..15
+	movdqa	xmm5, [esp+432-272]
+	movdqa	xmm7, xmm0
+	psubw	xmm7, xmm4
+	psubw	xmm0, xmm1	; xmm0 = -tc (hi); xmm0 is no longer the zero vector
+	psllw	xmm5, 2
+	paddw	xmm6, xmm5
+	paddw	xmm6, [esp+432-336]	; (p1 - q1) + 4*(q0 - p0) + 4
+	movdqa	xmm5, [esp+432-368]
+	movdqa	[esp+432-336], xmm0	; [432-336] = -tc (hi)
+	psraw	xmm6, 3
+	pmaxsw	xmm7, xmm6
+	pminsw	xmm4, xmm7
+	pand	xmm4, [esp+432-320]
+	movdqa	xmm6, xmm0
+	movdqa	xmm0, [esp+432-16]
+	paddw	xmm0, [esp+432-304]
+	movdqa	[esp+432-272], xmm4	; [432-272] = p0/q0 delta, pixels 8..15
+	movdqa	xmm4, [esp+432-368]
+	paddw	xmm4, xmm4
+	psubw	xmm0, xmm4	; p2 + avg - 2*p1 (hi)
+
+	movdqa	xmm4, [esp+432-64]
+	psraw	xmm0, 1
+	pmaxsw	xmm6, xmm0
+	movdqa	xmm0, [esp+432-400]
+	movdqa	xmm7, xmm1
+	pminsw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-320]
+	pand	xmm7, xmm6
+	pand	xmm7, [esp+432-288]
+	paddw	xmm5, xmm7	; new p1, pixels 8..15
+	packuswb xmm2, xmm5	; xmm2 = new p1 row
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm0, xmm5	; p0 + delta (hi)
+	paddw	xmm3, xmm4	; p0 + delta (lo)
+	packuswb xmm3, xmm0	; xmm3 = new p0 row
+
+	movdqa	xmm0, [esp+432-32]
+	psubw	xmm0, xmm4	; q0 - delta (lo)
+	movdqa	xmm4, [esp+432-80]
+	psubw	xmm4, xmm5	; q0 - delta (hi)
+
+	movdqa	xmm5, [esp+432-240]
+	paddw	xmm5, [esp+432-48]	; new q1, pixels 0..7
+	packuswb xmm0, xmm4
+	movdqa	xmm4, [esp+432-384]
+	paddw	xmm4, [esp+432-304]
+	movdqa	[esp+480-208], xmm0	; park the new q0 row
+	movdqa	xmm0, [esp+432-352]
+	movdqa	xmm7, xmm0
+	paddw	xmm0, xmm0
+
+	mov	ecx, dword [esp+432-408]	; ecx = &p0
+
+	mov	edx, dword [esp+432-404]	; edx = &q1
+	psubw	xmm4, xmm0	; q2 + avg - 2*q1 (hi)
+	movdqa	xmm0, [esp+432-336]
+	movdqa	[edi], xmm2	; store new p1 row (edi = pPix - 2*iStride)
+	psraw	xmm4, 1
+	pmaxsw	xmm0, xmm4
+	pminsw	xmm1, xmm0
+	movdqa	xmm0, [esp+480-208]
+
+	pop	edi
+	pand	xmm1, xmm6
+	pand	xmm1, [esp+428-256]	; same slot as [432-256]; offset is 4 smaller because edi was just popped
+	movdqa	[ecx], xmm3	; store new p0 row
+	paddw	xmm7, xmm1
+	pop	esi
+	packuswb xmm5, xmm7	; xmm5 = new q1 row
+	movdqa	[eax], xmm0	; store new q0 row
+	movdqa	[edx], xmm5	; store new q1 row
+	pop	ebx
+	mov	esp, ebp
+	pop	ebp
+	ret
+
+
+;*******************************************************************************
+;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+;                                 int32_t iBeta)
+;*******************************************************************************
+; 32-bit, stack-passed arguments. Filters 16 pixels across a horizontal edge; reads rows pPix-4*iStride .. pPix+3*iStride (p3..p0, q0..q3), rewrites p2,p1,p0,q0,q1,q2.
+WELS_EXTERN  DeblockLumaEq4V_sse2
+; Rows are accessed with movdqa (16-byte aligned addresses required). Uses pabsw (SSSE3) despite the _sse2 suffix.
+ALIGN  16
+
+DeblockLumaEq4V_sse2:
+
+	push	ebp
+	mov	ebp, esp
+	and	esp, -16				; fffffff0H: align the frame so movdqa can address stack slots
+	sub	esp, 628				; 00000274H
+	mov	eax, dword [ebp+8]	; eax = pPix (row q0)
+	mov	ecx, dword [ebp+12]	; ecx = iStride
+	push	ebx
+	push	esi
+
+	lea	edx, [ecx*4]	; 4*iStride
+	pxor	xmm0, xmm0
+	movdqa	xmm2, xmm0	; xmm2 = 0, used for byte->word unpacking
+
+	movdqa	xmm0, [ecx+eax]	; row q1
+	mov	esi, eax
+	sub	esi, edx
+	movdqa	xmm3, [esi]	; xmm3 = row p3
+	movdqa	xmm5, [eax]	; xmm5 = row q0
+	push	edi
+	lea	edi, [ecx+ecx]
+	lea	ebx, [ecx+ecx*2]	; ebx = 3*iStride
+	mov	dword [esp+640-600], edi	; save 2*iStride
+	mov	esi, eax
+	sub	esi, edi	; esi = &p1 row; kept until the final store
+	movdqa	xmm1, [esi]	; xmm1 = row p1
+	movdqa	 [esp+720-272], xmm0	; [720-272] = q1 bytes
+	mov	edi, eax
+	sub	edi, ecx	; edi = &p0 row; kept until the final store
+	movdqa	xmm4, [edi]	; xmm4 = row p0
+	add	ecx, eax
+	mov	dword [esp+640-596], ecx	; save &q1
+
+	mov	ecx, dword [esp+640-600]
+	movdqa	xmm0, [ecx+eax]
+	movdqa	 [esp+736-272], xmm0	; [736-272] = q2 bytes
+
+	movdqa	xmm0, [eax+ebx]	; row q3
+	mov	edx, eax
+	sub	edx, ebx	; edx = &p2 row; kept until the final store
+
+	movsx	ebx, word [ebp+16]	; low 16 bits of iAlpha, sign-extended
+	movdqa	xmm6, [edx]	; xmm6 = row p2
+	add	ecx, eax	; ecx = &q2 row; kept until the final store
+	movdqa	 [esp+752-272], xmm0	; [752-272] = q3 bytes
+	movd	xmm0, ebx
+
+	movsx	ebx, word [ebp+20]	; low 16 bits of iBeta, sign-extended
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0
+	movdqa	 [esp+640-320], xmm0	; [640-320] = alpha in all 8 words
+	movd	xmm0, ebx
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0	; xmm0 = beta in all 8 words
+
+	movdqa	xmm7, [esp+736-272]
+	punpcklbw xmm7, xmm2
+	movdqa	 [esp+640-416], xmm7	; [640-416] = q2 lo (pixels 0..7 as words)
+	movdqa	 [esp+640-512], xmm0	; [640-512] = beta
+	movdqa	xmm0, xmm1
+	movdqa	 [esp+672-272], xmm1	; [672-272] = p1 bytes
+	movdqa	xmm1, xmm4
+	movdqa	 [esp+704-272], xmm5	; [704-272] = q0 bytes
+	punpcklbw xmm5, xmm2	; xmm5 = q0 lo
+	punpcklbw xmm1, xmm2	; xmm1 = p0 lo
+
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm1
+	pabsw	xmm7, xmm7	; SSSE3 instruction
+	movdqa	 [esp+640-560], xmm7	; [640-560] = |q0 - p0|
+	punpcklbw xmm0, xmm2
+	movdqa	 [esp+688-272], xmm4	; [688-272] = p0 bytes
+	movdqa	xmm4, [esp+720-272]
+	movdqa	 [esp+640-480], xmm0	; [640-480] = p1 lo
+
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm0
+
+	movdqa	xmm0, [esp+640-512]
+	pabsw	xmm7, xmm7
+	punpcklbw xmm4, xmm2
+	pcmpgtw	xmm0, xmm7	; beta > |p0 - p1|
+	movdqa	 [esp+640-384], xmm4	; [640-384] = q1 lo
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm4
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+656-272], xmm6	; [656-272] = p2 bytes
+	punpcklbw xmm6, xmm2
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-48], xmm2	; [640-48] = zero vector
+	movdqa	 [esp+640-368], xmm6	; [640-368] = p2 lo
+	movdqa	 [esp+640-144], xmm1	; [640-144] = p0 lo
+	movdqa	 [esp+640-400], xmm5	; [640-400] = q0 lo
+	pcmpgtw	xmm4, xmm7	; beta > |q0 - q1|
+	pand	xmm0, xmm4
+	movdqa	xmm4, [esp+640-320]
+	pcmpgtw	xmm4, [esp+640-560]	; alpha > |q0 - p0|
+	pand	xmm0, xmm4	; xmm0 = filter-enable mask, pixels 0..7 (live until the end)
+
+	mov	ebx, 2
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, [esp+640-320]
+	psraw	xmm4, 2
+	pshufd	xmm7, xmm7, 0	; xmm7 = constant 2
+	paddw	xmm4, xmm7
+	movdqa	 [esp+640-576], xmm4	; [640-576] = (alpha >> 2) + 2
+	pcmpgtw	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4	; [640-560] = mask (alpha >> 2) + 2 > |q0 - p0|
+
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+640-624], xmm7	; [640-624] = constant 2
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm6
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7	; beta > |p0 - p2|
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-544], xmm4	; [640-544] = strong-filter mask, p side
+	movdqa	xmm4, [esp+640-512]
+	movdqa	xmm7, xmm5
+	psubw	xmm7, [esp+640-416]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7	; beta > |q0 - q2|
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4	; [640-560] = strong-filter mask, q side
+
+	movdqa	xmm4, [esp+640-544]
+	pandn	xmm4, xmm6
+	movdqa	 [esp+640-16], xmm4	; [640-16] = original p2 where the strong filter is off
+	mov	ebx, 4
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, xmm3
+	punpcklbw xmm4, xmm2
+	psllw	xmm4, 1	; 2*p3
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6	; + 3*p2
+	paddw	xmm4, [esp+640-480]	; + p1
+
+	movdqa	xmm6, [esp+640-560]
+	pshufd	xmm7, xmm7, 0
+	paddw	xmm4, xmm1	; + p0
+	movdqa	 [esp+640-592], xmm7	; [640-592] = constant 4
+	paddw	xmm4, xmm5	; + q0
+	paddw	xmm4, xmm7	; + 4 (shifted right by 3 further down)
+	movdqa	xmm7, [esp+640-416]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-80], xmm6	; [640-80] = original q2 where the strong filter is off
+	movdqa	xmm6, [esp+752-272]
+	punpcklbw xmm6, xmm2
+	psllw	xmm6, 1	; 2*q3
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7	; + 3*q2
+	paddw	xmm6, [esp+640-384]	; + q1
+
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm6, xmm5
+	paddw	xmm6, xmm1
+	paddw	xmm6, [esp+640-592]
+	psraw	xmm6, 3
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-112], xmm6	; [640-112] = strong q2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3, masked
+	movdqa	xmm6, [esp+640-544]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-336], xmm6	; [640-336] = original p1 where the strong filter is off
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-528], xmm6
+	movdqa	xmm6, [esp+640-368]
+	paddw	xmm6, xmm7
+	movdqa	xmm7, xmm1
+	psraw	xmm4, 3
+	pand	xmm4, [esp+640-544]	; xmm4 = strong p2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3, masked
+	paddw	xmm7, xmm5
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+
+	paddw	xmm5, xmm1	; xmm5 = p0 + q0
+	psraw	xmm6, 2
+	pand	xmm7, xmm6
+
+	movdqa	xmm6, [esp+640-384]
+	movdqa	 [esp+640-64], xmm7	; [640-64] = strong p1 = (p2 + p1 + p0 + q0 + 2) >> 2, masked
+	movdqa	xmm7, [esp+640-560]
+	pandn	xmm7, xmm6
+	movdqa	 [esp+640-304], xmm7	; [640-304] = original q1 where the strong filter is off
+	movdqa	xmm7, [esp+640-560]
+	movdqa	 [esp+640-528], xmm7
+	movdqa	xmm7, [esp+640-416]
+	paddw	xmm7, xmm6
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2
+	pand	xmm5, xmm7
+	movdqa	 [esp+640-32], xmm5	; [640-32] = strong q1 = (q2 + q1 + q0 + p0 + 2) >> 2, masked
+
+	movdqa	xmm5, [esp+640-544]
+	movdqa	 [esp+640-528], xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	xmm7, xmm5
+	paddw	xmm7, xmm5	; 2*p1
+	movdqa	xmm5, xmm1
+	paddw	xmm5, xmm6	; p0 + q1
+	paddw	xmm6, [esp+640-592]	; xmm6 = q1 + 4
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2
+	pandn	xmm5, xmm7
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm7, xmm1
+	paddw	xmm7, [esp+640-400]	; p1 + p0 + q0
+	movdqa	xmm1, [esp+640-544]
+	movdqa	 [esp+640-352], xmm5	; [640-352] = weak p0 = (2*p1 + p0 + q1 + 2) >> 2 where the strong filter is off
+	movdqa	xmm5, [esp+640-368]
+	psllw	xmm7, 1
+	paddw	xmm7, xmm6
+	paddw	xmm5, xmm7
+
+	movdqa	xmm7, [esp+640-400]
+	psraw	xmm5, 3
+	pand	xmm1, xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	 [esp+640-96], xmm1	; [640-96] = strong p0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3, masked
+	movdqa	xmm1, [esp+640-560]
+	movdqa	 [esp+640-528], xmm1
+	movdqa	xmm1, [esp+640-384]
+	movdqa	xmm6, xmm1
+	paddw	xmm6, xmm1	; 2*q1
+	paddw	xmm1, [esp+640-400]
+	paddw	xmm1, [esp+640-144]	; q1 + q0 + p0
+	paddw	xmm7, xmm5	; q0 + p1
+	paddw	xmm5, [esp+640-592]	; p1 + 4
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+	psraw	xmm6, 2
+	psllw	xmm1, 1
+	paddw	xmm1, xmm5
+
+	movdqa	xmm5, [esp+656-272]
+	pandn	xmm7, xmm6	; weak q0 = (2*q1 + q0 + p1 + 2) >> 2 where the strong filter is off
+	movdqa	xmm6, [esp+640-416]
+	paddw	xmm6, xmm1
+	movdqa	xmm1, [esp+640-560]
+	psraw	xmm6, 3
+	pand	xmm1, xmm6
+
+	movdqa	xmm6, [esp+704-272]
+	movdqa	 [esp+640-128], xmm1	; [640-128] = strong q0 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3, masked
+	movdqa	xmm1, [esp+672-272]
+	punpckhbw xmm1, xmm2
+	movdqa	 [esp+640-448], xmm1	; from here on: pixels 8..15. [640-448] = p1 hi
+	movdqa	xmm1, [esp+688-272]
+	punpckhbw xmm1, xmm2
+	punpckhbw xmm6, xmm2
+	movdqa	 [esp+640-288], xmm7	; [640-288] = weak q0 (pixels 0..7)
+	punpckhbw xmm5, xmm2	; xmm5 = p2 hi
+	movdqa	 [esp+640-496], xmm1	; [640-496] = p0 hi
+	movdqa	 [esp+640-432], xmm6	; [640-432] = q0 hi
+
+	movdqa	xmm7, [esp+720-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-464], xmm7	; [640-464] = q1 hi
+
+	movdqa	xmm7, [esp+736-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-528], xmm7	; [640-528] = q2 hi
+
+	movdqa	xmm7, xmm6
+
+	psubw	xmm6, [esp+640-464]
+	psubw	xmm7, xmm1
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-560], xmm7	; |q0 - p0| (hi)
+	por	xmm4, [esp+640-16]	; xmm4 = p2 candidate, pixels 0..7 (strong result or original)
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm1
+	psubw	xmm7, [esp+640-448]
+
+	movdqa	xmm1, [esp+640-512]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm1, xmm7	; beta > |p0 - p1|
+	movdqa	xmm7, [esp+640-512]
+	pcmpgtw	xmm7, xmm6	; beta > |q0 - q1|
+	movdqa	xmm6, [esp+640-320]
+	pand	xmm1, xmm7
+	movdqa	xmm7, [esp+640-560]
+	pcmpgtw	xmm6, xmm7	; alpha > |q0 - p0|
+	pand	xmm1, xmm6	; xmm1 = filter-enable mask, pixels 8..15 (live until the end)
+
+	movdqa	xmm6, [esp+640-576]
+	pcmpgtw	xmm6, xmm7
+
+	movdqa	xmm7, [esp+640-496]
+	punpckhbw xmm3, xmm2	; xmm3 = p3 hi
+	movdqa	 [esp+640-560], xmm6
+	movdqa	xmm6, [esp+640-512]
+	psubw	xmm7, xmm5
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm6, xmm7
+
+	pand	xmm6, [esp+640-560]
+	movdqa	xmm7, [esp+640-432]
+	psubw	xmm7, [esp+640-528]
+
+	psllw	xmm3, 1
+	movdqa	 [esp+640-544], xmm6	; [640-544] = strong-filter mask, p side (hi)
+	movdqa	xmm6, [esp+640-512]
+
+	movdqa	xmm2, [esp+640-544]	; xmm2 is no longer the zero vector from here on
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, [esp+640-448]
+	paddw	xmm3, [esp+640-496]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm6, xmm7
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-560], xmm6	; [640-560] = strong-filter mask, q side (hi)
+
+	movdqa	xmm6, xmm0	; select: enabled ? filtered p2 : original p2 (pixels 0..7)
+	pand	xmm6, xmm4
+	movdqa	xmm4, xmm0
+	pandn	xmm4, [esp+640-368]
+	por	xmm6, xmm4
+	movdqa	xmm4, [esp+640-432]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-592]
+	psraw	xmm3, 3
+	pand	xmm3, xmm2
+	pandn	xmm2, xmm5
+	por	xmm3, xmm2
+	movdqa	xmm7, xmm1	; same selection for pixels 8..15
+	pand	xmm7, xmm3
+	movdqa	xmm3, [esp+640-64]
+	por	xmm3, [esp+640-336]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm5
+	por	xmm7, xmm2
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-480]
+	por	xmm2, xmm3
+	packuswb xmm6, xmm7	; new p2 row
+	movdqa	 [esp+640-336], xmm2	; [640-336] = new p1, pixels 0..7
+	movdqa	 [esp+656-272], xmm6	; [656-272] = new p2 row
+	movdqa	xmm6, [esp+640-544]
+	movdqa	xmm2, xmm5
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm3, xmm1
+	movdqa	xmm7, [esp+640-496]
+	paddw	xmm7, xmm4
+	paddw	xmm2, xmm7
+	paddw	xmm2, [esp+640-624]
+	movdqa	xmm7, [esp+640-544]
+	psraw	xmm2, 2
+	pand	xmm6, xmm2
+	movdqa	xmm2, [esp+640-448]
+	pandn	xmm7, xmm2
+	por	xmm6, xmm7
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm1
+	pandn	xmm6, xmm2
+	paddw	xmm2, [esp+640-496]
+	paddw	xmm2, xmm4
+	por	xmm3, xmm6
+	movdqa	xmm6, [esp+640-336]
+	packuswb xmm6, xmm3
+	psllw	xmm2, 1
+	movdqa	 [esp+672-272], xmm6	; [672-272] = new p1 row
+	movdqa	xmm6, [esp+640-96]
+	por	xmm6, [esp+640-352]	; strong or weak p0, pixels 0..7
+
+	movdqa	xmm3, xmm0
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm0
+	pandn	xmm6, [esp+640-144]
+	por	xmm3, xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-352], xmm3	; [640-352] = new p0, pixels 0..7
+	movdqa	xmm3, [esp+640-464]
+	paddw	xmm3, [esp+640-592]
+	paddw	xmm2, xmm3
+	movdqa	xmm3, [esp+640-448]
+	paddw	xmm5, xmm2
+	movdqa	xmm2, [esp+640-496]
+	psraw	xmm5, 3
+	pand	xmm6, xmm5
+	movdqa	xmm5, [esp+640-464]
+	paddw	xmm2, xmm5
+	paddw	xmm5, [esp+640-432]
+	movdqa	xmm4, xmm3
+	paddw	xmm4, xmm3
+	paddw	xmm4, xmm2
+	paddw	xmm4, [esp+640-624]
+	movdqa	xmm2, [esp+640-544]
+	paddw	xmm3, [esp+640-592]
+	psraw	xmm4, 2
+	pandn	xmm2, xmm4
+	por	xmm6, xmm2
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-496]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm6
+	por	xmm7, xmm2
+	movdqa	xmm2, [esp+640-352]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+688-272], xmm2	; [688-272] = new p0 row
+	movdqa	xmm2, [esp+640-128]
+	por	xmm2, [esp+640-288]	; strong or weak q0, pixels 0..7
+
+	movdqa	xmm4, xmm0
+	pand	xmm4, xmm2
+	paddw	xmm5, xmm6
+	movdqa	xmm2, xmm0
+	pandn	xmm2, [esp+640-400]
+	por	xmm4, xmm2
+	movdqa	xmm2, [esp+640-528]
+	psllw	xmm5, 1
+	paddw	xmm5, xmm3
+	movdqa	xmm3, [esp+640-560]
+	paddw	xmm2, xmm5
+	psraw	xmm2, 3
+	movdqa	 [esp+640-288], xmm4	; [640-288] = new q0, pixels 0..7
+	movdqa	xmm4, [esp+640-560]
+	pand	xmm4, xmm2
+	movdqa	xmm2, [esp+640-464]
+	movdqa	xmm5, xmm2
+	paddw	xmm5, xmm2
+	movdqa	xmm2, [esp+640-432]
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm7, xmm1
+	paddw	xmm5, xmm2
+	paddw	xmm5, [esp+640-624]
+	movdqa	xmm6, [esp+640-560]
+	psraw	xmm5, 2
+	pandn	xmm3, xmm5
+	por	xmm4, xmm3
+	movdqa	xmm3, [esp+640-32]
+	por	xmm3, [esp+640-304]	; strong or original q1, pixels 0..7
+	pand	xmm7, xmm4
+	movdqa	xmm4, [esp+640-432]
+	movdqa	xmm5, [esp+640-464]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm4
+	paddw	xmm4, [esp+640-496]
+	por	xmm7, xmm2
+	movdqa	xmm2, [esp+640-288]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+704-272], xmm2	; [704-272] = new q0 row
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-384]
+	por	xmm2, xmm3
+	movdqa	 [esp+640-304], xmm2	; [640-304] = new q1, pixels 0..7
+	movdqa	xmm2, [esp+640-528]
+	movdqa	xmm3, xmm2
+	paddw	xmm3, [esp+640-464]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-624]
+	psraw	xmm3, 2
+	pand	xmm6, xmm3
+	movdqa	xmm3, [esp+640-560]
+	movdqa	xmm4, xmm3
+	pandn	xmm4, xmm5
+	por	xmm6, xmm4
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-304]
+	movdqa	xmm4, xmm1
+	pandn	xmm4, xmm5
+	por	xmm7, xmm4
+
+	movdqa	xmm4, xmm0
+	pandn	xmm0, [esp+640-416]
+	packuswb xmm6, xmm7	; xmm6 = new q1 row
+	movdqa	xmm7, [esp+640-112]
+	por	xmm7, [esp+640-80]	; strong or original q2, pixels 0..7
+	pand	xmm4, xmm7
+	por	xmm4, xmm0	; xmm4 = new q2, pixels 0..7
+	movdqa	xmm0, [esp+752-272]
+	punpckhbw xmm0, [esp+640-48]	; q3 hi, unpacked against the saved zero vector
+	psllw	xmm0, 1
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm5
+	paddw	xmm0, [esp+640-432]
+	paddw	xmm0, [esp+640-496]
+	paddw	xmm0, [esp+640-592]
+	psraw	xmm0, 3
+	pand	xmm0, xmm3
+	movdqa	xmm7, xmm1
+	pandn	xmm3, xmm2
+	por	xmm0, xmm3
+	pand	xmm7, xmm0
+
+	movdqa	xmm0, [esp+656-272]
+	movdqa	 [edx], xmm0	; store new p2 row (edx = pPix - 3*iStride)
+
+	movdqa	xmm0, [esp+672-272]
+
+	mov	edx, dword [esp+640-596]	; edx = &q1
+	movdqa	 [esi], xmm0	; store new p1 row
+	movdqa	xmm0, [esp+688-272]
+	movdqa	 [edi], xmm0	; store new p0 row
+	movdqa	xmm0, [esp+704-272]
+
+	pop	edi
+	pandn	xmm1, xmm2
+	movdqa	 [eax], xmm0	; store new q0 row
+	por	xmm7, xmm1
+	pop	esi
+	packuswb xmm4, xmm7	; xmm4 = new q2 row
+	movdqa	 [edx], xmm6	; store new q1 row
+	movdqa	 [ecx], xmm4	; store new q2 row (ecx = pPix + 2*iStride)
+	pop	ebx
+	mov	esp, ebp
+	pop	ebp
+	ret
+    
+%endif
+
+
+
+;********************************************************************************
+;
+;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
+;
+;********************************************************************************
+; Reads 8 bytes from each of 16 consecutive rows at pPixY and writes eight 16-byte vectors to pDst (movdqa: pDst must be 16-byte aligned).
+WELS_EXTERN  DeblockLumaTransposeH2V_sse2
+; r0..r7, LOAD_3_PARA, SIGN_EXTENTION and SSE2_TransTwo8x8B are defined outside this file section; r7 is used below as the stack pointer.
+ALIGN  16
+
+DeblockLumaTransposeH2V_sse2:
+    push     r3 
+    push     r4  
+    push     r5
+
+%assign   push_num   3 
+    LOAD_3_PARA    ; presumably r0 = pPixY, r1 = iStride, r2 = pDst (macro defined elsewhere)
+
+    SIGN_EXTENTION   r1, r1d
+
+    mov      r5,    r7 ; remember the incoming stack pointer; restored before the pops
+    mov      r3,    r7
+    and      r3,    0Fh
+    sub      r7,    r3 ; round the stack pointer down to a 16-byte boundary
+    sub      r7,    10h ; reserve one aligned 16-byte scratch slot at [r7]
+
+    lea      r3,    [r0 + r1 * 8] ; r3 = row 8
+    lea      r4,    [r1 * 3] ; r4 = 3*iStride
+
+    movq    xmm0,  [r0] ; each xmmN = 8 bytes of row N | 8 bytes of row N+8
+    movq    xmm7,  [r3]
+    punpcklqdq   xmm0,  xmm7
+    movq    xmm1,  [r0 + r1]
+    movq    xmm7,  [r3 + r1]
+    punpcklqdq   xmm1,  xmm7
+    movq    xmm2,  [r0 + r1*2]
+    movq    xmm7,  [r3 + r1*2]
+    punpcklqdq   xmm2,  xmm7
+    movq    xmm3,  [r0 + r4]
+    movq    xmm7,  [r3 + r4]
+    punpcklqdq   xmm3,  xmm7
+
+    lea     r0,   [r0 + r1 * 4] ; advance both row pointers by 4 rows
+    lea     r3,   [r3 + r1 * 4]
+    movq    xmm4,  [r0]
+    movq    xmm7,  [r3]
+    punpcklqdq   xmm4,  xmm7
+    movq    xmm5,  [r0 + r1]
+    movq    xmm7,  [r3 + r1]
+    punpcklqdq   xmm5,  xmm7
+    movq    xmm6,  [r0 + r1*2]
+    movq    xmm7,  [r3 + r1*2]
+    punpcklqdq   xmm6,  xmm7
+
+    movdqa  [r7],   xmm0 ; spill xmm0: all 8 registers are live and one temp is needed for rows 7/15
+    movq    xmm7,  [r0 + r4]
+    movq    xmm0,  [r3 + r4]
+    punpcklqdq   xmm7,  xmm0
+    movdqa  xmm0,   [r7]
+
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7] ; presumably transposes the two 8x8 byte blocks, using [r7] as scratch
+    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1 (1-based macro arguments, i.e. xmm4, xmm2, xmm3, xmm7, xmm5, xmm1, xmm6, xmm0 - the store order below)
+   
+    movdqa  [r2],    xmm4
+    movdqa  [r2 + 10h],  xmm2
+    movdqa  [r2 + 20h],  xmm3
+    movdqa  [r2 + 30h],  xmm7
+    movdqa  [r2 + 40h],  xmm5
+    movdqa  [r2 + 50h],  xmm1
+    movdqa  [r2 + 60h],  xmm6
+    movdqa  [r2 + 70h],  xmm0
+
+    mov     r7,   r5 ; restore the original stack pointer
+    pop     r5
+    pop     r4
+    pop     r3
+    ret
+
+
+;*******************************************************************************************
+;
+;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
+;
+;*******************************************************************************************
+; Reads eight 16-byte vectors from pSrc (movdqa: pSrc must be 16-byte aligned) and writes 8 bytes to each of 16 consecutive rows at pPixY.
+WELS_EXTERN   DeblockLumaTransposeV2H_sse2
+; r0..r7, LOAD_3_PARA, SIGN_EXTENTION and SSE2_TransTwo8x8B are defined outside this file section; r7 is used below as the stack pointer.
+ALIGN  16
+
+DeblockLumaTransposeV2H_sse2:
+    push     r3
+    push     r4 
+
+%assign  push_num 2
+    LOAD_3_PARA ; presumably r0 = pPixY, r1 = iStride, r2 = pSrc (macro defined elsewhere)
+
+    SIGN_EXTENTION   r1, r1d 
+
+    mov      r4,    r7 ; remember the incoming stack pointer; restored before the pops
+    mov      r3,    r7 
+    and      r3,    0Fh
+    sub      r7,    r3 ; round the stack pointer down to a 16-byte boundary
+    sub      r7,    10h ; reserve one aligned 16-byte scratch slot at [r7]
+
+    movdqa   xmm0,   [r2]
+    movdqa   xmm1,   [r2 + 10h]
+    movdqa   xmm2,   [r2 + 20h]
+    movdqa   xmm3,   [r2 + 30h]
+    movdqa   xmm4,   [r2 + 40h]
+    movdqa   xmm5,   [r2 + 50h]
+    movdqa   xmm6,   [r2 + 60h]
+    movdqa   xmm7,   [r2 + 70h]
+
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7] ; presumably transposes the two 8x8 byte blocks, using [r7] as scratch
+    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1 (1-based macro arguments, i.e. xmm4, xmm2, xmm3, xmm7, xmm5, xmm1, xmm6, xmm0 - the store order below)
+
+    lea      r2,   [r1 * 3] ; r2 is reused as 3*iStride; pSrc is no longer needed
+
+    movq     [r0],  xmm4 ; low 8 bytes of each vector -> rows 0..3
+    movq     [r0 + r1],  xmm2
+    movq     [r0 + r1*2],  xmm3
+    movq     [r0 + r2],  xmm7
+
+    lea      r0,   [r0 + r1*4]
+    movq     [r0],  xmm5 ; rows 4..7
+    movq     [r0 + r1],  xmm1
+    movq     [r0 + r1*2],  xmm6
+    movq     [r0 + r2],  xmm0
+
+    psrldq    xmm4,   8 ; move the high 8 bytes of every vector down
+    psrldq    xmm2,   8
+    psrldq    xmm3,   8
+    psrldq    xmm7,   8
+    psrldq    xmm5,   8
+    psrldq    xmm1,   8
+    psrldq    xmm6,   8
+    psrldq    xmm0,   8
+
+    lea       r0,  [r0 + r1*4]
+    movq     [r0],  xmm4 ; rows 8..11
+    movq     [r0 + r1],  xmm2
+    movq     [r0 + r1*2],  xmm3
+    movq     [r0 + r2],  xmm7
+
+    lea      r0,   [r0 + r1*4]
+    movq     [r0],  xmm5 ; rows 12..15
+    movq     [r0 + r1],  xmm1
+    movq     [r0 + r1*2],  xmm6
+    movq     [r0 + r2],  xmm0
+
+
+    mov      r7,   r4 ; restore the original stack pointer
+    pop      r4
+    pop      r3
+    ret
+
--- /dev/null
+++ b/codec/common/expand_picture.asm
@@ -1,0 +1,740 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  expand_picture.asm
+;*
+;*  Abstract
+;*      mmxext/sse for expand_frame
+;*
+;*  History
+;*      09/25/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+;SECTION .rodata pData align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+;%define PADDING_SIZE_ASM 	32 	; PADDING_LENGTH
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+
+
+SECTION .text
+
+WELS_EXTERN ExpandPictureLuma_sse2
+WELS_EXTERN ExpandPictureChromaAlign_sse2	; for chroma alignment
+WELS_EXTERN ExpandPictureChromaUnalign_sse2	; for chroma unalignment
+
+;;;;;;;expanding result;;;;;;;
+
+;aaaa|attttttttttttttttb|bbbb
+;aaaa|attttttttttttttttb|bbbb
+;aaaa|attttttttttttttttb|bbbb
+;aaaa|attttttttttttttttb|bbbb
+;----------------------------
+;aaaa|attttttttttttttttb|bbbb
+;llll|l                r|rrrr
+;llll|l                r|rrrr
+;llll|l                r|rrrr
+;llll|l                r|rrrr
+;llll|l                r|rrrr
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;----------------------------
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+
+%macro mov_line_8x4_mmx		3	; dst, stride, mm?  - store the 8-byte register to 4 consecutive rows
+	movq [%1], %3			; row 0
+	movq [%1+%2], %3		; row 1
+	lea %1, [%1+2*%2]		; lea leaves the flags untouched
+	movq [%1], %3			; row 2
+	movq [%1+%2], %3		; row 3
+	lea %1, [%1+2*%2]		; dst ends 4 rows on, ready for the next invocation
+%endmacro
+
+%macro mov_line_end8x4_mmx		3	; dst, stride, mm?  - as mov_line_8x4_mmx, but dst stops on the last row written
+	movq [%1], %3			; row 0
+	movq [%1+%2], %3		; row 1
+	lea %1, [%1+2*%2]
+	movq [%1], %3			; row 2
+	movq [%1+%2], %3		; row 3
+	lea %1, [%1+%2]			; advance one row only: dst now points at row 3
+%endmacro
+
+%macro mov_line_16x4_sse2	4	; dst, stride, xmm?, u/a  - store 16 bytes to 4 rows; 4th arg picks movdqu or movdqa
+	movdq%4 [%1], %3 		; top(bottom)_0
+	movdq%4 [%1+%2], %3		; top(bottom)_1
+	lea %1, [%1+2*%2]		; lea leaves the flags untouched
+	movdq%4 [%1], %3 		; top(bottom)_2
+	movdq%4 [%1+%2], %3		; top(bottom)_3
+	lea %1, [%1+2*%2]		; dst ends 4 rows on, ready for the next invocation
+%endmacro
+
+%macro mov_line_end16x4_sse2	4	; dst, stride, xmm?, u/a  - as mov_line_16x4_sse2, but dst stops on the last row written
+	movdq%4 [%1], %3 		; top(bottom)_0
+	movdq%4 [%1+%2], %3		; top(bottom)_1
+	lea %1, [%1+2*%2]
+	movdq%4 [%1], %3 		; top(bottom)_2
+	movdq%4 [%1+%2], %3		; top(bottom)_3
+	lea %1, [%1+%2]			; advance one row only: dst now points at row 3
+%endmacro
+
+%macro mov_line_32x4_sse2	3	; dst, stride, xmm?  - store the register twice per row (32 bytes) to 4 rows, aligned
+	movdqa [%1], %3 		; top(bottom)_0, bytes 0-15
+	movdqa [%1+16], %3 		; top(bottom)_0, bytes 16-31
+	movdqa [%1+%2], %3		; top(bottom)_1, bytes 0-15
+	movdqa [%1+%2+16], %3		; top(bottom)_1, bytes 16-31
+	lea %1, [%1+2*%2]
+	movdqa [%1], %3 		; top(bottom)_2, bytes 0-15
+	movdqa [%1+16], %3 		; top(bottom)_2, bytes 16-31
+	movdqa [%1+%2], %3		; top(bottom)_3, bytes 0-15
+	movdqa [%1+%2+16], %3		; top(bottom)_3, bytes 16-31
+	lea %1, [%1+2*%2]		; dst ends 4 rows on, ready for the next invocation
+%endmacro
+
+%macro mov_line_end32x4_sse2	3	; dst, stride, xmm?  - as mov_line_32x4_sse2, but dst stops on the last row written
+	movdqa [%1], %3 		; top(bottom)_0, bytes 0-15
+	movdqa [%1+16], %3 		; top(bottom)_0, bytes 16-31
+	movdqa [%1+%2], %3		; top(bottom)_1, bytes 0-15
+	movdqa [%1+%2+16], %3		; top(bottom)_1, bytes 16-31
+	lea %1, [%1+2*%2]
+	movdqa [%1], %3 		; top(bottom)_2, bytes 0-15
+	movdqa [%1+16], %3 		; top(bottom)_2, bytes 16-31
+	movdqa [%1+%2], %3		; top(bottom)_3, bytes 0-15
+	movdqa [%1+%2+16], %3		; top(bottom)_3, bytes 16-31
+	lea %1, [%1+%2]			; advance one row only: dst now points at row 3
+%endmacro
+
+%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]
+    ; in:  r2 = picture width in bytes, r1 = -stride (sign is flipped after every column, see below)
+    ;      r0 = first picture row (top src),   r5 = r0 - stride (top-border row nearest the picture)
+    ;      r3 = last picture row (bottom src), r4 = r3 + iPaddingSize*stride (outermost bottom-border row)
+    ; Fills iPaddingSize rows above and below the picture, one 16-byte column per loop iteration.
+%if %1 == 32		; for luma
+	sar r2, 04h 	; r2 = number of 16-byte columns
+.top_bottom_loops:	; NOTE(review): body runs before the count is tested, so width must be >= 16 - confirm callers
+	; top
+	movdqa xmm0, [r0]		; first line of picture pData
+	mov_line_16x4_sse2 r5, r1, xmm0, a	; 8 invocations x 4 rows = 32 border rows
+	mov_line_16x4_sse2 r5, r1, xmm0, a
+	mov_line_16x4_sse2 r5, r1, xmm0, a
+	mov_line_16x4_sse2 r5, r1, xmm0, a
+	mov_line_16x4_sse2 r5, r1, xmm0, a
+	mov_line_16x4_sse2 r5, r1, xmm0, a
+	mov_line_16x4_sse2 r5, r1, xmm0, a
+	mov_line_end16x4_sse2 r5, r1, xmm0, a	; leaves r5 on the last row written
+
+	; bottom
+	movdqa xmm1, [r3] 		; last line of picture pData
+	mov_line_16x4_sse2 r4, r1, xmm1, a	; 8 invocations x 4 rows = 32 border rows
+	mov_line_16x4_sse2 r4, r1, xmm1, a
+	mov_line_16x4_sse2 r4, r1, xmm1, a
+	mov_line_16x4_sse2 r4, r1, xmm1, a
+	mov_line_16x4_sse2 r4, r1, xmm1, a
+	mov_line_16x4_sse2 r4, r1, xmm1, a
+	mov_line_16x4_sse2 r4, r1, xmm1, a
+	mov_line_end16x4_sse2 r4, r1, xmm1, a	; leaves r4 on the last row written
+
+	lea r0, [r0+16]		; top pSrc
+	lea r5, [r5+16]		; top dst
+	lea r3, [r3+16]		; bottom pSrc
+	lea r4, [r4+16]		; bottom dst
+	neg r1 			; dst pointers sit at the far end of the column just filled, so the next column is walked back the other way
+
+	dec r2
+	jnz near .top_bottom_loops
+%elif %1 == 16	; for chroma
+	mov r6, r2	; keep the full width for the 8-byte remainder test below
+	sar r2, 04h 	; (width / 16) pixels
+.top_bottom_loops:	; NOTE(review): body runs before the count is tested, so width must be >= 16 - confirm callers
+	; top
+	movdqa xmm0, [r0]		; first line of picture pData
+	mov_line_16x4_sse2 r5, r1, xmm0, a	; 4 invocations x 4 rows = 16 border rows
+	mov_line_16x4_sse2 r5, r1, xmm0, a
+	mov_line_16x4_sse2 r5, r1, xmm0, a
+	mov_line_end16x4_sse2 r5, r1, xmm0, a	; leaves r5 on the last row written
+
+	; bottom
+	movdqa xmm1, [r3] 		; last line of picture pData
+	mov_line_16x4_sse2 r4, r1, xmm1, a	; 4 invocations x 4 rows = 16 border rows
+	mov_line_16x4_sse2 r4, r1, xmm1, a
+	mov_line_16x4_sse2 r4, r1, xmm1, a
+	mov_line_end16x4_sse2 r4, r1, xmm1, a	; leaves r4 on the last row written
+
+	lea r0, [r0+16]		; top pSrc
+	lea r5, [r5+16]		; top dst
+	lea r3, [r3+16]		; bottom pSrc
+	lea r4, [r4+16]		; bottom dst
+	neg r1 			; dst pointers sit at the far end of the column just filled, so the next column is walked back the other way
+
+	dec r2
+	jnz near .top_bottom_loops
+
+	; for remaining 8 bytes
+	and r6, 0fh		; width mod 16: non-zero means one more 8-byte column
+	test r6, r6
+	jz near .to_be_continued	; no left to exit here
+
+	; top
+	movq mm0, [r0]		; remained 8 byte
+	mov_line_8x4_mmx r5, r1, mm0	; dst, stride, mm?
+	mov_line_8x4_mmx r5, r1, mm0	; dst, stride, mm?
+	mov_line_8x4_mmx r5, r1, mm0	; dst, stride, mm?
+	mov_line_end8x4_mmx r5, r1, mm0	; dst, stride, mm?
+	; bottom
+	movq mm1, [r3]
+	mov_line_8x4_mmx r4, r1, mm1	; dst, stride, mm?
+	mov_line_8x4_mmx r4, r1, mm1	; dst, stride, mm?
+	mov_line_8x4_mmx r4, r1, mm1	; dst, stride, mm?
+	mov_line_end8x4_mmx r4, r1, mm1	; dst, stride, mm?
+	WELSEMMS		; presumably clears the MMX state after the mm0/mm1 use (defined in asm_inc.asm)
+
+.to_be_continued:
+%endif
+%endmacro
+
+%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
+    ;r6 [height]  - one loop iteration per picture row; the body runs before the count is tested
+    ;r0 [pSrc+0]  r5[pSrc-iPaddingSize] r1[stride]
+    ;r3 [pSrc+(w-1)] r4[pSrc+w]         r2 is clobbered as a temporary
+
+%if %1 == 32		; for luma
+.left_right_loops:
+	; left
+	movzx r2d, byte [r0]		; pixel pData for left border
+	SSE2_Copy16Times	xmm0, r2d				; dst, src - presumably replicates the byte into all 16 lanes
+	movdqa [r5], xmm0		; 32 border bytes per row, aligned stores
+	movdqa [r5+16], xmm0
+
+	; right
+	movzx r2d, byte [r3]		; pixel pData for right border
+	SSE2_Copy16Times	xmm1, r2d				; dst, src - presumably replicates the byte into all 16 lanes
+	movdqa [r4], xmm1
+	movdqa [r4+16], xmm1
+
+	lea r0, [r0+r1]		; left pSrc
+	lea r5, [r5+r1]		; left dst
+	lea r3, [r3+r1]		; right pSrc
+	lea r4, [r4+r1]		; right dst
+
+	dec r6
+	jnz near .left_right_loops
+%elif %1 == 16	; for chroma
+.left_right_loops:
+	; left
+	movzx r2d, byte [r0]		; pixel pData for left border
+	SSE2_Copy16Times	xmm0, r2d				; dst, src - presumably replicates the byte into all 16 lanes
+	movdqa [r5], xmm0		; 16 border bytes per row; the left side always uses an aligned store
+
+	; right
+	movzx r2d, byte [r3]		; pixel pData for right border
+	SSE2_Copy16Times	xmm1, r2d				; dst, src - presumably replicates the byte into all 16 lanes
+	movdq%2 [r4], xmm1								; might not be aligned 16 bytes in case chroma planes
+
+	lea r0, [r0+r1]		; left pSrc
+	lea r5, [r5+r1]		; left dst
+	lea r3, [r3+r1]		; right pSrc
+	lea r4, [r4+r1]		; right dst
+
+	dec r6
+	jnz near .left_right_loops
+%endif
+%endmacro
+
+%macro exp_cross_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
+	; fill values: top-left xmm3, top-right xmm4, bottom-left xmm5, bottom-right xmm6
+	; each corner is an iPaddingSize x iPaddingSize block, filled starting at the row in its dst register
+    ;r3:TL ,r4:TR,r5:BL,r6:BR r1:-stride
+%if %1 == 32		; luma
+	; TL
+	mov_line_32x4_sse2	r3, r1, xmm3	; 8 invocations x 4 rows = 32 rows of 32 bytes
+	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
+	mov_line_end32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
+
+	; TR
+	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
+	mov_line_end32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
+
+	; BL
+	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
+	mov_line_end32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
+
+	; BR
+	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
+	mov_line_end32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
+%elif %1 == 16	; chroma
+	; TL
+	mov_line_16x4_sse2	r3, r1, xmm3, a	; 4 invocations x 4 rows = 16 rows of 16 bytes; left side always aligned
+	mov_line_16x4_sse2	r3, r1, xmm3, a	; dst, stride, xmm?
+	mov_line_16x4_sse2	r3, r1, xmm3, a	; dst, stride, xmm?
+	mov_line_end16x4_sse2	r3, r1, xmm3, a	; dst, stride, xmm?
+
+	; TR
+	mov_line_16x4_sse2	r4, r1, xmm4, %2	; right side uses the caller's u/a choice
+	mov_line_16x4_sse2	r4, r1, xmm4, %2	; dst, stride, xmm?
+	mov_line_16x4_sse2	r4, r1, xmm4, %2	; dst, stride, xmm?
+	mov_line_end16x4_sse2 r4, r1, xmm4, %2	; dst, stride, xmm?
+
+	; BL
+	mov_line_16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
+	mov_line_16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
+	mov_line_16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
+	mov_line_end16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
+
+	; BR
+	mov_line_16x4_sse2	r6, r1, xmm6, %2	; right side uses the caller's u/a choice
+	mov_line_16x4_sse2	r6, r1, xmm6, %2	; dst, stride, xmm?
+	mov_line_16x4_sse2	r6, r1, xmm6, %2	; dst, stride, xmm?
+	mov_line_end16x4_sse2	r6, r1, xmm6, %2	; dst, stride, xmm?
+%endif
+%endmacro
+
+ALIGN 16
+;***********************************************************************----------------
+; void ExpandPictureLuma_sse2(	uint8_t *pDst,
+;									const int32_t iStride,
+;									const int32_t iWidth,
+;									const int32_t iHeight	);
+;***********************************************************************----------------
+ExpandPictureLuma_sse2:
+    ; Replicates the picture's edge pixels into a 32-pixel border on all four sides.
+    push r4
+    push r5
+    push r6
+
+    %assign push_num 3
+    LOAD_4_PARA                 ; presumably r0 = pDst, r1 = iStride, r2 = iWidth, r3 = iHeight (see asm_inc.asm)
+    
+    SIGN_EXTENTION r1, r1d
+    SIGN_EXTENTION r2, r2d
+    SIGN_EXTENTION r3, r3d
+
+    ;corner fill value, top-left: xmm3
+    
+    movzx r6d,byte[r0]
+    SSE2_Copy16Times xmm3,r6d         ;xmm3: pSrc[0] (presumably replicated into all 16 bytes)
+
+    neg r1
+    lea r5,[r0+r1]              ;r5 = top dst = pSrc - stride (border row nearest the picture)
+    neg r1
+
+    push r3                     ;keep height on the stack; popped into r6 before the left/right pass
+
+
+    dec r3                      ;h-1
+    imul r3,r1                  ;(h-1)*stride
+    lea  r3,[r0+r3]             ;r3 = bottom src = pSrc + (h-1)*stride (last picture row)
+    
+    mov r6,r1                    ;r6 = stride
+    sal r6,05h                   ;r6 = 32*stride
+    lea r4,[r3+r6]               ;r4 = bottom dst = outermost bottom-border row
+    
+    ;corner fill values: bottom-left in xmm5, bottom-right in xmm6
+    
+    movzx r6d,byte [r3]             ;bottom-left
+    SSE2_Copy16Times xmm5,r6d
+    
+    lea r6,[r3+r2-1]
+    movzx r6d,byte [r6]
+    SSE2_Copy16Times xmm6,r6d ;bottom-right
+    
+    neg r1  ;r1 = -stride
+    
+    push r0                     ;the macro below advances r0, consumes r2 and flips the sign of r1
+    push r1
+    push r2
+
+    exp_top_bottom_sse2 32
+
+	; for both left and right border
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    
+    pop r2
+    pop r1
+    pop r0
+
+    lea r5,[r0-32]                          ;left border dst = pSrc - 32
+    
+    lea r3,[r0+r2-1]                        ;right border src
+    lea r4,[r3+1]                           ;right border dst
+
+    ;corner fill value, top-right: xmm4
+     movzx r6d,byte [r3]                         ;top-right
+     SSE2_Copy16Times xmm4,r6d
+    
+    neg r1   ;r1 = stride
+
+
+    pop r6  ;  r6 = height
+
+
+
+    push r0
+    push r1
+    push r2
+    push r6
+    
+    exp_left_right_sse2  32,a
+
+    pop r6
+    pop r2
+    pop r1
+    pop r0
+
+	; for cross border [top-left, top-right, bottom-left, bottom-right]
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ; xmm3..xmm6 already hold the four corner fill values; set up the four dst pointers and fill
+    
+    neg r1  ;r1 = -stride
+    lea r3,[r0-32]
+    lea r3,[r3+r1]    ;last line of top-left border
+    
+    lea r4,[r0+r2]    ;psrc +width
+    lea r4,[r4+r1]    ;psrc +width -stride
+    
+    
+    neg r1  ;r1 = stride
+    add r6,32         ;height + 32
+    imul r6,r1        ;r6 = (height+32)*stride
+    
+    lea r5,[r3+r6]    ;last line of bottom-left border
+    lea r6,[r4+r6]    ;last line of bottom-right border
+    
+    neg r1 ; r1 = -stride
+
+    ; fill the four 32x32 corner blocks
+    exp_cross_sse2 32,a
+
+    LOAD_4_PARA_POP
+    
+    pop r6
+    pop r5
+    pop r4
+    
+    %assign push_num 0
+
+
+	ret
+
+ALIGN 16
+;***********************************************************************----------------
+; void ExpandPictureChromaAlign_sse2(	uint8_t *pDst,
+;										const int32_t iStride,
+;										const int32_t iWidth,
+;										const int32_t iHeight	);
+;***********************************************************************----------------
+ExpandPictureChromaAlign_sse2:
+	; Replicates the edge pixels into a 16-pixel border on all four sides, using aligned stores throughout.
+    push r4
+    push r5
+    push r6
+
+    %assign push_num 3
+    LOAD_4_PARA                 ; presumably r0 = pDst, r1 = iStride, r2 = iWidth, r3 = iHeight (see asm_inc.asm)
+
+    SIGN_EXTENTION r1,r1d
+    SIGN_EXTENTION r2,r2d
+    SIGN_EXTENTION r3,r3d
+
+    ;corner fill value, top-left: xmm3
+    
+    movzx r6d,byte [r0]
+    SSE2_Copy16Times xmm3,r6d         ;xmm3: pSrc[0] (presumably replicated into all 16 bytes)
+
+    neg r1
+    lea r5,[r0+r1]              ;r5 = top dst = pSrc - stride (border row nearest the picture)
+    neg r1
+
+    push r3                     ;keep height on the stack; popped into r6 before the left/right pass
+
+
+    dec r3                      ;h-1
+    imul r3,r1                  ;(h-1)*stride
+    lea  r3,[r0+r3]             ;r3 = bottom src = pSrc + (h-1)*stride (last picture row)
+    
+    mov r6,r1                    ;r6 = stride
+    sal r6,04h                   ;r6 = 16*stride
+    lea r4,[r3+r6]               ;r4 = bottom dst = outermost bottom-border row
+    
+    ;corner fill values: bottom-left in xmm5, bottom-right in xmm6
+    
+    movzx r6d,byte [r3]             ;bottom-left
+    SSE2_Copy16Times xmm5,r6d
+    
+    lea r6,[r3+r2-1]
+    movzx r6d,byte [r6]
+    SSE2_Copy16Times xmm6,r6d ;bottom-right
+    
+    neg r1  ;r1 = -stride
+    
+    push r0                     ;the macro below advances r0, consumes r2 and flips the sign of r1
+    push r1 
+    push r2
+
+    exp_top_bottom_sse2 16
+
+	; for both left and right border
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    
+    pop r2
+    pop r1
+    pop r0
+
+    lea r5,[r0-16]                          ;left border dst = pSrc - 16
+    
+    lea r3,[r0+r2-1]                        ;right border src 
+    lea r4,[r3+1]                           ;right border dst
+
+    ;corner fill value, top-right: xmm4
+    movzx r6d,byte [r3]                         ;top-right
+    SSE2_Copy16Times xmm4,r6d
+    
+    neg r1   ;r1 = stride
+
+
+    pop r6  ;  r6 = height
+
+
+
+    push r0
+    push r1 
+    push r2
+	push r6
+    exp_left_right_sse2 16,a
+
+    pop r6
+    pop r2
+    pop r1
+    pop r0
+
+	; for cross border [top-left, top-right, bottom-left, bottom-right]
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ; xmm3..xmm6 already hold the four corner fill values; set up the four dst pointers and fill
+    
+    neg r1  ;r1 = -stride
+    lea r3,[r0-16]
+    lea r3,[r3+r1]    ;last line of top-left border
+    
+    lea r4,[r0+r2]    ;psrc +width
+    lea r4,[r4+r1]    ;psrc +width -stride  
+    
+    
+    neg r1  ;r1 = stride
+    add r6,16         ;height + 16
+    imul r6,r1        ;r6 = (height+16)*stride
+    
+    lea r5,[r3+r6]    ;last line of bottom-left border
+    lea r6,[r4+r6]    ;last line of bottom-right border
+    
+    neg r1 ; r1 = -stride
+
+    ; fill the four 16x16 corner blocks
+    exp_cross_sse2 16,a
+
+    LOAD_4_PARA_POP
+    
+    pop r6
+    pop r5
+    pop r4
+    
+    %assign push_num 0
+
+
+	ret
+
+ALIGN 16
+;***********************************************************************----------------
+; void ExpandPictureChromaUnalign_sse2(	uint8_t *pDst,
+;										const int32_t iStride,
+;										const int32_t iWidth,
+;										const int32_t iHeight	);
+;***********************************************************************----------------
+ExpandPictureChromaUnalign_sse2:	; 16-pixel border on all four sides; right-hand stores use movdqu
+	push r4
+    push r5
+    push r6
+
+    %assign push_num 3
+    LOAD_4_PARA                 ; presumably r0 = pDst, r1 = iStride, r2 = iWidth, r3 = iHeight (see asm_inc.asm)
+
+    SIGN_EXTENTION r1,r1d
+    SIGN_EXTENTION r2,r2d
+    SIGN_EXTENTION r3,r3d
+
+    ;corner fill value, top-left: xmm3
+    
+    movzx r6d,byte [r0]
+    SSE2_Copy16Times xmm3,r6d         ;xmm3: pSrc[0] (presumably replicated into all 16 bytes)
+
+    neg r1
+    lea r5,[r0+r1]              ;r5 = top dst = pSrc - stride (border row nearest the picture)
+    neg r1
+
+    push r3                     ;keep height on the stack; popped into r6 before the left/right pass
+
+
+    dec r3                      ;h-1
+    imul r3,r1                  ;(h-1)*stride
+    lea  r3,[r0+r3]             ;r3 = bottom src = pSrc + (h-1)*stride (last picture row)
+    
+    mov r6,r1                    ;r6 = stride
+    sal r6,04h                   ;r6 = 16*stride
+    lea r4,[r3+r6]               ;r4 = bottom dst = outermost bottom-border row
+    
+    ;corner fill values: bottom-left in xmm5, bottom-right in xmm6
+    
+    movzx r6d,byte [r3]             ;bottom-left
+    SSE2_Copy16Times xmm5,r6d
+    
+    lea r6,[r3+r2-1]
+    movzx r6d,byte [r6]
+    SSE2_Copy16Times xmm6,r6d ;bottom-right
+    
+    neg r1  ;r1 = -stride
+    
+    push r0                     ;the macro below advances r0, consumes r2 and flips the sign of r1
+    push r1 
+    push r2
+
+    exp_top_bottom_sse2 16
+
+	; for both left and right border
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    
+    pop r2
+    pop r1
+    pop r0
+
+    lea r5,[r0-16]                          ;left border dst = pSrc - 16
+    
+    lea r3,[r0+r2-1]                        ;right border src 
+    lea r4,[r3+1]                           ;right border dst
+
+    ;corner fill value, top-right: xmm4
+    movzx r6d,byte [r3]                         ;top-right
+    SSE2_Copy16Times xmm4,r6d
+    
+    neg r1   ;r1 = stride
+
+
+    pop r6  ;  r6 = height
+
+
+
+    push r0
+    push r1 
+    push r2
+	push r6
+    exp_left_right_sse2 16,u
+
+    pop r6
+    pop r2
+    pop r1
+    pop r0
+
+	; for cross border [top-left, top-right, bottom-left, bottom-right]
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ; xmm3..xmm6 already hold the four corner fill values; set up the four dst pointers and fill
+    
+    neg r1  ;r1 = -stride
+    lea r3,[r0-16]
+    lea r3,[r3+r1]    ;last line of top-left border
+    
+    lea r4,[r0+r2]    ;psrc +width
+    lea r4,[r4+r1]    ;psrc +width -stride  
+    
+    
+    neg r1  ;r1 = stride
+    add r6,16         ;height + 16
+    imul r6,r1        ;r6 = (height+16)*stride
+    
+    lea r5,[r3+r6]    ;last line of bottom-left border
+    lea r6,[r4+r6]    ;last line of bottom-right border
+    
+    neg r1 ; r1 = -stride
+
+    ; fill the four 16x16 corner blocks (right-hand corners with unaligned stores)
+    exp_cross_sse2 16,u
+
+    LOAD_4_PARA_POP
+    
+    pop r6
+    pop r5
+    pop r4
+    
+    %assign push_num 0
+
+
+	ret
+    
\ No newline at end of file
--- /dev/null
+++ b/codec/common/mb_copy.asm
@@ -1,0 +1,701 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mb_copy.asm
+;*
+;*  Abstract
+;*      mb_copy and mb_copy1
+;*
+;*  History
+;*      15/09/2009 Created
+;*		12/28/2009 Modified with larger throughput
+;*		12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
+;*				   WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
+;*
+;*
+;*********************************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN WelsCopy16x16_sse2
+WELS_EXTERN WelsCopy16x16NotAligned_sse2
+WELS_EXTERN WelsCopy8x8_mmx
+WELS_EXTERN WelsCopy16x8NotAligned_sse2	;
+WELS_EXTERN WelsCopy8x16_mmx		;
+WELS_EXTERN UpdateMbMv_sse2		;
+
+;***********************************************************************
+; void WelsCopy16x16_sse2(	uint8_t* Dst,
+;							int32_t  iStrideD,
+;							uint8_t* Src,
+;							int32_t  iStrideS )
+;***********************************************************************
+ALIGN 16
+WelsCopy16x16_sse2:	; copies 16 rows of 16 bytes; every Src and Dst row must be 16-byte aligned (movdqa)
+	push r4
+	push r5
+	%assign  push_num 2
+    LOAD_4_PARA
+    SIGN_EXTENTION r1, r1d	; the strides are int32_t: extend them before they are used in address
+    SIGN_EXTENTION r3, r3d	; arithmetic, as the other routines in this patch do for their int arguments
+	lea r4, [r1+2*r1]	; r4 = 3 * r1 (stride used with the Dst pointer r0)
+	lea r5, [r3+2*r3]	; r5 = 3 * r3 (stride used with the Src pointer r2)
+
+	movdqa xmm0, [r2]	; rows 0-7: load
+	movdqa xmm1, [r2+r3]
+	movdqa xmm2, [r2+2*r3]
+	movdqa xmm3, [r2+r5]
+	lea r2, [r2+4*r3]
+	movdqa xmm4, [r2]
+	movdqa xmm5, [r2+r3]
+	movdqa xmm6, [r2+2*r3]
+	movdqa xmm7, [r2+r5]
+	lea r2, [r2+4*r3]
+
+	movdqa [r0], xmm0	; rows 0-7: store
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm2
+	movdqa [r0+r4], xmm3
+	lea r0, [r0+4*r1]
+	movdqa [r0], xmm4
+	movdqa [r0+r1], xmm5
+	movdqa [r0+2*r1], xmm6
+	movdqa [r0+r4], xmm7
+	lea r0, [r0+4*r1]
+
+	movdqa xmm0, [r2]	; rows 8-15: load
+	movdqa xmm1, [r2+r3]
+	movdqa xmm2, [r2+2*r3]
+	movdqa xmm3, [r2+r5]
+	lea r2, [r2+4*r3]
+	movdqa xmm4, [r2]
+	movdqa xmm5, [r2+r3]
+	movdqa xmm6, [r2+2*r3]
+	movdqa xmm7, [r2+r5]
+
+	movdqa [r0], xmm0	; rows 8-15: store
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm2
+	movdqa [r0+r4], xmm3
+	lea r0, [r0+4*r1]
+	movdqa [r0], xmm4
+	movdqa [r0+r1], xmm5
+	movdqa [r0+2*r1], xmm6
+	movdqa [r0+r4], xmm7
+	LOAD_4_PARA_POP
+	pop r5
+	pop r4	
+	ret
+
+;***********************************************************************
+; void WelsCopy16x16NotAligned_sse2(	uint8_t* Dst,
+;							int32_t  iStrideD,
+;							uint8_t* Src,
+;							int32_t  iStrideS )
+;***********************************************************************
+ALIGN 16
+; dst can be align with 16 bytes, but not sure about pSrc, 12/29/2011
+WelsCopy16x16NotAligned_sse2:
+	; Copies 16 rows of 16 bytes from Src to Dst.
+	; Src rows are read with movdqu, so Src may have any alignment.
+	; Dst rows are written with movdqa, so every Dst row must be 16-byte aligned.
+	; r4 and r5 hold 3 * stride for Dst and Src and are saved/restored here.
+	; The int32_t strides are extended right after they are loaded, because
+	; they are used in address arithmetic; the other routines in this patch
+	; treat their int arguments the same way.
+	push r4
+	push r5
+	%assign  push_num 2
+    LOAD_4_PARA
+    SIGN_EXTENTION r1, r1d
+    SIGN_EXTENTION r3, r3d
+
+	lea r4, [r1+2*r1]	; r4 = 3 * r1 (stride used with the Dst pointer r0)
+	lea r5, [r3+2*r3]	; r5 = 3 * r3 (stride used with the Src pointer r2)
+
+	movdqu xmm0, [r2]	; rows 0-7: unaligned load
+	movdqu xmm1, [r2+r3]
+	movdqu xmm2, [r2+2*r3]
+	movdqu xmm3, [r2+r5]
+	lea r2, [r2+4*r3]
+	movdqu xmm4, [r2]
+	movdqu xmm5, [r2+r3]
+	movdqu xmm6, [r2+2*r3]
+	movdqu xmm7, [r2+r5]
+	lea r2, [r2+4*r3]
+
+	movdqa [r0], xmm0	; rows 0-7: aligned store
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm2
+	movdqa [r0+r4], xmm3
+	lea r0, [r0+4*r1]
+	movdqa [r0], xmm4
+	movdqa [r0+r1], xmm5
+	movdqa [r0+2*r1], xmm6
+	movdqa [r0+r4], xmm7
+	lea r0, [r0+4*r1]
+
+	movdqu xmm0, [r2]	; rows 8-15: unaligned load
+	movdqu xmm1, [r2+r3]
+	movdqu xmm2, [r2+2*r3]
+	movdqu xmm3, [r2+r5]
+	lea r2, [r2+4*r3]
+	movdqu xmm4, [r2]
+	movdqu xmm5, [r2+r3]
+	movdqu xmm6, [r2+2*r3]
+	movdqu xmm7, [r2+r5]
+
+	movdqa [r0], xmm0	; rows 8-15: aligned store
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm2
+	movdqa [r0+r4], xmm3
+	lea r0, [r0+4*r1]
+	movdqa [r0], xmm4
+	movdqa [r0+r1], xmm5
+	movdqa [r0+2*r1], xmm6
+	movdqa [r0+r4], xmm7
+	LOAD_4_PARA_POP
+	pop r5
+	pop r4
+	ret
+
+; , 12/29/2011
+;***********************************************************************
+; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
+;							int32_t  iStrideD,
+;							uint8_t* Src,
+;							int32_t  iStrideS )
+;***********************************************************************
+ALIGN 16
+WelsCopy16x8NotAligned_sse2:
+	; Copies 8 rows of 16 bytes from Src to Dst.
+	; Src rows are read with movdqu, so Src may have any alignment.
+	; Dst rows are written with movdqa, so every Dst row must be 16-byte aligned.
+	; r4 and r5 hold 3 * stride for Dst and Src and are saved/restored here.
+	; The int32_t strides are extended right after they are loaded, because
+	; they are used in address arithmetic; the other routines in this patch
+	; treat their int arguments the same way.
+	push r4
+	push r5
+	%assign  push_num 2
+    LOAD_4_PARA
+    SIGN_EXTENTION r1, r1d
+    SIGN_EXTENTION r3, r3d
+
+	lea r4, [r1+2*r1]	; r4 = 3 * r1 (stride used with the Dst pointer r0)
+	lea r5, [r3+2*r3]	; r5 = 3 * r3 (stride used with the Src pointer r2)
+
+	movdqu xmm0, [r2]	; rows 0-7: unaligned load
+	movdqu xmm1, [r2+r3]
+	movdqu xmm2, [r2+2*r3]
+	movdqu xmm3, [r2+r5]
+	lea r2, [r2+4*r3]
+	movdqu xmm4, [r2]
+	movdqu xmm5, [r2+r3]
+	movdqu xmm6, [r2+2*r3]
+	movdqu xmm7, [r2+r5]
+
+	movdqa [r0], xmm0	; rows 0-7: aligned store
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm2
+	movdqa [r0+r4], xmm3
+	lea r0, [r0+4*r1]
+	movdqa [r0], xmm4
+	movdqa [r0+r1], xmm5
+	movdqa [r0+2*r1], xmm6
+	movdqa [r0+r4], xmm7
+	LOAD_4_PARA_POP
+	pop r5
+	pop r4	
+	ret
+
+
+;***********************************************************************
+; void WelsCopy8x16_mmx(uint8_t* Dst,
+;                       int32_t  iStrideD,
+;                       uint8_t* Src,
+;                       int32_t  iStrideS )
+;***********************************************************************
+ALIGN 16
+WelsCopy8x16_mmx:
+	; Copies 16 rows of 8 bytes from Src to Dst through mm0-mm7, 8 rows at a time.
+	; movq has no alignment requirement, so Src and Dst may have any alignment.
+	; The int32_t strides are extended right after they are loaded, because
+	; they are used in address arithmetic; the other routines in this patch
+	; treat their int arguments the same way.
+	%assign  push_num 0
+    LOAD_4_PARA
+    SIGN_EXTENTION r1, r1d
+    SIGN_EXTENTION r3, r3d
+
+	movq mm0, [r2]		; rows 0-7: load
+	movq mm1, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm2, [r2]
+	movq mm3, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm4, [r2]
+	movq mm5, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm6, [r2]
+	movq mm7, [r2+r3]
+	lea r2, [r2+2*r3]
+	
+	movq [r0], mm0		; rows 0-7: store
+	movq [r0+r1], mm1
+	lea r0, [r0+2*r1]
+	movq [r0], mm2
+	movq [r0+r1], mm3
+	lea r0, [r0+2*r1]
+	movq [r0], mm4
+	movq [r0+r1], mm5
+	lea r0, [r0+2*r1]
+	movq [r0], mm6
+	movq [r0+r1], mm7
+	lea r0, [r0+2*r1]
+
+	movq mm0, [r2]		; rows 8-15: load
+	movq mm1, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm2, [r2]
+	movq mm3, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm4, [r2]
+	movq mm5, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm6, [r2]
+	movq mm7, [r2+r3]
+	
+	movq [r0], mm0		; rows 8-15: store
+	movq [r0+r1], mm1
+	lea r0, [r0+2*r1]
+	movq [r0], mm2
+	movq [r0+r1], mm3
+	lea r0, [r0+2*r1]
+	movq [r0], mm4
+	movq [r0+r1], mm5
+	lea r0, [r0+2*r1]
+	movq [r0], mm6
+	movq [r0+r1], mm7
+
+	WELSEMMS		; presumably clears the MMX state after the mm register use (defined in asm_inc.asm)
+	LOAD_4_PARA_POP
+	ret
+
+;***********************************************************************
+; void WelsCopy8x8_mmx(  uint8_t* Dst,
+;                        int32_t  iStrideD,
+;                        uint8_t* Src,
+;                        int32_t  iStrideS )
+;***********************************************************************
+ALIGN 16
+WelsCopy8x8_mmx:
+	; Copies 8 rows of 8 bytes from Src to Dst through mm0-mm7, prefetching
+	; the Src rows two and three ahead while the current pair is loaded.
+	; The int32_t strides are extended right after they are loaded, because
+	; they are used in address arithmetic; the other routines in this patch
+	; treat their int arguments the same way.
+	push r4
+	%assign  push_num 1
+    LOAD_4_PARA
+    SIGN_EXTENTION r1, r1d
+    SIGN_EXTENTION r3, r3d
+	lea r4, [r3+2*r3]	; r4 = 3 * r3 (stride used with the Src pointer r2)
+
+	; to prefetch next loop
+	prefetchnta [r2+2*r3]
+	prefetchnta [r2+r4]
+	movq mm0, [r2]
+	movq mm1, [r2+r3]
+	lea r2, [r2+2*r3]
+	; to prefetch next loop
+	prefetchnta [r2+2*r3]
+	prefetchnta [r2+r4]
+	movq mm2, [r2]
+	movq mm3, [r2+r3]
+	lea r2, [r2+2*r3]
+	; to prefetch next loop
+	prefetchnta [r2+2*r3]
+	prefetchnta [r2+r4]
+	movq mm4, [r2]
+	movq mm5, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm6, [r2]
+	movq mm7, [r2+r3]
+
+	movq [r0], mm0		; rows 0-7: store
+	movq [r0+r1], mm1
+	lea r0, [r0+2*r1]
+	movq [r0], mm2
+	movq [r0+r1], mm3
+	lea r0, [r0+2*r1]
+	movq [r0], mm4
+	movq [r0+r1], mm5
+	lea r0, [r0+2*r1]
+	movq [r0], mm6
+	movq [r0+r1], mm7
+
+	WELSEMMS		; presumably clears the MMX state after the mm register use (defined in asm_inc.asm)
+	; undo the parameter-loading macro first, then restore r4,
+	; mirroring the push order at the top
+	LOAD_4_PARA_POP
+	pop r4
+	ret
+
+; (dunhuang@cisco), 12/21/2011
+;***********************************************************************
+; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
+;***********************************************************************
+ALIGN 16
+UpdateMbMv_sse2:
+    ; Fills the 64 bytes at pMvBuffer with 16 copies of the 32-bit value sMv.
+    %assign  push_num 0
+    LOAD_2_PARA
+    ; After the load r0 addresses the buffer and r1d carries the packed vector.
+    ; The stores are aligned (movdqa), so the buffer must be 16-byte aligned.
+    ; The four stores are independent of each other.
+    movd    xmm0, r1d
+    pshufd  xmm1, xmm0, 00h
+    movdqa  [r0 + 30h], xmm1
+    movdqa  [r0 + 20h], xmm1
+    movdqa  [r0 + 10h], xmm1
+    movdqa  [r0], xmm1
+    ret
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+; (none are defined at this point)
+;*******************************************************************************
+; Local Data (Read Only)
+;*******************************************************************************
+; (none: the section directive below is commented out and no data follows it)
+;SECTION .rodata data align=16
+
+;*******************************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;*******************************************************************************
+; (no constants follow; the ALIGN below pads whichever section is current)
+ALIGN 16
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+; Routines implemented below. WELS_EXTERN is defined outside this chunk; presumably it makes each symbol global (confirm).
+WELS_EXTERN PixelAvgWidthEq4_mmx
+WELS_EXTERN PixelAvgWidthEq8_mmx
+WELS_EXTERN PixelAvgWidthEq16_sse2
+
+WELS_EXTERN McCopyWidthEq4_mmx
+WELS_EXTERN McCopyWidthEq8_mmx
+WELS_EXTERN McCopyWidthEq16_sse2
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t PixelAvgWidthEq4_mmx( uint8_t *pDst,  int iDstStride,
+;                           uint8_t *pSrcA, int iSrcAStride,
+;                           uint8_t *pSrcB, int iSrcBStride,
+;                           int iHeight );
+;*******************************************************************************
+PixelAvgWidthEq4_mmx:
+    ; Writes pavgb(pSrcA, pSrcB) for a 4-byte-wide block, one row per loop pass.
+    %assign  push_num 0
+    LOAD_7_PARA
+    ; Used below as: r0=pDst r1=iDstStride r2=pSrcA r3=iSrcAStride r4=pSrcB r5=iSrcBStride r6=iHeight
+%ifndef X86_32
+    movsx   r6, r6d                 ; sign-extend the 32-bit int arguments so
+    movsx   r5, r5d                 ; they are valid 64-bit offsets / counters
+    movsx   r3, r3d
+    movsx   r1, r1d
+%endif
+
+ALIGN 4
+.avg_row:
+    movd    mm0, [r4]               ; four bytes from pSrcB
+    pavgb   mm0, [r2]               ; rounded average with pSrcA (64-bit memory operand)
+    movd    [r0], mm0               ; only the low four result bytes are stored
+
+    lea     r4, [r4+r5]             ; step all three pointers down one row
+    lea     r2, [r2+r3]
+    lea     r0, [r0+r1]
+    dec     r6                      ; the loop body runs once before this first test
+    jne     .avg_row
+
+    WELSEMMS
+    LOAD_7_PARA_POP
+    ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t PixelAvgWidthEq8_mmx( uint8_t *pDst,  int iDstStride,
+;                           uint8_t *pSrcA, int iSrcAStride,
+;                           uint8_t *pSrcB, int iSrcBStride,
+;                           int iHeight );
+;*******************************************************************************
+PixelAvgWidthEq8_mmx:
+    ; Rounded byte-wise average (pavgb) of two 8-pixel-wide blocks:
+    ;     pDst[y][0..7] = pavgb( pSrcA[y][0..7], pSrcB[y][0..7] )
+    ; Two rows are produced per loop pass and the counter is only tested
+    ; for zero after subtracting 2, so iHeight has to be a non-zero even
+    ; number for the loop to stop where intended.
+    ;
+    ; Register roles after LOAD_7_PARA (following the parameter order):
+    ;     r0 = pDst        r1 = iDstStride
+    ;     r2 = pSrcA       r3 = iSrcAStride
+    ;     r4 = pSrcB       r5 = iSrcBStride
+    ;     r6 = iHeight
+    ;
+    ; (The former hand-written 32-bit prologue used edi/eax/esi/ecx/ebp/edx/ebx.)
+
+    %assign  push_num 0
+    LOAD_7_PARA
+
+%ifndef X86_32
+    movsx   r6, r6d                 ; sign-extend the 32-bit int arguments
+    movsx   r5, r5d
+    movsx   r3, r3d
+    movsx   r1, r1d
+%endif
+
+ALIGN 4
+.avg_two_rows:
+    movq    mm0, [r2]               ; first row of the pair
+    pavgb   mm0, [r4]
+    movq    [r0], mm0
+    movq    mm0, [r2+r3]            ; second row of the pair
+    pavgb   mm0, [r4+r5]
+    movq    [r0+r1], mm0
+
+    lea     r0, [r0+2*r1]           ; advance all three pointers by two rows
+    lea     r2, [r2+2*r3]
+    lea     r4, [r4+2*r5]
+
+    sub     r6, 2
+    jnz     .avg_two_rows
+
+    WELSEMMS
+    LOAD_7_PARA_POP
+    ret
+
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t PixelAvgWidthEq16_sse2( uint8_t *pDst,  int iDstStride,
+;                          uint8_t *pSrcA, int iSrcAStride,
+;                          uint8_t *pSrcB, int iSrcBStride,
+;                          int iHeight );
+;*******************************************************************************
+PixelAvgWidthEq16_sse2:
+    ; 16-pixel-wide pavgb of pSrcA and pSrcB; four rows per loop pass, so iHeight must be a non-zero multiple of 4.
+    %assign  push_num 0
+    LOAD_7_PARA
+%ifndef X86_32
+    movsx   r6, r6d                 ; sign-extend the 32-bit int arguments
+    movsx   r5, r5d
+    movsx   r3, r3d
+    movsx   r1, r1d
+%endif
+ALIGN 4
+.avg_four_rows:
+    ; row 0 -- both operands are fetched with movdqu because the memory form of pavgb needs a 16-byte aligned address
+    movdqu  xmm0, [r2]
+    movdqu  xmm1, [r4]
+    pavgb   xmm0, xmm1
+    movdqu  [r0], xmm0
+    ; row 1
+    movdqu  xmm0, [r2+r3]
+    movdqu  xmm1, [r4+r5]
+    pavgb   xmm0, xmm1
+    movdqu  [r0+r1], xmm0
+    ; row 2
+    movdqu  xmm0, [r2+2*r3]
+    movdqu  xmm1, [r4+2*r5]
+    pavgb   xmm0, xmm1
+    movdqu  [r0+2*r1], xmm0
+
+    lea     r0, [r0+2*r1]           ; move the three bases to row 2
+    lea     r2, [r2+2*r3]
+    lea     r4, [r4+2*r5]
+    ; row 3 (base + one stride after the advance above)
+    movdqu  xmm0, [r2+r3]
+    movdqu  xmm1, [r4+r5]
+    pavgb   xmm0, xmm1
+    movdqu  [r0+r1], xmm0
+
+    lea     r0, [r0+2*r1]           ; bases now point at row 4, the next group
+    lea     r2, [r2+2*r3]
+    lea     r4, [r4+2*r5]
+
+    sub     r6, 4
+    jne     .avg_four_rows
+
+    WELSEMMS
+    LOAD_7_PARA_POP
+    ret
+
+ALIGN 16
+;*******************************************************************************
+;  void_t McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
+;                          uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+McCopyWidthEq4_mmx:
+    ; Copies a 4-byte-wide block from pSrc to pDst, one row per loop pass.
+    ; Each row travels through the general-purpose register r5d; no MMX
+    ; register is touched here despite the _mmx suffix.
+    ;
+    ; r5 is saved by hand around the body, and push_num is set to 1 before
+    ; LOAD_5_PARA -- presumably so the macro accounts for that extra stack
+    ; slot when it locates the arguments (macro is outside this chunk).
+    ;
+    ; Register roles, following the parameter order:
+    ;     r0 = pSrc   r1 = iSrcStride   r2 = pDst   r3 = iDstStride   r4 = iHeight
+
+    push    r5
+    %assign  push_num 1
+    LOAD_5_PARA
+
+%ifndef X86_32
+    movsx   r4, r4d                 ; sign-extend the 32-bit int arguments
+    movsx   r3, r3d
+    movsx   r1, r1d
+%endif
+
+ALIGN 4
+.copy_row:
+    mov     r5d, [r0]               ; fetch 4 source bytes
+    add     r0, r1
+    mov     [r2], r5d               ; and write them to the destination row
+    add     r2, r3
+    dec     r4
+    jnz     .copy_row
+
+    WELSEMMS
+    LOAD_5_PARA_POP
+    pop     r5
+    ret
+
+ALIGN 16
+;*******************************************************************************
+;   void_t McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
+;                           uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+McCopyWidthEq8_mmx:
+    ; Copies an 8-byte-wide block from pSrc to pDst through mm0, one row per pass.
+    ;
+    ; Register roles, following the parameter order:
+    ;     r0 = pSrc     r1 = iSrcStride
+    ;     r2 = pDst     r3 = iDstStride
+    ;     r4 = iHeight  (a row is copied before the counter is first tested)
+    ;
+
+    %assign  push_num 0
+    LOAD_5_PARA
+
+%ifndef X86_32
+    movsx   r4, r4d                 ; sign-extend the 32-bit int arguments
+    movsx   r3, r3d
+    movsx   r1, r1d
+%endif
+
+ALIGN 4
+.copy_row:
+    movq    mm0, [r0]               ; fetch 8 source bytes
+    add     r0, r1
+    movq    [r2], mm0               ; and write them to the destination row
+    add     r2, r3
+    dec     r4
+    jnz     .copy_row
+
+    WELSEMMS
+    LOAD_5_PARA_POP
+    ret
+
+
+ALIGN 16
+;*******************************************************************************
+;   void_t McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+; SSE_READ_UNA xmm, addr : load 16 bytes as two 8-byte halves, no alignment requirement
+%macro SSE_READ_UNA 2
+    movq    %1, [%2]
+    movhps  %1, [%2+8]
+%endmacro
+
+; SSE_WRITE_UNA addr, xmm : store 16 bytes as two 8-byte halves, no alignment requirement
+%macro SSE_WRITE_UNA 2
+    movq    [%1], %2
+    movhps  [%1+8], %2
+%endmacro
+McCopyWidthEq16_sse2:
+    ; Copies a 16-byte-wide block from pSrc to pDst, two rows per loop pass.
+    ; Both rows are read before either is written. The counter is only
+    ; tested for zero after subtracting 2, so iHeight has to be a non-zero
+    ; even number for the loop to stop where intended.
+    ;
+    ; Register roles, following the parameter order:
+    ;     r0 = pSrc     r1 = iSrcStride
+    ;     r2 = pDst     r3 = iDstStride     r4 = iHeight
+
+    %assign  push_num 0
+    LOAD_5_PARA
+%ifndef X86_32
+    movsx   r4, r4d                 ; sign-extend the 32-bit int arguments
+    movsx   r3, r3d
+    movsx   r1, r1d
+%endif
+ALIGN 4
+.copy_two_rows:
+    SSE_READ_UNA    xmm0, r0
+    SSE_READ_UNA    xmm1, r0+r1
+    SSE_WRITE_UNA   r2, xmm0
+    SSE_WRITE_UNA   r2+r3, xmm1
+
+    lea     r0, [r0+r1*2]           ; advance both pointers by two rows
+    lea     r2, [r2+r3*2]
+    sub     r4, 2
+    jnz     .copy_two_rows
+
+    LOAD_5_PARA_POP
+    ret
--- /dev/null
+++ b/codec/common/mc_chroma.asm
@@ -1,0 +1,345 @@
+;*!
+;* \copy
+;*     Copyright (c)  2004-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mc_chroma.asm
+;*
+;*  Abstract
+;*      mmx motion compensation for chroma
+;*
+;*  History
+;*      10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Rounding constants
+;***********************************************************************
+; 32 in every 16-bit lane; the chroma filters below add it right before their ">> 6".
+ALIGN 16
+h264_d0x20_sse2:
+	dw 32,32,32,32,32,32,32,32
+ALIGN 16
+h264_d0x20_mmx:
+	dw 32,32,32,32
+; (8 words for the XMM users, 4 words for the MMX user; both labels are 16-byte aligned)
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( uint8_t *src,
+;							int32_t iSrcStride,
+;							uint8_t *pDst,
+;							int32_t iDstStride,
+;							uint8_t *pABCD,
+;							int32_t iHeigh );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq4_mmx
+McChromaWidthEq4_mmx:
+	; 4-pixel-wide weighted 2x2 filter. With A,B,C,D = pABCD[0..3]:
+	;   pDst[x] = (A*cur[x] + B*cur[x+1] + C*below[x] + D*below[x+1] + 32) >> 6
+	; where cur is the current source row and below the row iSrcStride after it.
+	; Used as: r0=src r1=iSrcStride r2=pDst r3=iDstStride r4=pABCD r5=iHeigh
+	%assign  push_num 0
+	LOAD_6_PARA 
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r5, r5d
+%endif
+	; Expand the four weight bytes into one vector of four words each:
+	;   mm3 = A,A,A,A   mm5 = B,B,B,B   mm4 = C,C,C,C   mm6 = D,D,D,D
+	; mm7 supplies the zero bytes for the byte -> word unpacking.
+	movd mm3, [r4];	[eax]
+	WELS_Zero mm7
+	punpcklbw mm3, mm3
+	movq      mm4, mm3
+	punpcklwd mm3, mm3
+	punpckhwd mm4, mm4
+
+	movq	  mm5, mm3
+	punpcklbw mm3, mm7
+	punpckhbw mm5, mm7
+
+	movq	  mm6, mm4
+	punpcklbw mm4, mm7
+	punpckhbw mm6, mm7
+
+	; State carried from one loop pass to the next:
+	;   mm0 = current row, pixels x..x+3   (words)
+	;   mm1 = current row, pixels x+1..x+4 (words)
+	;   r4  = address of the row below (pABCD is no longer needed in r4)
+	; so every source row is loaded and unpacked only once.
+
+	lea r4, [r0 + r1] ;lea ebx, [esi + eax]
+	movd mm0, [r0]
+	movd mm1, [r0+1]
+	punpcklbw mm0, mm7
+	punpcklbw mm1, mm7
+.xloop:
+	; A*cur[x] + B*cur[x+1]
+	pmullw mm0, mm3
+	pmullw mm1, mm5
+	paddw  mm0, mm1
+	; + C*below[x]; the unpacked pixels are kept in mm2 for the next pass
+	movd  mm1, [r4]
+	punpcklbw mm1, mm7
+	movq mm2, mm1
+	pmullw mm1, mm4
+	paddw mm0, mm1
+	; + D*below[x+1]; mm7 is borrowed to keep the unpacked pixels, which end up in mm1
+	movd mm1, [r4+1]
+	punpcklbw mm1, mm7
+	movq mm7, mm1
+	pmullw mm1,mm6
+	paddw mm0, mm1
+	movq mm1,mm7
+	; round and scale
+	paddw mm0, [h264_d0x20_mmx]
+	psrlw mm0, 6
+	; mm7 was used as scratch above, so it is cleared again before packing
+	WELS_Zero mm7
+	packuswb mm0, mm7
+	movd [r2], mm0
+	; the row below becomes the current row of the next pass
+	movq mm0, mm2
+
+	lea r2, [r2 + r3]
+	lea r4, [r4 + r1]
+
+	dec r5
+	jnz near .xloop
+	WELSEMMS
+	LOAD_6_PARA_POP
+	; (WELSEMMS and LOAD_6_PARA_POP are macros defined outside this chunk;
+	;  presumably they leave MMX state and undo whatever LOAD_6_PARA saved --
+	;  confirm in asm_inc.asm)
+	ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( uint8_t *pSrc,
+;						int32_t iSrcStride,
+;						uint8_t *pDst,
+;						int32_t iDstStride,
+;						uint8_t *pABCD,
+;						int32_t iheigh );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq8_sse2
+McChromaWidthEq8_sse2:
+	; 8-pixel-wide weighted 2x2 filter. With A,B,C,D = pABCD[0..3]:
+	;   pDst[x] = (A*cur[x] + B*cur[x+1] + C*below[x] + D*below[x+1] + 32) >> 6
+	; where cur is the current source row and below the row iSrcStride after it.
+	; Used as: r0=pSrc r1=iSrcStride r2=pDst r3=iDstStride r4=pABCD r5=iheigh
+	%assign  push_num 0
+	LOAD_6_PARA 	
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r5, r5d
+%endif
+	; Expand the weights: xmm3 = 8 x A, xmm5 = 8 x B, xmm4 = 8 x C, xmm6 = 8 x D (words).
+	; xmm7 supplies the zero bytes for the byte -> word unpacking.
+	movd xmm3, [r4]
+	WELS_Zero xmm7
+	punpcklbw  xmm3, xmm3
+	punpcklwd  xmm3, xmm3
+
+	movdqa	   xmm4, xmm3
+	punpckldq  xmm3, xmm3
+	punpckhdq  xmm4, xmm4
+	movdqa     xmm5, xmm3
+	movdqa	   xmm6, xmm4
+
+	punpcklbw  xmm3, xmm7
+	punpckhbw  xmm5, xmm7
+	punpcklbw  xmm4, xmm7
+	punpckhbw  xmm6, xmm7
+
+	; State carried from one loop pass to the next:
+	;   xmm0 = current row, pixels x..x+7   (words)
+	;   xmm1 = current row, pixels x+1..x+8 (words)
+	;   r4   = address of the row below (pABCD is no longer needed in r4)
+	; so every source row is loaded and unpacked only once.
+
+	lea r4, [r0 + r1] ;lea ebx, [esi + eax]
+	movq xmm0, [r0]
+	movq xmm1, [r0+1]
+	punpcklbw xmm0, xmm7
+	punpcklbw xmm1, xmm7
+.xloop:
+	; A*cur[x] + B*cur[x+1]
+	pmullw xmm0, xmm3
+	pmullw xmm1, xmm5
+	paddw  xmm0, xmm1
+	; + C*below[x]; the unpacked pixels are kept in xmm2 for the next pass
+	movq  xmm1, [r4]
+	punpcklbw xmm1, xmm7
+	movdqa xmm2, xmm1
+	pmullw xmm1, xmm4
+	paddw xmm0, xmm1
+	; + D*below[x+1]; xmm7 is borrowed to keep the unpacked pixels, which end up in xmm1
+	movq xmm1, [r4+1]
+	punpcklbw xmm1, xmm7
+	movdqa xmm7, xmm1
+	pmullw xmm1, xmm6
+	paddw xmm0, xmm1
+	movdqa xmm1,xmm7
+	; round and scale (the memory operand relies on the constant being 16-byte aligned)
+	paddw xmm0, [h264_d0x20_sse2]
+	psrlw xmm0, 6
+	; xmm7 was used as scratch above, so it is cleared again before packing
+	WELS_Zero xmm7
+	packuswb xmm0, xmm7
+	movq [r2], xmm0
+	; the row below becomes the current row of the next pass
+	movdqa xmm0, xmm2
+
+	lea r2, [r2 + r3]
+	lea r4, [r4 + r1]
+
+	dec r5
+	jnz near .xloop
+	; no MMX register is used in this routine
+	LOAD_6_PARA_POP
+
+	; (LOAD_6_PARA_POP is a macro defined outside this chunk; presumably it
+	;  undoes whatever LOAD_6_PARA saved on the stack -- confirm in
+	;  asm_inc.asm)
+	ret
+
+
+
+
+ALIGN 16
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
+;						 int32_t iSrcStride,
+;                        uint8_t *pDst,
+;                        int32_t iDstStride,
+;                        uint8_t *pABCD,
+;					     int32_t iHeigh);
+;***********************************************************************
+WELS_EXTERN McChromaWidthEq8_ssse3
+McChromaWidthEq8_ssse3:
+	; 8-pixel-wide weighted 2x2 filter built on pmaddubsw. With A,B,C,D = pABCD[0..3]:
+	;   pDst[x] = (A*cur[x] + B*cur[x+1] + C*below[x] + D*below[x+1] + 32) >> 6
+	; Two rows per loop pass, so iHeigh must be a non-zero even number.
+	%assign  push_num 0
+	LOAD_6_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r5, r5d
+%endif
+	; Used as: r0=pSrc r1=iSrcStride r2=pDst r3=iDstStride r4=pABCD r5=iHeigh
+	; Build the weight vectors: xmm5 = (A,B) x 8 and xmm6 = (C,D) x 8 as byte pairs.
+	; pmaddubsw reads them as signed bytes and the pixel operand as unsigned bytes.
+	; xmm7 is not cleared here: its first use is the load of the rounding constant below.
+    movd   xmm5, [r4]
+    punpcklwd xmm5, xmm5
+    punpckldq xmm5, xmm5
+    movdqa    xmm6, xmm5
+    punpcklqdq xmm5, xmm5
+    punpckhqdq xmm6, xmm6
+
+	; pDst is moved back two rows because the loop starts by advancing it
+	; two rows. xmm7 holds the +32 rounding words for the whole function.
+	; xmm0 is primed with the first source row as interleaved byte pairs
+	; (p[x], p[x+1]) for x = 0..7; the 16-byte movdqu loads fetch more
+	; source bytes than the nine that are actually used per row.
+
+    sub r2, r3 ;sub esi, edi
+    sub r2, r3
+	movdqa xmm7, [h264_d0x20_sse2]
+
+	movdqu xmm0, [r0]
+	movdqa xmm1, xmm0
+	psrldq xmm1, 1
+	punpcklbw xmm0, xmm1
+
+.hloop_chroma:
+	lea	r2, [r2+2*r3]
+	; first output row of the pair: cur = xmm0, below = row at r0+r1 (copy kept in xmm4)
+	movdqu xmm2, [r0+r1]
+	movdqa xmm3, xmm2
+	psrldq xmm3, 1
+	punpcklbw xmm2, xmm3
+	movdqa      xmm4, xmm2
+
+    pmaddubsw  xmm0, xmm5
+    pmaddubsw  xmm2, xmm6
+    paddw      xmm0, xmm2
+    paddw      xmm0, xmm7
+	psrlw      xmm0, 6
+    packuswb   xmm0, xmm0
+    movq       [r2],xmm0
+	; second output row: cur = xmm4, below = row two strides down (copy kept in xmm0 for the next pass)
+    lea r0, [r0+2*r1]
+    movdqu xmm2, [r0]
+    movdqa xmm3, xmm2
+    psrldq xmm3, 1
+    punpcklbw xmm2, xmm3
+    movdqa      xmm0, xmm2
+
+    pmaddubsw  xmm4, xmm5
+    pmaddubsw  xmm2, xmm6
+    paddw      xmm4, xmm2
+    paddw      xmm4, xmm7
+	psrlw      xmm4, 6
+    packuswb   xmm4, xmm4
+    movq       [r2+r3],xmm4
+
+	sub r5, 2
+	jnz .hloop_chroma
+	; no MMX register is used in this routine
+	LOAD_6_PARA_POP
+	; (LOAD_6_PARA_POP is a macro defined outside this chunk; presumably it
+	;  undoes whatever LOAD_6_PARA saved on the stack -- confirm in
+	;  asm_inc.asm)
+	;
+
+	ret
+
+
--- /dev/null
+++ b/codec/common/mc_luma.asm
@@ -1,0 +1,1293 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mc_luma.asm
+;*
+;*  Abstract
+;*      sse2 motion compensation
+;*
+;*  History
+;*      17/08/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;*******************************************************************************
+; Local Data (Read Only)
+;*******************************************************************************
+%ifdef FORMAT_COFF
+SECTION .rodata pData
+%else
+SECTION .rodata align=16
+%endif
+
+;*******************************************************************************
+; Rounding constants
+;*******************************************************************************
+; h264_w0x10 / h264_w0x10_1: 16 per word, added before the ">> 5" of the luma filters (4-word and 8-word form).
+ALIGN 16
+h264_w0x10:
+	dw 16, 16, 16, 16	
+ALIGN 16
+h264_w0x10_1:
+	dw 16, 16, 16, 16, 16, 16, 16, 16
+ALIGN 16
+h264_mc_hc_32:
+	dw 32, 32, 32, 32, 32, 32, 32, 32
+; (h264_mc_hc_32 is not referenced by the routines visible in this part of the file)
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+WELS_EXTERN McHorVer20WidthEq4_mmx
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc,
+;                       int iSrcStride,
+;						uint8_t *pDst,
+;						int iDstStride,
+;						int iHeight)
+;*******************************************************************************
+McHorVer20WidthEq4_mmx:
+	; Horizontal 6-tap filter, 4 output pixels per row:
+	;   pDst[x] = clip255( (p[x-2] - 5*p[x-1] + 20*p[x] + 20*p[x+1]
+	;                        - 5*p[x+2] + p[x+3] + 16) >> 5 ),  p = pSrc row
+	; Used as: r0=pSrc r1=iSrcStride r2=pDst r3=iDstStride r4=iHeight.
+	; mm7 stays zero (unpack / pack partner) and mm6 holds the +16 rounding
+	; words for the whole function.
+	; The sum is formed as (E+J) + 4*(G+H) - (F+I), after which the term
+	; 4*(G+H) - (F+I) is shifted left by 2 and added again (x1 + x4 = x5).
+	; The row counter is tested only after a row has been written.
+    %assign  push_num 0
+    LOAD_5_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+%endif
+    ; r0 now moves to the leftmost tap, two pixels left of the first output
+	sub r0, 2
+	WELS_Zero mm7
+	movq mm6, [h264_w0x10]
+.height_loop:
+	movd mm0, [r0]
+	punpcklbw mm0, mm7
+	movd mm1, [r0+5]
+	punpcklbw mm1, mm7
+	movd mm2, [r0+1]
+	punpcklbw mm2, mm7
+	movd mm3, [r0+4]
+	punpcklbw mm3, mm7
+	movd mm4, [r0+2]
+	punpcklbw mm4, mm7
+	movd mm5, [r0+3]
+	punpcklbw mm5, mm7
+	; taps: mm0=E mm2=F mm4=G mm5=H mm3=I mm1=J (offsets 0..5 from r0)
+	paddw mm2, mm3
+	paddw mm4, mm5
+	psllw mm4, 2
+	psubw mm4, mm2
+	paddw mm0, mm1
+	paddw mm0, mm4
+	psllw mm4, 2
+	paddw mm0, mm4
+	paddw mm0, mm6
+	psraw mm0, 5
+	packuswb mm0, mm7
+	movd [r2], mm0
+
+	add r0, r1
+	add r2, r3
+	dec r4
+	jnz .height_loop
+
+	WELSEMMS
+	LOAD_5_PARA_POP
+	ret
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+; SSE_LOAD_8P dst, zero, mem : load 8 bytes from mem and widen them to 8 words in dst.
+; The second argument is the unpack partner, so it must hold zero on entry.
+%macro SSE_LOAD_8P 3
+	movq %1, %3
+	punpcklbw %1, %2
+%endmacro
+; FILTER_HV_W8 a, b, c, d, e, f, tmp1, tmp2, dst : 6-tap filter over six word vectors; stores 8 bytes to dst.
+%macro FILTER_HV_W8 9
+	paddw	%1, %6                  ; a + f (the first argument is overwritten)
+	movdqa	%8, %3
+	movdqa	%7, %2
+	paddw	%1, [h264_w0x10_1]      ; + 16 rounding
+	paddw	%8, %4                  ; c + d
+	paddw	%7, %5                  ; b + e
+	psllw	%8, 2
+	psubw	%8, %7                  ; 4*(c+d) - (b+e)
+	paddw	%1, %8
+	psllw	%8, 2
+	paddw	%1, %8                  ; total: a - 5b + 20c + 20d - 5e + f + 16
+	psraw   %1, 5
+	WELS_Zero %8                    ; the eighth argument is left cleared on exit
+	packuswb %1, %8
+	movq    %9, %1
+%endmacro
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+WELS_EXTERN McHorVer22Width8HorFirst_sse2
+WELS_EXTERN McHorVer02WidthEq8_sse2
+WELS_EXTERN McHorVer20WidthEq8_sse2
+WELS_EXTERN McHorVer20WidthEq16_sse2
+
+ALIGN 16
+;***********************************************************************
+; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc,
+;                       int16_t iSrcStride,
+;						uint8_t *pDst,
+;						int32_t iDstStride
+;						int32_t iHeight
+;                       )
+;***********************************************************************
+McHorVer22Width8HorFirst_sse2:
+	; Horizontal pass that keeps full precision: per row it writes eight
+	; 16-bit sums
+	;   s[x] = p[x] - 5*p[x+1] + 20*p[x+2] + 20*p[x+3] - 5*p[x+4] + p[x+5]
+	; with p = the bytes at r0. No rounding, shift or clamp is applied here.
+	; NOTE(review): the code reads bytes from pSrc and stores words to pDst
+	; with movdqa, so the pointer types in the banner look swapped, and pDst
+	; plus iDstStride must keep every output row 16-byte aligned.
+	; Used as: r0=pSrc r1=iSrcStride r2=pDst r3=iDstStride (bytes) r4=iHeight
+	; NOTE(review): r0 is not moved left by 2 here; presumably the caller passes pSrc-2 -- confirm.
+	%assign  push_num 0
+    LOAD_5_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+%endif
+	pxor xmm7, xmm7
+
+	sub r0, r1				; start two source rows above pSrc (original note: "need more 5 lines")
+	sub r0, r1
+
+.yloop_width_8:
+	movq xmm0, [r0]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+	; taps: xmm0=E xmm2=F xmm4=G xmm5=H xmm3=I xmm1=J (offsets 0..5 from r0)
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	movdqa [r2], xmm0
+
+	add r0, r1
+	add r2, r3
+	dec r4
+	jnz .yloop_width_8
+	LOAD_5_PARA_POP
+	ret
+
+ALIGN 16
+;*******************************************************************************
+; void_t McHorVer20WidthEq8_sse2(  uint8_t *pSrc,
+;                       int iSrcStride,
+;												uint8_t *pDst,
+;												int iDstStride,
+;												int iHeight,
+;                      );
+;*******************************************************************************
+McHorVer20WidthEq8_sse2:
+	; Horizontal 6-tap filter, 8 output pixels per row:
+	;   pDst[x] = clip255( (p[x-2] - 5*p[x-1] + 20*p[x] + 20*p[x+1]
+	;                        - 5*p[x+2] + p[x+3] + 16) >> 5 ),  p = pSrc row
+	; Used as: r0=pSrc r1=iSrcStride r2=pDst r3=iDstStride r4=iHeight.
+	; xmm7 stays zero (unpack / pack partner) and xmm6 holds the +16
+	; rounding words for the whole function.
+	; The row counter is tested only after a row has been written.
+	; Each movq below fetches 8 bytes; the widest read ends at pSrc+10.
+	;
+	%assign  push_num 0
+    LOAD_5_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+%endif
+	lea r0, [r0-2]            ;pSrc -= 2;
+
+	pxor xmm7, xmm7
+	movdqa xmm6, [h264_w0x10_1]
+.y_loop:
+	movq xmm0, [r0]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+	; taps: xmm0=E xmm2=F xmm4=G xmm5=H xmm3=I xmm1=J (offsets 0..5 from r0)
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	paddw xmm0, xmm6
+	psraw xmm0, 5
+	; saturate the 8 words to bytes and store them
+	packuswb xmm0, xmm7
+	movq [r2], xmm0
+
+	lea r2, [r2+r3]
+	lea r0, [r0+r1]
+	dec r4
+	jnz near .y_loop
+
+	LOAD_5_PARA_POP
+	ret
+
+ALIGN 16
+;*******************************************************************************
+; void_t McHorVer20WidthEq16_sse2(  uint8_t *pSrc,
+;                       int iSrcStride,
+;												uint8_t *pDst,
+;												int iDstStride,
+;												int iHeight,
+;                      );
+;*******************************************************************************
+McHorVer20WidthEq16_sse2:
+	; Horizontal 6-tap filter, 16 output pixels per row, computed as two
+	; independent 8-pixel halves (source offsets +0 and +8):
+	;   pDst[x] = clip255( (p[x-2] - 5*p[x-1] + 20*p[x] + 20*p[x+1]
+	;                        - 5*p[x+2] + p[x+3] + 16) >> 5 ),  p = pSrc row
+	; Used as: r0=pSrc r1=iSrcStride r2=pDst r3=iDstStride r4=iHeight.
+	; xmm7 stays zero and xmm6 holds the +16 rounding words throughout.
+	; The row counter is tested only after a row has been written.
+	; The widest read (movq at r0+5+8) ends at pSrc+18.
+	%assign  push_num 0
+    LOAD_5_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+%endif
+	lea r0, [r0-2]            ;pSrc -= 2;
+
+	pxor xmm7, xmm7
+	movdqa xmm6, [h264_w0x10_1]
+.y_loop:
+	; left half: output pixels 0..7
+	movq xmm0, [r0]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+	; taps: xmm0=E xmm2=F xmm4=G xmm5=H xmm3=I xmm1=J (offsets 0..5 from r0)
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	paddw xmm0, xmm6
+	psraw xmm0, 5
+	packuswb xmm0, xmm7
+	movq [r2], xmm0
+	; right half: output pixels 8..15, same computation 8 bytes further on
+	movq xmm0, [r0+8]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5+8]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1+8]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4+8]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2+8]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3+8]
+	punpcklbw xmm5, xmm7
+
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	paddw xmm0, xmm6
+	psraw xmm0, 5
+	packuswb xmm0, xmm7
+	movq [r2+8], xmm0
+
+	lea r2, [r2+r3]
+	lea r0, [r0+r1]
+	dec r4
+	jnz near .y_loop
+	
+	LOAD_5_PARA_POP
+	ret
+
+
+;*******************************************************************************
+; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc,
+;                       int iSrcStride,
+;                       uint8_t *pDst,
+;                       int iDstStride,
+;                       int iHeight )
+;*******************************************************************************
+ALIGN 16
+McHorVer02WidthEq8_sse2:
+	; Vertical 6-tap filter, 8 pixels wide:
+	;   pDst[y] = clip255( (p[y-2] - 5*p[y-1] + 20*p[y] + 20*p[y+1]
+	;                        - 5*p[y+2] + p[y+3] + 16) >> 5 ),  p = pSrc column
+	; Used as: r0=pSrc r1=iSrcStride r2=pDst r3=iDstStride r4=iHeight.
+	; Six unpacked rows live in xmm registers. Each step loads one new row
+	; and rotates the register roles by one instead of copying, so the
+	; pattern repeats after eight output rows (hence the 8-way unrolling).
+
+	%assign  push_num 0
+    LOAD_5_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+%endif
+	sub r0, r1
+	sub r0, r1
+	; r0 = two rows above pSrc, the first row the filter needs
+	WELS_Zero xmm7
+	; prime the window with source rows -2..+3 (xmm7 is the zero unpack partner)
+	SSE_LOAD_8P xmm0, xmm7, [r0]
+	SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm2, xmm7, [r0]
+	SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm4, xmm7, [r0]
+	SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+	; at .start: xmm0..xmm5 = the six rows, xmm4 = row at r0, xmm5 = row at r0+r1
+.start:
+	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r4
+	jz near .xx_exit
+	; FILTER_HV_W8 leaves its 8th argument zeroed; it serves as unpack partner for the next load
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm6, xmm7, [r0]
+	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+	dec r4
+	jz near .xx_exit
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+	dec r4
+	jz near .xx_exit
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm0, xmm1, [r0]
+	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+	dec r4
+	jz near .xx_exit
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+	dec r4
+	jz near .xx_exit
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm2, xmm3, [r0]
+	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+	dec r4
+	jz near .xx_exit
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+	dec r4
+	jz near .xx_exit
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm4, xmm5, [r0]
+	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+	dec r4
+	jz near .xx_exit
+	; after eight rows the register roles are back where they started
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm5, xmm6, [r0+r1]
+	jmp near .start
+
+.xx_exit:
+	LOAD_5_PARA_POP
+	ret
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+; Symbols declared via WELS_EXTERN (macro defined outside this chunk; presumably it makes them global -- confirm):
+WELS_EXTERN McHorVer20Width9Or17_sse2
+WELS_EXTERN McHorVer02Height9Or17_sse2
+WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
+WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
+WELS_EXTERN McHorVer22HorFirst_sse2
+
+
+;***********************************************************************
+; void McHorVer02Height9Or17_sse2(	uint8_t *pSrc,
+;                       int32_t iSrcStride,
+;                       uint8_t *pDst,
+;                       int32_t iDstStride,
+;						int32_t iWidth,
+;                       int32_t iHeight )
+;***********************************************************************
+ALIGN 16
+McHorVer02Height9Or17_sse2:
+	; Vertical 6-tap filter over iWidth/8 column strips of 8 pixels each:
+	;   pDst[y] = clip255( (p[y-2] - 5*p[y-1] + 20*p[y] + 20*p[y+1]
+	;                        - 5*p[y+2] + p[y+3] + 16) >> 5 ),  p = pSrc column
+	; Used as: r0=pSrc r1=iSrcStride r2=pDst r3=iDstStride r4=iWidth r5=iHeight
+	; Per strip the first row is filtered on its own and the register window
+	; is shifted down with movdqa copies; the 8-way unrolled loop at .start
+	; (same register rotation as McHorVer02WidthEq8_sse2) then produces the
+	; remaining rows, testing the counter after each one.
+	; NOTE(review): the counter is not tested after that first row, so
+	; iHeight == 1 would wrap around instead of stopping.
+	;
+	%assign  push_num 0
+    LOAD_6_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+	movsx	r5, r5d
+%endif
+ ; non-x86-32 builds keep the original pSrc / pDst / iHeight in r12-r14 for the per-strip restart
+%ifndef X86_32
+	push r12
+	push r13
+	push r14
+	mov  r12, r0
+	mov	 r13, r2
+	mov	 r14, r5
+%endif
+    ; r4 = number of 8-pixel strips; r0 = two rows above pSrc
+	shr r4, 3
+	sub r0, r1
+	sub r0, r1
+
+.xloop:
+	WELS_Zero xmm7
+	SSE_LOAD_8P xmm0, xmm7, [r0]
+	SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm2, xmm7, [r0]
+	SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm4, xmm7, [r0]
+	SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+	; first output row of the strip, then slide the six-row window down by one row
+	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r5
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm6, xmm7, [r0]
+	movdqa xmm0,xmm1
+	movdqa xmm1,xmm2
+	movdqa xmm2,xmm3
+	movdqa xmm3,xmm4
+	movdqa xmm4,xmm5
+	movdqa xmm5,xmm6
+	add r2, r3
+	sub r0, r1
+	; here again: xmm4 = row at r0, xmm5 = row at r0+r1, as .start expects
+.start:
+	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm6, xmm7, [r0]
+	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm0, xmm1, [r0]
+	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm2, xmm3, [r0]
+	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm4, xmm5, [r0]
+	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm5, xmm6, [r0+r1]
+	jmp near .start
+
+.x_loop_dec:
+	dec r4
+	jz  near .xx_exit
+	; Next strip: reload the original pSrc / pDst / iHeight (arg1/arg3/arg6
+	; on x86-32, r12-r14 otherwise), then step 8 pixels to the right.
+	; NOTE(review): always base+8, so only two strips (iWidth <= 16) are addressed correctly.
+%ifdef X86_32
+	mov	r0, arg1
+	mov r2, arg3
+	mov r5, arg6
+%else
+	mov r0, r12
+	mov r2, r13
+	mov r5, r14
+%endif
+	sub r0, r1
+	sub r0, r1
+	add r0, 8
+	add r2, 8	
+	jmp near .xloop
+
+.xx_exit:
+%ifndef X86_32
+	pop r14
+	pop r13
+	pop r12
+%endif
+	LOAD_6_PARA_POP
+	ret
+
+
+ALIGN 16
+;***********************************************************************
+; void McHorVer20Width9Or17_sse2(		uint8_t *pSrc,
+;                       int32_t iSrcStride,
+;						uint8_t *pDst,
+;						int32_t iDstStride,
+;						int32_t iWidth,
+;						int32_t iHeight
+;                      );
+;***********************************************************************
+McHorVer20Width9Or17_sse2:
+	; Horizontal 6-tap filter for rows that are 9 or 17 pixels wide:
+	;   pDst[x] = clip255( (p[x-2] - 5*p[x-1] + 20*p[x] + 20*p[x+1]
+	;                        - 5*p[x+2] + p[x+3] + 16) >> 5 ),  p = pSrc row
+	; iWidth == 9 takes the first loop, every other value the 17-wide loop.
+	; The odd last column is produced by running the filter a second time
+	; one pixel further right and storing that result at offset +1, so the
+	; two stores of such a group overlap.
+	; Used as: r0=pSrc r1=iSrcStride r2=pDst r3=iDstStride r4=iWidth
+	;          r5=iHeight (row counter; tested only after a row is written)
+
+	%assign  push_num 0
+    LOAD_6_PARA
+%ifndef X86_32
+	movsx	r1, r1d
+	movsx	r3, r3d
+	movsx	r4, r4d
+	movsx	r5, r5d
+%endif    
+	sub r0, 2
+	pxor xmm7, xmm7
+
+	cmp r4, 9
+	jne near .width_17
+
+.yloop_width_9:
+	movq xmm0, [r0]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+	; pixels 0..7, computed in xmm6/xmm7 scratch so xmm1..xmm5 survive; only pixels 0..3 are stored (movd)
+	movdqa xmm7, xmm2
+	paddw   xmm7, xmm3
+	movdqa xmm6, xmm4
+	paddw   xmm6, xmm5
+	psllw xmm6, 2
+	psubw xmm6, xmm7
+	paddw xmm0, xmm1
+	paddw xmm0, xmm6
+	psllw xmm6, 2
+	paddw xmm0, xmm6
+	paddw xmm0, [h264_w0x10_1]
+	psraw  xmm0, 5
+	packuswb xmm0, xmm0
+	movd [r2], xmm0
+	; window shifted right by one: only the tap at r0+6 is new, xmm7 is zeroed again for its unpack
+	pxor  xmm7, xmm7
+	movq xmm0, [r0+6]
+	punpcklbw xmm0, xmm7
+	; pixels 1..8 from the kept taps (xmm2=E xmm4=F xmm5=G xmm3=H xmm1=I xmm0=J), stored at pDst+1
+	paddw xmm4, xmm1
+	paddw xmm5, xmm3
+	psllw xmm5, 2
+	psubw xmm5, xmm4
+	paddw xmm2, xmm0
+	paddw xmm2, xmm5
+	psllw xmm5, 2
+	paddw xmm2, xmm5
+	paddw xmm2, [h264_w0x10_1]
+	psraw  xmm2, 5
+	packuswb xmm2, xmm2
+	movq [r2+1], xmm2
+
+	add r0, r1
+	add r2, r3
+	dec r5
+	jnz .yloop_width_9
+	LOAD_6_PARA_POP
+	ret
+
+
+.width_17:
+.yloop_width_17:
+	movq xmm0, [r0]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+	; pixels 0..7: plain in-place evaluation, all eight bytes stored
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	paddw xmm0, [h264_w0x10_1]
+	psraw  xmm0, 5
+	packuswb xmm0, xmm0
+	movq [r2], xmm0
+	; second group: the same taps 8 bytes further on
+	movq xmm0, [r0+8]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5+8]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1+8]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4+8]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2+8]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3+8]
+	punpcklbw xmm5, xmm7
+	; pixels 8..15, computed in xmm6/xmm7 scratch so xmm1..xmm5 survive; only pixels 8..11 are stored (movd)
+	movdqa xmm7, xmm2
+	paddw   xmm7, xmm3
+	movdqa xmm6, xmm4
+	paddw   xmm6, xmm5
+	psllw xmm6, 2
+	psubw xmm6, xmm7
+	paddw xmm0, xmm1
+	paddw xmm0, xmm6
+	psllw xmm6, 2
+	paddw xmm0, xmm6
+	paddw xmm0, [h264_w0x10_1]
+	psraw  xmm0, 5
+	packuswb xmm0, xmm0
+	movd [r2+8], xmm0
+
+	; window shifted right by one: only the tap at r0+6+8 is new; xmm7 is zero again from here on
+	pxor  xmm7, xmm7
+	movq xmm0, [r0+6+8]
+	punpcklbw xmm0, xmm7
+	; pixels 9..16 from the kept taps, stored at pDst+9
+	paddw xmm4, xmm1
+	paddw xmm5, xmm3
+	psllw xmm5, 2
+	psubw xmm5, xmm4
+	paddw xmm2, xmm0
+	paddw xmm2, xmm5
+	psllw xmm5, 2
+	paddw xmm2, xmm5
+	paddw xmm2, [h264_w0x10_1]
+	psraw  xmm2, 5
+	packuswb xmm2, xmm2
+	movq [r2+9], xmm2
+	add r0, r1
+	add r2, r3
+	dec r5
+	jnz .yloop_width_17
+	LOAD_6_PARA_POP
+	ret
+
+
+
+ALIGN 16
+;***********************************************************************
+;void McHorVer22HorFirst_sse2
+;							(uint8_t *pSrc,
+;							int32_t iSrcStride,
+;							uint8_t * pTap,
+;							int32_t iTapStride,
+;							int32_t iWidth,int32_t iHeight);
+;***********************************************************************
+McHorVer22HorFirst_sse2:	; horizontal 6-tap pass that stores unrounded 16-bit sums (9 or 17 words per row) to pTap
+	;push esi		; legacy 32-bit-only prologue (commented out), superseded by LOAD_6_PARA below
+	;push edi
+	;push ebx
+	;mov esi, [esp+16]
+	;mov eax, [esp+20]
+	;mov edi, [esp+24]
+	;mov edx, [esp+28]
+	;mov ecx, [esp+32]
+	;mov ebx, [esp+36]
+	; NOTE(review): r0..r5 presumably hold pSrc, iSrcStride, pTap, iTapStride, iWidth, iHeight (LOAD_6_PARA is defined in asm_inc.asm) - confirm
+	%assign  push_num 0
+    LOAD_6_PARA
+%ifndef X86_32
+	movsx	r1, r1d		; non-32-bit builds: sign-extend the int32_t arguments to full register width
+	movsx	r3, r3d
+	movsx	r4, r4d
+	movsx	r5, r5d
+%endif    
+	pxor xmm7, xmm7		; xmm7 = 0, used to widen bytes to words
+	sub r0, r1				; start two rows above pSrc: the vertical 6-tap pass (FILTER_VER) reads 5 rows more than it outputs
+	sub r0, r1		; NOTE(review): no horizontal -2 adjustment is made here, unlike McHorVer20Width9Or17_sse2 - presumably the caller passes an adjusted pSrc; confirm
+
+	cmp r4, 9
+	jne near .width_17	; every width other than 9 takes the 17-wide path
+
+.yloop_width_9:
+	movq xmm0, [r0]		; tap A
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]	; tap F
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]	; tap B
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]	; tap E
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]	; tap C
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]	; tap D
+	punpcklbw xmm5, xmm7
+
+	movdqa xmm7, xmm2	; xmm7 is borrowed as scratch here and re-zeroed below
+	paddw   xmm7, xmm3	; B+E
+	movdqa xmm6, xmm4
+	paddw   xmm6, xmm5	; C+D
+	psllw xmm6, 2		; 4*(C+D)
+	psubw xmm6, xmm7	; 4*(C+D) - (B+E)
+	paddw xmm0, xmm1	; A+F
+	paddw xmm0, xmm6
+	psllw xmm6, 2		; 16*(C+D) - 4*(B+E)
+	paddw xmm0, xmm6	; A+F - 5*(B+E) + 20*(C+D), kept as signed words (no rounding, no shift)
+	movd [r2], xmm0		; store 4 bytes: words 0..1
+
+	pxor  xmm7, xmm7	; restore the zero register
+	movq xmm0, [r0+6]	; one extra tap for the window shifted right by one pixel
+	punpcklbw xmm0, xmm7
+
+	paddw xmm4, xmm1
+	paddw xmm5, xmm3
+	psllw xmm5, 2
+	psubw xmm5, xmm4
+	paddw xmm2, xmm0
+	paddw xmm2, xmm5
+	psllw xmm5, 2
+	paddw xmm2, xmm5	; same tap combination for outputs 1..8
+	movq [r2+2], xmm2	; low 4 words  -> words 1..4 (byte offset 2)
+	movhps [r2+2+8], xmm2	; high 4 words -> words 5..8
+
+	add r0, r1		; next source row
+	add r2, r3		; next tap-buffer row (r3 is added as a byte count)
+	dec r5
+	jnz .yloop_width_9
+	LOAD_6_PARA_POP
+	ret
+
+
+.width_17:
+.yloop_width_17:
+	movq xmm0, [r0]		; first group of 8 outputs: same tap layout as the 9-wide loop
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+
+	paddw xmm2, xmm3	; B+E
+	paddw xmm4, xmm5	; C+D
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1	; A+F
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	movdqa [r2], xmm0	; words 0..7; movdqa requires the tap row to be 16-byte aligned
+
+	movq xmm0, [r0+8]	; second group, 8 pixels further right
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5+8]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1+8]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4+8]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2+8]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3+8]
+	punpcklbw xmm5, xmm7
+
+	movdqa xmm7, xmm2	; xmm7 borrowed as scratch, re-zeroed below
+	paddw   xmm7, xmm3
+	movdqa xmm6, xmm4
+	paddw   xmm6, xmm5
+	psllw xmm6, 2
+	psubw xmm6, xmm7
+	paddw xmm0, xmm1
+	paddw xmm0, xmm6
+	psllw xmm6, 2
+	paddw xmm0, xmm6
+	movd [r2+16], xmm0	; store 4 bytes: words 8..9
+
+
+	pxor  xmm7, xmm7	; zero again; the top of the loop relies on xmm7 == 0
+	movq xmm0, [r0+6+8]	; extra tap for the window shifted right by one pixel
+	punpcklbw xmm0, xmm7
+
+	paddw xmm4, xmm1
+	paddw xmm5, xmm3
+	psllw xmm5, 2
+	psubw xmm5, xmm4
+	paddw xmm2, xmm0
+	paddw xmm2, xmm5
+	psllw xmm5, 2
+	paddw xmm2, xmm5
+	movq [r2+18], xmm2	; low 4 words  -> words 9..12
+	movhps [r2+18+8], xmm2	; high 4 words -> words 13..16
+
+	add r0, r1
+	add r2, r3
+	dec r5
+	jnz .yloop_width_17
+	LOAD_6_PARA_POP
+	ret
+
+
+%macro FILTER_VER 9	; %1..%6 = six consecutive rows of words (a..f), %7/%8 = scratch, %9 = 8-byte destination; clobbers %1, %7, %8
+	paddw  %1, %6		; a+f
+	movdqa %7, %2
+	movdqa %8, %3
+	; The shifts are applied in stages so the intermediate values stay within 16 bits:
+	; result = ((((a+f)-(b+e))>>2 + (c+d) - (b+e))>>2 + (c+d) + rounding) >> 6
+	paddw %7, %5		; b+e
+	paddw %8, %4		; c+d
+
+	psubw  %1, %7		; (a+f) - (b+e)
+	psraw   %1, 2
+	paddw  %1, %8
+	psubw  %1, %7
+	psraw   %1, 2		; approximately ((a+f) - 5*(b+e) + 4*(c+d)) / 16
+	paddw  %8, %1		; approximately ((a+f) - 5*(b+e) + 20*(c+d)) / 16
+	paddw  %8, [h264_mc_hc_32]	; rounding term; constant is defined outside this chunk (name suggests words of 32) - confirm
+	psraw   %8, 6
+	packuswb %8, %8		; saturate words to unsigned bytes
+	movq %9, %8		; write 8 output pixels
+%endmacro
+;***********************************************************************
+;void McHorVer22Width8VerLastAlign_sse2(
+;											uint8_t *pTap,
+;											int32_t iTapStride,
+;											uint8_t * pDst,
+;											int32_t iDstStride,
+;											int32_t iWidth,
+;											int32_t iHeight);
+;***********************************************************************
+
+ McHorVer22Width8VerLastAlign_sse2:	; vertical 6-tap pass over the 16-bit tap buffer, 8 pixels per column strip, aligned (movdqa) loads
+	;push esi		; legacy 32-bit-only prologue (commented out), superseded by LOAD_6_PARA below
+	;push edi
+	;push ebx
+	;push ebp
+
+	;mov esi, [esp+20]
+	;mov eax, [esp+24]
+	;mov edi, [esp+28]
+	;mov edx, [esp+32]
+	;mov ebx, [esp+36]
+	;mov ecx, [esp+40]
+	; NOTE(review): r0..r5 presumably hold pTap, iTapStride, pDst, iDstStride, iWidth, iHeight (LOAD_6_PARA is defined in asm_inc.asm) - confirm
+	%assign  push_num 0
+    LOAD_6_PARA
+%ifndef X86_32
+	movsx	r1, r1d		; non-32-bit builds: sign-extend the int32_t arguments to full register width
+	movsx	r3, r3d
+	movsx	r4, r4d
+	movsx	r5, r5d
+%endif
+%ifndef X86_32
+	push r12		; r12..r14 keep the original r0, r2 and r5 so each column strip can restart from them
+	push r13
+	push r14
+	mov  r12, r0
+	mov	 r13, r2
+	mov	 r14, r5
+%endif
+
+	shr r4, 3		; number of 8-pixel column strips
+
+.width_loop:
+	movdqa xmm0, [r0]	; prime the window with rows 0..5 (8 words each); movdqa needs 16-byte aligned rows
+	movdqa xmm1, [r0+r1]
+	lea r0, [r0+2*r1]
+	movdqa xmm2, [r0]
+	movdqa xmm3, [r0+r1]
+	lea r0, [r0+2*r1]
+	movdqa xmm4, [r0]
+	movdqa xmm5, [r0+r1]
+
+	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r5			; NOTE(review): no zero check after this first decrement - assumes the row count is at least 2; confirm
+	lea r0, [r0+2*r1]
+	movdqa xmm6, [r0]	; row 6
+
+	movdqa xmm0, xmm1	; slide the window down one row so that xmm0..xmm5 = rows 1..6
+	movdqa xmm1, xmm2
+	movdqa xmm2, xmm3
+	movdqa xmm3, xmm4
+	movdqa xmm4, xmm5
+	movdqa xmm5, xmm6
+
+	add r2, r3
+	sub r0, r1		; step back one row so the even/odd row addressing below lines up
+
+.start:	; steady state: register roles rotate by one per step, so only one new row is loaded per output row
+	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqa xmm6, [r0]
+	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqa xmm7, [r0+r1]
+	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqa xmm0, [r0]
+	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqa xmm1, [r0+r1]
+	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqa xmm2, [r0]
+	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqa xmm3, [r0+r1]
+	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqa xmm4, [r0]
+	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqa xmm5, [r0+r1]	; after 8 steps the rotation is complete: xmm0..xmm5 hold the window again
+	jmp near .start
+
+.x_loop_dec:
+	dec r4
+	jz near .exit
+	;mov esi, [esp+20]
+	;mov edi, [esp+28]
+	;mov ecx, [esp+40]
+%ifdef X86_32
+	mov	r0, arg1	; reload the original arguments (arg1/arg3/arg6 are defined in asm_inc.asm, not shown here)
+	mov r2, arg3
+	mov r5, arg6
+%else
+	mov r0, r12
+	mov r2, r13
+	mov r5, r14
+%endif
+	add r0, 16		; next strip: 8 words further into the tap buffer
+	add r2, 8		; and 8 bytes further into the destination
+	jmp .width_loop		; NOTE(review): the base is always original+16/+8, so only a second strip is addressed correctly - presumably width is at most 16; confirm
+
+.exit:
+%ifndef X86_32
+	pop r14
+	pop r13
+	pop r12
+%endif
+	LOAD_6_PARA_POP
+	ret
+
+;***********************************************************************
+;void McHorVer22Width8VerLastUnAlign_sse2(
+;											uint8_t *pTap,
+;											int32_t iTapStride,
+;											uint8_t * pDst,
+;											int32_t iDstStride,
+;											int32_t iWidth,
+;											int32_t iHeight);
+;***********************************************************************
+
+ McHorVer22Width8VerLastUnAlign_sse2:	; vertical 6-tap pass over the 16-bit tap buffer, 8 pixels per column strip, unaligned (movdqu) loads
+	;push esi		; legacy 32-bit-only prologue (commented out), superseded by LOAD_6_PARA below
+	;push edi
+	;push ebx
+	;push ebp
+
+	;mov esi, [esp+20]
+	;mov eax, [esp+24]
+	;mov edi, [esp+28]
+	;mov edx, [esp+32]
+	;mov ebx, [esp+36]
+	;mov ecx, [esp+40]
+	; NOTE(review): r0..r5 presumably hold pTap, iTapStride, pDst, iDstStride, iWidth, iHeight (LOAD_6_PARA is defined in asm_inc.asm) - confirm
+	%assign  push_num 0
+    LOAD_6_PARA
+%ifndef X86_32
+	movsx	r1, r1d		; non-32-bit builds: sign-extend the int32_t arguments to full register width
+	movsx	r3, r3d
+	movsx	r4, r4d
+	movsx	r5, r5d
+%endif
+%ifndef X86_32
+	push r12		; r12..r14 keep the original r0, r2 and r5 so each column strip can restart from them
+	push r13
+	push r14
+	mov  r12, r0
+	mov	 r13, r2
+	mov	 r14, r5
+%endif
+	shr r4, 3		; number of 8-pixel column strips
+
+.width_loop:
+	movdqu xmm0, [r0]	; prime the window with rows 0..5 (8 words each); movdqu tolerates unaligned rows
+	movdqu xmm1, [r0+r1]
+	lea r0, [r0+2*r1]
+	movdqu xmm2, [r0]
+	movdqu xmm3, [r0+r1]
+	lea r0, [r0+2*r1]
+	movdqu xmm4, [r0]
+	movdqu xmm5, [r0+r1]
+
+	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r5			; NOTE(review): no zero check after this first decrement - assumes the row count is at least 2; confirm
+	lea r0, [r0+2*r1]
+	movdqu xmm6, [r0]	; row 6
+
+	movdqa xmm0, xmm1	; slide the window down one row so that xmm0..xmm5 = rows 1..6
+	movdqa xmm1, xmm2
+	movdqa xmm2, xmm3
+	movdqa xmm3, xmm4
+	movdqa xmm4, xmm5
+	movdqa xmm5, xmm6
+
+	add r2, r3
+	sub r0, r1		; step back one row so the even/odd row addressing below lines up
+
+.start:	; steady state: register roles rotate by one per step, so only one new row is loaded per output row
+	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqu xmm6, [r0]
+	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqu xmm7, [r0+r1]
+	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqu xmm0, [r0]
+	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqu xmm1, [r0+r1]
+	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqu xmm2, [r0]
+	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqu xmm3, [r0+r1]
+	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqu xmm4, [r0]
+	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqu xmm5, [r0+r1]	; after 8 steps the rotation is complete: xmm0..xmm5 hold the window again
+	jmp near .start
+
+.x_loop_dec:
+	dec r4
+	jz near .exit
+	;mov esi, [esp+20]
+	;mov edi, [esp+28]
+	;mov ecx, [esp+40]
+%ifdef X86_32
+	mov	r0, arg1	; reload the original arguments (arg1/arg3/arg6 are defined in asm_inc.asm, not shown here)
+	mov r2, arg3
+	mov r5, arg6
+%else
+	mov r0, r12
+	mov r2, r13
+	mov r5, r14
+%endif
+	add r0, 16		; next strip: 8 words further into the tap buffer
+	add r2, 8		; and 8 bytes further into the destination
+	jmp .width_loop		; NOTE(review): the base is always original+16/+8, so only a second strip is addressed correctly - presumably width is at most 16; confirm
+
+.exit:
+%ifndef X86_32
+	pop r14
+	pop r13
+	pop r12
+%endif
+	LOAD_6_PARA_POP
+	ret
\ No newline at end of file
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -6,6 +6,14 @@
 COMMON_OBJS += $(COMMON_CPP_SRCS:.cpp=.o)
 ifeq ($(USE_ASM), Yes)
 COMMON_ASM_SRCS=\
+	$(COMMON_SRCDIR)/./asm_inc.asm\
+	$(COMMON_SRCDIR)/./cpuid.asm\
+	$(COMMON_SRCDIR)/./deblock.asm\
+	$(COMMON_SRCDIR)/./expand_picture.asm\
+	$(COMMON_SRCDIR)/./mb_copy.asm\
+	$(COMMON_SRCDIR)/./mc_chroma.asm\
+	$(COMMON_SRCDIR)/./mc_luma.asm\
+	$(COMMON_SRCDIR)/./vaa.asm\
 
 COMMON_OBJS += $(COMMON_ASM_SRCS:.asm=.o)
 endif
@@ -13,6 +21,30 @@
 OBJS += $(COMMON_OBJS)
 $(COMMON_SRCDIR)/./logging.o: $(COMMON_SRCDIR)/./logging.cpp
 	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(COMMON_CFLAGS) $(COMMON_INCLUDES) -c -o $(COMMON_SRCDIR)/./logging.o $(COMMON_SRCDIR)/./logging.cpp
+
+$(COMMON_SRCDIR)/./asm_inc.o: $(COMMON_SRCDIR)/./asm_inc.asm # one explicit rule per assembly source; asm_inc.asm is also %include'd by the other sources. NOTE(review): presumably emitted by build/mktargets.py - confirm before hand-editing
+	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(COMMON_ASMFLAGS) $(COMMON_ASM_INCLUDES) -o $(COMMON_SRCDIR)/./asm_inc.o $(COMMON_SRCDIR)/./asm_inc.asm
+
+$(COMMON_SRCDIR)/./cpuid.o: $(COMMON_SRCDIR)/./cpuid.asm
+	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(COMMON_ASMFLAGS) $(COMMON_ASM_INCLUDES) -o $(COMMON_SRCDIR)/./cpuid.o $(COMMON_SRCDIR)/./cpuid.asm
+
+$(COMMON_SRCDIR)/./deblock.o: $(COMMON_SRCDIR)/./deblock.asm
+	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(COMMON_ASMFLAGS) $(COMMON_ASM_INCLUDES) -o $(COMMON_SRCDIR)/./deblock.o $(COMMON_SRCDIR)/./deblock.asm
+
+$(COMMON_SRCDIR)/./expand_picture.o: $(COMMON_SRCDIR)/./expand_picture.asm
+	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(COMMON_ASMFLAGS) $(COMMON_ASM_INCLUDES) -o $(COMMON_SRCDIR)/./expand_picture.o $(COMMON_SRCDIR)/./expand_picture.asm
+
+$(COMMON_SRCDIR)/./mb_copy.o: $(COMMON_SRCDIR)/./mb_copy.asm
+	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(COMMON_ASMFLAGS) $(COMMON_ASM_INCLUDES) -o $(COMMON_SRCDIR)/./mb_copy.o $(COMMON_SRCDIR)/./mb_copy.asm
+
+$(COMMON_SRCDIR)/./mc_chroma.o: $(COMMON_SRCDIR)/./mc_chroma.asm
+	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(COMMON_ASMFLAGS) $(COMMON_ASM_INCLUDES) -o $(COMMON_SRCDIR)/./mc_chroma.o $(COMMON_SRCDIR)/./mc_chroma.asm
+
+$(COMMON_SRCDIR)/./mc_luma.o: $(COMMON_SRCDIR)/./mc_luma.asm
+	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(COMMON_ASMFLAGS) $(COMMON_ASM_INCLUDES) -o $(COMMON_SRCDIR)/./mc_luma.o $(COMMON_SRCDIR)/./mc_luma.asm
+
+$(COMMON_SRCDIR)/./vaa.o: $(COMMON_SRCDIR)/./vaa.asm
+	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(COMMON_ASMFLAGS) $(COMMON_ASM_INCLUDES) -o $(COMMON_SRCDIR)/./vaa.o $(COMMON_SRCDIR)/./vaa.asm
 
 $(LIBPREFIX)common.$(LIBSUFFIX): $(COMMON_OBJS)
 	rm -f $(LIBPREFIX)common.$(LIBSUFFIX)
--- /dev/null
+++ b/codec/common/vaa.asm
@@ -1,0 +1,425 @@
+;*!
+;* \copy
+;*     Copyright (c)  2010-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	vaa.asm
+;*
+;*	Abstract
+;*      sse2 for pVaa routines
+;*
+;*  History
+;*      04/14/2010	Created
+;*		06/07/2010	Added AnalysisVaaInfoIntra_sse2(ssse3)
+;*		06/10/2010	Tune rc_sad_frame_sse2 and got about 40% improvement
+;*		08/11/2010	Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+; Horizontal sum of eight words; the original author measured this as faster than a phaddw (SSSE3) sequence
+%macro SUM_WORD_8x2_SSE2	2	; dst(pSrc), tmp
+	; @sum_8x2 begin
+	pshufd %2, %1, 04Eh	; 01001110 B: swap the two 64-bit halves
+	paddw %1, %2
+	pshuflw %2, %1, 04Eh	; 01001110 B: swap word pairs (0,1) and (2,3) of the low half
+	paddw %1, %2
+	pshuflw %2, %1, 0B1h	; 10110001 B: swap adjacent words of the low half
+	paddw %1, %2		; word 0 of %1 now holds the sum of all eight input words
+	; end of @sum_8x2
+%endmacro	; END of SUM_WORD_8x2_SSE2
+
+
+%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4 ; in: r0 = row pointer, r1/r2/r3 = offsets of rows 1/2/3, xmm7 = 0 ; out: words 0..3 of dst = (sum of each 4x4 block) >> 4
+	movdqa %1, [r0    ]	; line 0
+	movdqa %2, [r0+r1]	; line 1
+	movdqa %3, %1
+	punpcklbw %1, xmm7	; line 0, columns 0..7 as words
+	punpckhbw %3, xmm7	; line 0, columns 8..15 as words
+	movdqa %4, %2
+	punpcklbw %4, xmm7	; line 1, columns 0..7
+	punpckhbw %2, xmm7	; line 1, columns 8..15
+	paddw %1, %4
+	paddw %2, %3
+	movdqa %3, [r0+r2]	; line 2
+	movdqa %4, [r0+r3]	; line 3
+	movdqa %5, %3
+	punpcklbw %3, xmm7
+	punpckhbw %5, xmm7
+	movdqa %6, %4
+	punpcklbw %6, xmm7
+	punpckhbw %4, xmm7
+	paddw %3, %6
+	paddw %4, %5
+	paddw %1, %3	; block 0, 1
+	paddw %2, %4	; block 2, 3
+	pshufd %3, %1, 0B1h	; swap adjacent dwords
+	pshufd %4, %2, 0B1h
+	paddw %1, %3
+	paddw %2, %4
+	movdqa %3, %1
+	movdqa %4, %2
+	pshuflw %5, %1, 0B1h	; swap adjacent words, low half only
+	pshufhw %6, %3, 0B1h	; swap adjacent words, high half only
+	paddw %1, %5		; low words of %1 = sum of block 0
+	paddw %3, %6		; high words of %3 = sum of block 1
+	pshuflw %5, %2, 0B1h
+	pshufhw %6, %4, 0B1h
+	paddw %2, %5		; low words of %2 = sum of block 2
+	paddw %4, %6		; high words of %4 = sum of block 3
+	punpcklwd %1, %2	; interleave the block 0 and block 2 sums
+	punpckhwd %3, %4	; interleave the block 1 and block 3 sums
+	punpcklwd %1, %3	; words 0..3 = sums of blocks 0, 1, 2, 3 (repeated in words 4..7)
+	psraw %1, $4		; divide by 16 pixels; $4 is NASM hex notation for 4
+%endmacro
+
+%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4 ; in: r0 = row pointer, r1/r2/r3 = offsets of rows 1/2/3, xmm7 = 0 ; out: words 0..3 of dst = (sum of each 4x4 block) >> 4
+	movdqa %1, [r0    ]	; line 0
+	movdqa %2, [r0+r1]	; line 1
+	movdqa %3, %1
+	punpcklbw %1, xmm7	; line 0, columns 0..7 as words
+	punpckhbw %3, xmm7	; line 0, columns 8..15 as words
+	movdqa %4, %2
+	punpcklbw %4, xmm7	; line 1, columns 0..7
+	punpckhbw %2, xmm7	; line 1, columns 8..15
+	paddw %1, %4
+	paddw %2, %3
+	movdqa %3, [r0+r2]	; line 2
+	movdqa %4, [r0+r3]	; line 3
+	movdqa %5, %3
+	punpcklbw %3, xmm7
+	punpckhbw %5, xmm7
+	movdqa %6, %4
+	punpcklbw %6, xmm7
+	punpckhbw %4, xmm7
+	paddw %3, %6
+	paddw %4, %5
+	paddw %1, %3	; block 0, 1
+	paddw %2, %4	; block 2, 3
+	phaddw %1, %2	; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
+	phaddw %1, xmm7	; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
+	psraw %1, $4		; divide by 16 pixels; $4 is NASM hex notation for 4
+%endmacro
+
+
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+;SECTION .rodata align=16
+
+;ALIGN 16
+;pack1_8x2:
+;	dw 1, 1, 1, 1, 1, 1, 1, 1
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+; AnalysisVaaInfoIntra_sse2 / AnalysisVaaInfoIntra_ssse3, added 6/7/2010 (see History in the file header)
+
+WELS_EXTERN AnalysisVaaInfoIntra_sse2
+;***********************************************************************
+;	int32_t AnalysisVaaInfoIntra_sse2(	uint8_t *pDataY, const int32_t iLineSize );
+;***********************************************************************
+ALIGN 16
+AnalysisVaaInfoIntra_sse2:	; returns sum(avg^2) - (sum(avg))^2/16 over the sixteen 4x4-block averages of a 16x16 area
+
+    %assign push_num 0
+    LOAD_2_PARA 		; NOTE(review): presumably r0 = pDataY, r1 = iLineSize (macro is defined in asm_inc.asm) - confirm
+    SIGN_EXTENTION r1,r1d
+
+%ifdef X86_32
+    push r3
+    push r4
+    push r5
+    push r6		; r6 is saved and restored but not otherwise referenced in this function
+    %assign push_num push_num+4
+%endif
+
+    mov  r5,r7		; r7 is used as the stack pointer here (presumably esp/rsp via asm_inc.asm - confirm)
+    and  r5,0fh		; r5 = misalignment of r7 relative to 16 bytes; added back before returning
+    sub  r7,r5		; round r7 down to a 16-byte boundary, required by the movdqa loads from [r7] below
+    sub  r7,32		; reserve 32 bytes of scratch: sixteen word-sized block averages
+    
+    
+    mov r2,r1    
+    sal r2,$1   ;r2 = 2*iLineSize
+    mov r3,r2
+    add r3,r1   ;r3 = 3*iLineSize
+    
+    mov r4,r2
+    sal r4,$1   ;r4 = 4*iLineSize
+    
+	pxor xmm7, xmm7		; zero register expected by VAA_AVG_BLOCK_SSE2
+
+	; loops
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [r7], xmm0		; averages of blocks 0..3 (rows 0..3)
+
+	lea r0, [r0+r4]		; advance four rows
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [r7+8], xmm0	; blocks 4..7
+
+	lea r0, [r0+r4]
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [r7+16], xmm0	; blocks 8..11
+
+	lea r0, [r0+r4]
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [r7+24], xmm0	; blocks 12..15
+
+	movdqa xmm0, [r7]		; block 0~7
+	movdqa xmm1, [r7+16]	; block 8~15
+	movdqa xmm2, xmm0
+	paddw xmm0, xmm1
+	SUM_WORD_8x2_SSE2 xmm0, xmm3	; word 0 of xmm0 = sum of all sixteen averages
+
+	pmullw xmm1, xmm1	; squares; each average is at most 255, so a square fits in an unsigned word
+	pmullw xmm2, xmm2
+	movdqa xmm3, xmm1
+	movdqa xmm4, xmm2
+	punpcklwd xmm1, xmm7	; zero-extend the squares to dwords
+	punpckhwd xmm3, xmm7
+	punpcklwd xmm2, xmm7
+	punpckhwd xmm4, xmm7
+	paddd xmm1, xmm2
+	paddd xmm3, xmm4
+	paddd xmm1, xmm3
+	pshufd xmm2, xmm1, 01Bh	; reverse dword order
+	paddd xmm1, xmm2
+	pshufd xmm2, xmm1, 0B1h	; swap adjacent dwords
+	paddd xmm1, xmm2	; every dword = sum of the sixteen squares
+
+	
+	
+	movd r2d, xmm0
+	and r2, 0ffffh		; only the low word holds the sum; discard the upper bits
+	mov r3, r2
+	imul r2, r3		; (sum of averages)^2
+	sar r2, $4		; divided by 16; $4 is NASM hex notation for 4
+	movd retrd, xmm1
+	sub retrd, r2d		; return sum(avg^2) - sum(avg)^2 / 16
+	
+	add r7,32		; release the scratch area
+	add r7,r5		; undo the alignment adjustment
+
+%ifdef X86_32
+	pop r6
+	pop r5
+	pop r4
+	pop r3
+%endif
+	
+	ret
+
+WELS_EXTERN AnalysisVaaInfoIntra_ssse3
+;***********************************************************************
+;	int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t iLineSize );
+;***********************************************************************
+ALIGN 16
+AnalysisVaaInfoIntra_ssse3:	; same result as AnalysisVaaInfoIntra_sse2, with the block averages computed via phaddw
+
+    %assign push_num 0
+    LOAD_2_PARA 		; NOTE(review): presumably r0 = pDataY, r1 = iLineSize (macro is defined in asm_inc.asm) - confirm
+    SIGN_EXTENTION r1,r1d
+
+%ifdef X86_32
+    push r3
+    push r4
+    push r5
+    push r6		; r6 is saved and restored but not otherwise referenced in this function
+    %assign push_num push_num+4
+%endif
+   
+    mov  r5,r7		; r7 is used as the stack pointer here (presumably esp/rsp via asm_inc.asm - confirm)
+    and  r5,0fh		; r5 = misalignment of r7 relative to 16 bytes; added back before returning
+    sub  r7,r5		; round r7 down to a 16-byte boundary, required by the movdqa loads from [r7] below
+    sub  r7,32		; reserve 32 bytes of scratch: sixteen word-sized block averages
+    
+
+    mov r2,r1    
+    sal r2,$1   ;r2 = 2*iLineSize
+    mov r3,r2
+    add r3,r1   ;r3 = 3*iLineSize
+    
+    mov r4,r2
+    sal r4,$1   ;r4 = 4*iLineSize
+     
+	pxor xmm7, xmm7		; zero register expected by VAA_AVG_BLOCK_SSSE3
+
+	; loops
+	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    movq [r7],xmm0		; averages of blocks 0..3 (rows 0..3)
+    
+	lea r0,[r0+r4]		; advance four rows
+	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+    movq [r7+8],xmm1		; blocks 4..7
+    
+    
+	lea r0,[r0+r4]
+	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    movq [r7+16],xmm0		; blocks 8..11
+    
+	lea r0,[r0+r4]
+	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+    movq [r7+24],xmm1		; blocks 12..15
+    
+    
+	movdqa xmm0,[r7]	; blocks 0..7
+	movdqa xmm1,[r7+16]	; blocks 8..15
+	movdqa xmm2, xmm0
+	paddw xmm0, xmm1
+	SUM_WORD_8x2_SSE2 xmm0, xmm3	; better performance than that of phaddw sets
+
+	pmullw xmm1, xmm1	; squares; each average is at most 255, so a square fits in an unsigned word
+	pmullw xmm2, xmm2
+	movdqa xmm3, xmm1
+	movdqa xmm4, xmm2
+	punpcklwd xmm1, xmm7	; zero-extend the squares to dwords
+	punpckhwd xmm3, xmm7
+	punpcklwd xmm2, xmm7
+	punpckhwd xmm4, xmm7
+	paddd xmm1, xmm2
+	paddd xmm3, xmm4
+	paddd xmm1, xmm3
+	pshufd xmm2, xmm1, 01Bh	; reverse dword order
+	paddd xmm1, xmm2
+	pshufd xmm2, xmm1, 0B1h	; swap adjacent dwords
+	paddd xmm1, xmm2	; every dword = sum of the sixteen squares
+
+    
+    movd r2d, xmm0
+    and r2, 0ffffh          ; only the low word holds the sum; discard the upper bits
+    mov r3, r2
+    imul r2, r3		; (sum of averages)^2
+    sar r2, $4		; divided by 16; $4 is NASM hex notation for 4
+    movd retrd, xmm1
+	sub retrd, r2d		; return sum(avg^2) - sum(avg)^2 / 16
+
+	add r7,32		; release the scratch area
+	add r7,r5		; undo the alignment adjustment
+%ifdef X86_32
+	pop r6
+	pop r5
+	pop r4
+	pop r3
+%endif
+	
+	ret
+
+WELS_EXTERN MdInterAnalysisVaaInfo_sse41
+;***********************************************************************
+;	uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
+;***********************************************************************
+ALIGN 16
+MdInterAnalysisVaaInfo_sse41:	; returns 15 when the four SADs are nearly uniform, otherwise a 4-bit mask of the SADs above the average
+	%assign push_num 0
+	LOAD_1_PARA		; NOTE(review): presumably r0 = pSad8x8 (macro is defined in asm_inc.asm) - confirm
+	movdqa xmm0,[r0]	; four 32-bit SADs; movdqa requires a 16-byte aligned pointer
+	pshufd xmm1, xmm0, 01Bh	; reverse dword order
+	paddd xmm1, xmm0
+	pshufd xmm2, xmm1, 0B1h	; swap adjacent dwords
+	paddd xmm1, xmm2	; every lane = sum of the four SADs
+	psrad xmm1, 02h		; iAverageSad
+	movdqa xmm2, xmm1
+	psrad xmm2, 06h		; average scaled down by 64
+	movdqa xmm3, xmm0	; iSadBlock
+	psrad xmm3, 06h		; each SAD scaled down by 64
+	psubd xmm3, xmm2	; per-block deviation from the average, in scaled units
+	pmulld xmm3, xmm3	; [comment]: pmulld from SSE4.1 instruction sets
+	pshufd xmm4, xmm3, 01Bh
+	paddd xmm4, xmm3
+	pshufd xmm3, xmm4, 0B1h
+	paddd xmm3, xmm4	; every lane = sum of the squared deviations
+	movd r0d, xmm3
+	cmp r0d, 20	; INTER_VARIANCE_SAD_THRESHOLD
+	
+	jb near .threshold_exit	; unsigned compare: below the threshold means the SADs are treated as uniform
+	pshufd xmm0, xmm0, 01Bh	; reverse lanes, so mask bit 0 corresponds to the last SAD in memory
+	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
+	movmskps retrd, xmm0	; collect the four lane sign bits into bits 0..3 of the return value
+	ret
+.threshold_exit:
+	mov retrd, 15		; all four bits set
+	ret
+
+WELS_EXTERN MdInterAnalysisVaaInfo_sse2
+;***********************************************************************
+;	uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
+;***********************************************************************
+ALIGN 16
+MdInterAnalysisVaaInfo_sse2:	; SSE2 variant of MdInterAnalysisVaaInfo_sse41: same result, with pmulld emulated via pmuludq
+	%assign push_num 0
+	LOAD_1_PARA		; NOTE(review): presumably r0 = pSad8x8 (macro is defined in asm_inc.asm) - confirm
+	movdqa xmm0, [r0]	; four 32-bit SADs; movdqa requires a 16-byte aligned pointer
+	pshufd xmm1, xmm0, 01Bh	; reverse dword order
+	paddd xmm1, xmm0
+	pshufd xmm2, xmm1, 0B1h	; swap adjacent dwords
+	paddd xmm1, xmm2	; every lane = sum of the four SADs
+	psrad xmm1, 02h		; iAverageSad
+	movdqa xmm2, xmm1
+	psrad xmm2, 06h		; average scaled down by 64
+	movdqa xmm3, xmm0	; iSadBlock
+	psrad xmm3, 06h		; each SAD scaled down by 64
+	psubd xmm3, xmm2	; per-block deviation from the average, in scaled units
+
+	; to replace pmulld functionality as below
+	movdqa xmm2, xmm3
+	pmuludq xmm2, xmm3	; 64-bit squares of lanes 0 and 2
+	pshufd xmm4, xmm3, 0B1h	; bring lanes 1 and 3 into the even positions
+	pmuludq xmm4, xmm4	; 64-bit squares of lanes 1 and 3
+	movdqa xmm5, xmm2
+	punpckldq xmm5, xmm4	; low dwords of squares 0 and 1 end up in lanes 0 and 1
+	punpckhdq xmm2, xmm4	; low dwords of squares 2 and 3 end up in lanes 0 and 1
+	punpcklqdq xmm5, xmm2	; xmm5 = low 32 bits of the four squares, matching what pmulld would produce
+
+	pshufd xmm4, xmm5, 01Bh
+	paddd xmm4, xmm5
+	pshufd xmm5, xmm4, 0B1h
+	paddd xmm5, xmm4	; every lane = sum of the squared deviations
+	
+	movd r0d, xmm5
+	cmp r0d, 20	; INTER_VARIANCE_SAD_THRESHOLD
+	jb near .threshold_exit	; unsigned compare: below the threshold means the SADs are treated as uniform
+	pshufd xmm0, xmm0, 01Bh	; reverse lanes, so mask bit 0 corresponds to the last SAD in memory
+	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
+	movmskps retrd, xmm0	; collect the four lane sign bits into bits 0..3 of the return value
+	ret
+.threshold_exit:
+	mov retrd, 15		; all four bits set
+	ret
--- a/codec/decoder/core/asm/asm_inc.asm
+++ /dev/null
@@ -1,235 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  sse2inc.asm
-;*
-;*  Abstract
-;*      macro and constant
-;*
-;*  History
-;*      8/5/2009 Created
-;*
-;*
-;*************************************************************************/
-;***********************************************************************
-; Options, for DEBUG
-;***********************************************************************
-
-%if 1
-	%define MOVDQ movdqa
-%else
-	%define MOVDQ movdqu
-%endif
-
-%if 1
-	%define WELSEMMS	emms
-%else
-	%define WELSEMMS
-%endif
-
-BITS 32
-
-;***********************************************************************
-; Macros
-;***********************************************************************
-
-%macro WELS_EXTERN 1
-	%ifdef PREFIX
-		global _%1
-		%define %1 _%1
-	%else
-		global %1
-	%endif
-%endmacro
-
-%macro WELS_AbsW 2
-	pxor        %2, %2
-    psubw       %2, %1
-    pmaxsw      %1, %2
-%endmacro
-
-%macro MMX_XSwap  4
-    movq		%4, %2
-    punpckh%1   %4, %3
-    punpckl%1   %2, %3
-%endmacro
-
-; pOut mm1, mm4, mm5, mm3
-%macro MMX_Trans4x4W 5
-    MMX_XSwap wd, %1, %2, %5
-    MMX_XSwap wd, %3, %4, %2
-    MMX_XSwap dq, %1, %3, %4
-    MMX_XSwap dq, %5, %2, %3
-%endmacro
-
-;for TRANSPOSE
-%macro SSE2_XSawp 4
-    movdqa      %4, %2
-    punpckl%1   %2, %3
-    punpckh%1   %4, %3
-%endmacro
-
-; in: xmm1, xmm2, xmm3, xmm4  pOut:  xmm1, xmm4, xmm5, mm3
-%macro SSE2_Trans4x4D 5
-    SSE2_XSawp dq,  %1, %2, %5
-    SSE2_XSawp dq,  %3, %4, %2
-    SSE2_XSawp qdq, %1, %3, %4
-    SSE2_XSawp qdq, %5, %2, %3
-%endmacro
-
-;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4
-%macro SSE2_TransTwo4x4W 5
-    SSE2_XSawp wd,  %1, %2, %5
-    SSE2_XSawp wd,  %3, %4, %2
-    SSE2_XSawp dq,  %1, %3, %4
-    SSE2_XSawp dq,  %5, %2, %3
-    SSE2_XSawp qdq, %1, %5, %2
-    SSE2_XSawp qdq, %4, %3, %5
-%endmacro
-
-;in:  m1, m2, m3, m4, m5, m6, m7, m8
-;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-%macro SSE2_TransTwo8x8B 9
-	movdqa	%9,	%8
-	SSE2_XSawp bw,  %1, %2, %8
-	SSE2_XSawp bw,  %3, %4, %2
-	SSE2_XSawp bw,  %5, %6, %4
-	movdqa	%6, %9
-	movdqa	%9, %4
-	SSE2_XSawp bw,  %7, %6, %4
-
-	SSE2_XSawp wd,  %1, %3, %6
-	SSE2_XSawp wd,  %8, %2, %3
-	SSE2_XSawp wd,  %5, %7, %2
-	movdqa	%7, %9
-	movdqa	%9, %3
-	SSE2_XSawp wd,  %7, %4, %3
-
-	SSE2_XSawp dq,  %1, %5, %4
-	SSE2_XSawp dq,  %6, %2, %5
-	SSE2_XSawp dq,  %8, %7, %2
-	movdqa	%7, %9
-	movdqa	%9, %5
-	SSE2_XSawp dq,  %7, %3, %5
-
-	SSE2_XSawp qdq,  %1, %8, %3
-	SSE2_XSawp qdq,  %4, %2, %8
-	SSE2_XSawp qdq,  %6, %7, %2
-	movdqa	%7, %9
-	movdqa	%9, %1
-	SSE2_XSawp qdq,  %7, %5, %1
-	movdqa	%5, %9
-%endmacro
-
-;xmm0, xmm6, xmm7, [eax], [ecx]
-;xmm7 = 0, eax = pix1, ecx = pix2, xmm0 save the result
-%macro SSE2_LoadDiff8P 5
-    movq         %1, %4
-    punpcklbw    %1, %3
-    movq         %2, %5
-    punpcklbw    %2, %3
-    psubw        %1, %2
-%endmacro
-
-; m2 = m1 + m2, m1 = m1 - m2
-%macro SSE2_SumSub 3
-	movdqa  %3, %2
-    paddw   %2, %1
-    psubw   %1, %3
-%endmacro
-
-
-%macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
-	mov %3h, %3l
-	movd %1, e%3x		; i.e, 1% = eax (=b0)
-	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
-	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
-%endmacro
-
-;copy a dw into a xmm for 8 times
-%macro  SSE2_Copy8Times 2
-		movd	%1, %2
-		punpcklwd %1, %1
-		pshufd	%1,	%1,	0
-%endmacro
-
-;copy a db into a xmm for 16 times
-%macro  SSE2_Copy16Times 2
-		movd		%1, %2
-		pshuflw		%1, %1, 0
-		punpcklqdq	%1, %1
-		packuswb	%1,	%1
-%endmacro
-
-
-
-;***********************************************************************
-;preprocessor constants
-;***********************************************************************
-;dw 32,32,32,32,32,32,32,32 for xmm
-;dw 32,32,32,32 for mm
-%macro WELS_DW32 1
-	pcmpeqw %1,%1
-	psrlw %1,15
-	psllw %1,5
-%endmacro
-
-;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
-;dw 1, 1, 1, 1 for mm
-%macro WELS_DW1 1
-	pcmpeqw %1,%1
-	psrlw %1,15
-%endmacro
-
-;all 0 for xmm and mm
-%macro	WELS_Zero 1
-	pxor %1, %1
-%endmacro
-
-;dd 1, 1, 1, 1 for xmm
-;dd 1, 1 for mm
-%macro WELS_DD1 1
-	pcmpeqw %1,%1
-	psrld %1,31
-%endmacro
-
-;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
-%macro WELS_DB1 1
-	pcmpeqw %1,%1
-	psrlw %1,15
-	packuswb %1,%1
-%endmacro
-
-
-
-
-
-
--- a/codec/decoder/core/asm/block_add.asm
+++ b/codec/decoder/core/asm/block_add.asm
@@ -42,263 +42,7 @@
 
 %include  "asm_inc.asm"
 
-BITS 32
-
 ;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-%macro   BLOCK_ADD_16_SSE2   4
-	movdqa    xmm0,       [%2]
-	movdqa    xmm1,       [%3]
-    movdqa    xmm2,       [%3+10h]
-	movdqa    xmm6,       xmm0
-
-	punpcklbw    xmm0,    xmm7
-	punpckhbw    xmm6,    xmm7
-
-	paddw        xmm0,    xmm1
-	paddw        xmm6,    xmm2
-
-	packuswb     xmm0,    xmm6
-	movdqa       [%1],    xmm0
-
-	lea          %2,      [%2+%4]
-	lea          %3,      [%3+%4*2]
-	lea          %1,      [%1+%4]
-%endmacro
-
-%macro    BLOCK_ADD_8_MMXEXT   4
-    movq       mm0,       [%2]
-	movq       mm1,       [%3]
-	movq       mm2,       [%3+08h]
-	movq       mm6,       mm0
-
-	punpcklbw    mm0,     mm7
-	punpckhbw    mm6,     mm7
-
-	paddw        mm0,     mm1
-	paddw        mm6,     mm2
-
-	packuswb     mm0,     mm6
-	movq         [%1],    mm0
-
-	lea          %2,      [%2+%4]
-	lea          %3,      [%3+%4*2]
-	lea          %1,      [%1+%4]
-%endmacro
-
-
-%macro    BLOCK_ADD_16_STRIDE_SSE2  5
-    movdqa    xmm0,       [%2]
-	movdqa    xmm1,       [%3]
-    movdqa    xmm2,       [%3+10h]
-	movdqa    xmm6,       xmm0
-
-	punpcklbw    xmm0,    xmm7
-	punpckhbw    xmm6,    xmm7
-
-	paddw        xmm0,    xmm1
-	paddw        xmm6,    xmm2
-
-	packuswb     xmm0,    xmm6
-	movdqa       [%1],    xmm0
-
-	lea          %2,      [%2+%4]
-	lea          %3,      [%3+%5*2]
-	lea          %1,      [%1+%4]
-%endmacro
-
-
-%macro    BLOCK_ADD_8_STRIDE_MMXEXT   5
-    movq       mm0,       [%2]
-	movq       mm1,       [%3]
-	movq       mm2,       [%3+08h]
-	movq       mm6,       mm0
-
-	punpcklbw    mm0,     mm7
-	punpckhbw    mm6,     mm7
-
-	paddw        mm0,     mm1
-	paddw        mm6,     mm2
-
-	packuswb     mm0,     mm6
-	movq         [%1],    mm0
-
-	lea          %2,      [%2+%4]
-	lea          %3,      [%3+%5*2]
-	lea          %1,      [%1+%4]
-%endmacro
-
-%macro    BLOCK_ADD_8_STRIDE_2_LINES_SSE2   5
-	movdqa xmm1, [%3]
-	movq xmm0, [%2]
-	punpcklbw xmm0, xmm7
-	paddw xmm0, xmm1
-	packuswb xmm0, xmm7
-	movq [%1], xmm0
-
-	movdqa xmm3, [%3+%5*2]
-	movq xmm2, [%2+%4]
-	punpcklbw xmm2, xmm7
-	paddw xmm2, xmm3
-	packuswb xmm2, xmm7
-	movq [%1+%4], xmm2
-
-	lea %1, [%1+%4*2]
-	lea %2, [%2+%4*2]
-	lea %3, [%3+%5*4]
-%endmacro
-
-%macro   CHECK_DATA_16_ZERO_SSE4     3
-    mov        eax,      0h
-	movdqa     xmm0,     [%1]
-	movdqa     xmm1,     [%1+10h]
-	mov        ebx,       [ecx]
-
-	por		   xmm0,	 xmm1
-	ptest      xmm7,     xmm0
-	cmovae     eax,      %3
-
-	add        %1,       20h
-	add        ecx,      04h
-	mov        byte [%2+ebx],  al
-%endmacro
-
-%macro  CHECK_RS_4x4_BLOCK_2_ZERO_SSE4   5
-    movdqa     xmm0,      [%1]
-    movdqa     xmm1,      [%1+%3]
-    movdqa     xmm2,      [%1+%3*2]
-    movdqa     xmm3,      [%1+%4]
-
-    mov        eax,       0h
-    mov        ebx,       0h
-    movdqa     xmm4,      xmm0
-    movdqa     xmm5,      xmm2
-
-    punpcklqdq  xmm0,     xmm1
-    punpckhqdq  xmm4,     xmm1
-    punpcklqdq  xmm2,     xmm3
-    punpckhqdq  xmm5,     xmm3
-
-	por			xmm0,	  xmm2
-	por			xmm4,	  xmm5
-
-    ptest       xmm7,     xmm0
-    cmovae      eax,      %5
-    ptest       xmm7,     xmm4
-    cmovae      ebx,      %5
-
-    mov     byte [%2],    al
-    mov     byte [%2+1],  bl
-%endmacro
-
-%macro   DATA_COPY_16x2_SSE2      3
-    movdqa     xmm0,    [%1]
-	movdqa     xmm1,    [%1+10h]
-	movdqa     xmm2,    [%1+%3]
-	movdqa     xmm3,    [%1+%3+10h]
-
-	movdqa     [%2],    xmm0
-	movdqa     [%2+10h],  xmm1
-	movdqa     [%2+20h],  xmm2
-	movdqa     [%2+30h],  xmm3
-
-	lea        %1,      [%1+%3*2]
-	lea        %2,      [%2+40h]
-%endmacro
-
-
-%macro   DATA_COPY_8x4_SSE2      4
-    movdqa     xmm0,         [%1]
-	movdqa     xmm1,         [%1+%3]
-	movdqa     xmm2,         [%1+%3*2]
-	movdqa     xmm3,         [%1+%4]
-
-	movdqa     [%2],         xmm0
-	movdqa     [%2+10h],     xmm1
-	movdqa     [%2+20h],     xmm2
-	movdqa     [%2+30h],     xmm3
-
-	lea        %1,           [%1+%3*4]
-	lea        %2,           [%2+40h]
-%endmacro
-
-
-%macro   CHECK_DATA_16_ZERO_SSE2   3
-    mov        eax,       0h
-    movdqa     xmm0,      [%1]
-    movdqa     xmm1,      [%1+10h]
-    mov        ebx,       [ecx]
-
-    pcmpeqw    xmm0,      xmm7
-    pcmpeqw    xmm1,      xmm7
-    packsswb   xmm0,      xmm1
-    pmovmskb   edx,       xmm0
-    sub        edx,       0ffffh
-
-    cmovb      eax,       ebp
-    add        ecx,       4
-    add        %1,        20h
-    mov      byte [%2+ebx],    al
-%endmacro
-
-
-
-%macro   CHECK_RS_4x4_BLOCK_2_ZERO_SSE2    5
-    movdqa    xmm0,      [%1]
-    movdqa    xmm1,      [%1 + %3]
-    movdqa    xmm2,      [%1 + %3*2]
-    movdqa    xmm3,      [%1 + %4]
-
-    movdqa    xmm4,       xmm0
-    movdqa    xmm5,       xmm2
-
-    punpcklqdq   xmm0,    xmm1
-    punpckhqdq   xmm4,    xmm1
-    punpcklqdq   xmm2,    xmm3
-    punpckhqdq   xmm5,    xmm3
-
-    pcmpeqw      xmm0,    xmm7
-    pcmpeqw      xmm2,    xmm7
-    pcmpeqw      xmm4,    xmm7
-    pcmpeqw      xmm5,    xmm7
-
-    packsswb     xmm0,    xmm2
-    packsswb     xmm4,    xmm5
-    pmovmskb     eax,     xmm0
-    pmovmskb     ebx,     xmm4
-
-    sub          eax,     0ffffh
-    mov          eax,     0
-    cmovb        eax,     %5
-    sub          ebx,     0ffffh
-    mov          ebx,     0
-    cmovb        ebx,     %5
-    mov       byte [%2],    al
-    mov       byte [%2+1],  bl
-%endmacro
-
-;*******************************************************************************
-; Data
-;*******************************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata data
-%else
-SECTION .rodata align=16
-%endif
-
-ALIGN  16
-SubMbScanIdx:
-     dd    0x0,  0x1,  0x4,  0x5,
-	 dd    0x2,  0x3,  0x6,  0x7,
-	 dd    0x8,  0x9,  0xc,  0xd,
-	 dd    0xa,  0xb,  0xe,  0xf,
-	 dd    0x10, 0x11, 0x14, 0x15,
-	 dd    0x12, 0x13, 0x16, 0x17,
-
-;*******************************************************************************
 ; Code
 ;*******************************************************************************
 
@@ -312,71 +56,77 @@
 ;  void_t WelsResBlockZero16x16_sse2(int16_t* pBlock,int32_t iStride)
 ;*******************************************************************************
 WelsResBlockZero16x16_sse2:
-    push     esi
+        ; the former "push esi" is gone: no register is pushed, so push_num = 0
+        %assign push_num 0
+        LOAD_2_PARA
+		%ifndef X86_32
+		movsx r1, r1d	; non-32-bit builds only: sign-extend the int32_t stride to the full register
+		%endif
+	; r0 = pBlock (was esi), r1 = iStride (was ecx); LOAD_2_PARA, defined outside this hunk, is expected to load both -- confirm
+	; NOTE(review): xmm7 is overwritten below and never saved; check whether any supported ABI treats it as callee-saved
+	; r1 = iStride*2: byte pitch of one row (presumably because each element is an int16_t)
+        lea 	r1, 	[r1*2]
+	; r2 = r1*3: byte offset of the fourth row within each group of four rows
+        lea 	r2,	[r1*3]
 
-	mov      esi,        [esp+08h]
-	mov      ecx,        [esp+0ch]
-	lea      ecx,        [ecx*2]
-	lea      eax,        [ecx*3]
-
 	pxor     xmm7,       xmm7
 
     ; four  lines
-	movdqa   [esi],      xmm7
-	movdqa   [esi+10h],  xmm7
+	movdqa   [r0],      xmm7	; movdqa faults on a misaligned address, so every row must start 16-byte aligned
+	movdqa   [r0+10h],  xmm7
 
-	movdqa   [esi+ecx],  xmm7
-	movdqa   [esi+ecx+10h],     xmm7
+	movdqa   [r0+r1],  xmm7
+	movdqa   [r0+r1+10h],     xmm7
 
-    movdqa   [esi+ecx*2],   xmm7
-	movdqa   [esi+ecx*2+10h],   xmm7
+    movdqa   [r0+r1*2],   xmm7
+	movdqa   [r0+r1*2+10h],   xmm7
 
-	movdqa   [esi+eax],     xmm7
-	movdqa   [esi+eax+10h],     xmm7
+	movdqa   [r0+r2],     xmm7
+	movdqa   [r0+r2+10h],     xmm7
 
     ;  four lines
-	lea      esi,       [esi+ecx*4]
-	movdqa   [esi],      xmm7
-	movdqa   [esi+10h],  xmm7
+	lea      r0,       [r0+r1*4]	; advance r0 by four row pitches
+	movdqa   [r0],      xmm7
+	movdqa   [r0+10h],  xmm7
 
-	movdqa   [esi+ecx],  xmm7
-	movdqa   [esi+ecx+10h],     xmm7
+	movdqa   [r0+r1],  xmm7
+	movdqa   [r0+r1+10h],     xmm7
 
-    movdqa   [esi+ecx*2],   xmm7
-	movdqa   [esi+ecx*2+10h],   xmm7
+    movdqa   [r0+r1*2],   xmm7
+	movdqa   [r0+r1*2+10h],   xmm7
 
-	movdqa   [esi+eax],     xmm7
-	movdqa   [esi+eax+10h],     xmm7
+	movdqa   [r0+r2],     xmm7
+	movdqa   [r0+r2+10h],     xmm7
 
 	;  four lines
-	lea      esi,       [esi+ecx*4]
-	movdqa   [esi],      xmm7
-	movdqa   [esi+10h],  xmm7
+	lea      r0,       [r0+r1*4]	; advance r0 by four row pitches
+	movdqa   [r0],      xmm7
+	movdqa   [r0+10h],  xmm7
 
-	movdqa   [esi+ecx],  xmm7
-	movdqa   [esi+ecx+10h],     xmm7
+	movdqa   [r0+r1],  xmm7
+	movdqa   [r0+r1+10h],     xmm7
 
-    movdqa   [esi+ecx*2],   xmm7
-	movdqa   [esi+ecx*2+10h],   xmm7
+    movdqa   [r0+r1*2],   xmm7
+	movdqa   [r0+r1*2+10h],   xmm7
 
-	movdqa   [esi+eax],     xmm7
-	movdqa   [esi+eax+10h],     xmm7
+	movdqa   [r0+r2],     xmm7
+	movdqa   [r0+r2+10h],     xmm7
 
 	;  four lines
-	lea      esi,       [esi+ecx*4]
-	movdqa   [esi],      xmm7
-	movdqa   [esi+10h],  xmm7
+	lea      r0,       [r0+r1*4]	; advance r0 by four row pitches
+	movdqa   [r0],      xmm7
+	movdqa   [r0+10h],  xmm7
 
-	movdqa   [esi+ecx],  xmm7
-	movdqa   [esi+ecx+10h],     xmm7
+	movdqa   [r0+r1],  xmm7
+	movdqa   [r0+r1+10h],     xmm7
 
-    movdqa   [esi+ecx*2],   xmm7
-	movdqa   [esi+ecx*2+10h],   xmm7
+    movdqa   [r0+r1*2],   xmm7
+	movdqa   [r0+r1*2+10h],   xmm7
 
-	movdqa   [esi+eax],     xmm7
-	movdqa   [esi+eax+10h],     xmm7
+	movdqa   [r0+r2],     xmm7
+	movdqa   [r0+r2+10h],     xmm7
 
-    pop      esi
+    ; no matching pop: nothing was pushed on entry
 	ret
 
 
@@ -387,27 +137,31 @@
 ;  void_t WelsResBlockZero8x8_sse2(int16_t * pBlock, int32_t iStride)
 ;*******************************************************************************
 WelsResBlockZero8x8_sse2:
-	  push      esi
+	  ; the former "push esi" is gone: no register is pushed, so push_num = 0
+	  %assign push_num 0
+          LOAD_2_PARA
+		  %ifndef X86_32
+		  movsx r1, r1d	; non-32-bit builds only: sign-extend the int32_t stride to the full register
+		  %endif
+      	  ; r0 = pBlock (was esi), r1 = iStride (was ecx); LOAD_2_PARA, defined outside this hunk, is expected to load both -- confirm
+	  ; NOTE(review): xmm7 is overwritten below and never saved; check whether any supported ABI treats it as callee-saved
+	  lea       r1,     [r1*2]	; r1 = iStride*2: byte pitch of one row (presumably because each element is an int16_t)
+	  lea       r2,     [r1*3]	; r2 = r1*3: byte offset of the fourth row within each group of four rows
 
-      mov       esi,     [esp+08h]
-	  mov       ecx,     [esp+0ch]
-	  lea       ecx,     [ecx*2]
-	  lea       eax,     [ecx*3]
-
 	  pxor      xmm7,          xmm7
 
-	  movdqa    [esi],         xmm7
-	  movdqa    [esi+ecx],     xmm7
-	  movdqa    [esi+ecx*2],   xmm7
-	  movdqa    [esi+eax],     xmm7
+	  movdqa    [r0],         xmm7	; movdqa faults on a misaligned address, so every row must start 16-byte aligned
+	  movdqa    [r0+r1],     xmm7
+	  movdqa    [r0+r1*2],   xmm7
+	  movdqa    [r0+r2],     xmm7
 
-	  lea       esi,     [esi+ecx*4]
-	  movdqa    [esi],         xmm7
-	  movdqa    [esi+ecx],     xmm7
-	  movdqa    [esi+ecx*2],   xmm7
-	  movdqa    [esi+eax],     xmm7
+	  lea       r0,     [r0+r1*4]	; advance r0 by four row pitches
+	  movdqa    [r0],         xmm7
+	  movdqa    [r0+r1],     xmm7
+	  movdqa    [r0+r1*2],   xmm7
+	  movdqa    [r0+r2],     xmm7
 
 
-	  pop       esi
+	  ; no matching pop: nothing was pushed on entry
 	  ret
 
--- a/codec/decoder/core/asm/cpuid.asm
+++ /dev/null
@@ -1,169 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*	cpu_mmx.asm
-;*
-;*  Abstract
-;*		verify cpuid feature support and cpuid detection
-;*
-;*  History
-;*      04/29/2009	Created
-;*
-;*************************************************************************/
-
-bits 32
-
-;******************************************************************************************
-; Macros
-;******************************************************************************************
-
-%macro WELS_EXTERN 1
-	%ifdef PREFIX
-		global _%1
-		%define %1 _%1
-	%else
-		global %1
-	%endif
-%endmacro
-
-;******************************************************************************************
-; Code
-;******************************************************************************************
-
-SECTION .text
-
-; refer to "The IA-32 Intel(R) Architecture Software Developers Manual, Volume 2A A-M"
-; section CPUID - CPU Identification
-
-WELS_EXTERN WelsCPUIdVerify
-ALIGN 16
-;******************************************************************************************
-;   int32_t WelsCPUIdVerify()
-;******************************************************************************************
-WelsCPUIdVerify:
-    pushfd					; decrease the SP by 4 and load EFLAGS register onto stack, pushfd 32 bit and pushf for 16 bit
-	pushfd					; need push 2 EFLAGS, one for processing and the another one for storing purpose
-    pop     ecx				; get EFLAGS to bit manipulation
-    mov     eax, ecx		; store into ecx followed
-    xor     eax, 00200000h	; get ID flag (bit 21) of EFLAGS to directly indicate cpuid support or not
-	xor		eax, ecx		; get the ID flag bitwise, eax - 0: not support; otherwise: support
-    popfd					; store back EFLAGS and keep unchanged for system
-    ret
-
-WELS_EXTERN WelsCPUId
-ALIGN 16
-;****************************************************************************************************
-;   void WelsCPUId( int32_t index, int32_t *uiFeatureA, int32_t *uiFeatureB, int32_t *uiFeatureC, int32_t *uiFeatureD )
-;****************************************************************************************************
-WelsCPUId:
-	push	ebx
-	push	edi
-
-	mov     eax, [esp+12]	; operating index
-    cpuid					; cpuid
-
-	; processing various information return
-	mov     edi, [esp+16]
-    mov     [edi], eax
-    mov     edi, [esp+20]
-    mov     [edi], ebx
-    mov     edi, [esp+24]
-    mov     [edi], ecx
-    mov     edi, [esp+28]
-    mov     [edi], edx
-
-	pop		edi
-    pop     ebx
-	ret
-
-WELS_EXTERN WelsCPUSupportAVX
-; need call after cpuid=1 and eax, ecx flag got then
-ALIGN 16
-;****************************************************************************************************
-;   int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx )
-;****************************************************************************************************
-WelsCPUSupportAVX:
-	mov eax, [esp+4]
-	mov ecx, [esp+8]
-
-	; refer to detection of AVX addressed in INTEL AVX manual document
-	and ecx, 018000000H
-	cmp ecx, 018000000H		; check both OSXSAVE and AVX feature flags
-	jne avx_not_supported
-	; processor supports AVX instructions and XGETBV is enabled by OS
-	mov ecx, 0				; specify 0 for XFEATURE_ENABLED_MASK register
-	XGETBV					; result in EDX:EAX
-	and eax, 06H
-	cmp eax, 06H			; check OS has enabled both XMM and YMM state support
-	jne avx_not_supported
-	mov eax, 1
-	ret
-avx_not_supported:
-	mov eax, 0
-	ret
-
-WELS_EXTERN WelsCPUSupportFMA
-; need call after cpuid=1 and eax, ecx flag got then
-ALIGN 16
-;****************************************************************************************************
-;   int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx )
-;****************************************************************************************************
-WelsCPUSupportFMA:
-	mov eax, [esp+4]
-	mov ecx, [esp+8]
-
-	; refer to detection of FMA addressed in INTEL AVX manual document
-	and ecx, 018001000H
-	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
-	jne fma_not_supported
-	; processor supports AVX,FMA instructions and XGETBV is enabled by OS
-	mov ecx, 0				; specify 0 for XFEATURE_ENABLED_MASK register
-	XGETBV					; result in EDX:EAX
-	and eax, 06H
-	cmp eax, 06H			; check OS has enabled both XMM and YMM state support
-	jne fma_not_supported
-	mov eax, 1
-	ret
-fma_not_supported:
-	mov eax, 0
-	ret
-
-WELS_EXTERN WelsEmms
-ALIGN 16
-;******************************************************************************************
-;   void WelsEmms()
-;******************************************************************************************
-WelsEmms:
-	emms	; empty mmx technology states
-	ret
-
-
-
--- a/codec/decoder/core/asm/dct.asm
+++ b/codec/decoder/core/asm/dct.asm
@@ -42,8 +42,6 @@
 
 %include "asm_inc.asm"
 
-BITS 32
-
 ;*******************************************************************************
 ; Macros and other preprocessor constants
 ;*******************************************************************************
@@ -93,20 +91,16 @@
 ;*******************************************************************************
 
 IdctResAddPred_mmx:
+    %assign push_num 0
+    LOAD_3_PARA	; expected to load r0 = pPred, r1 = kiStride, r2 = pRs (argument order of the removed %defines) -- confirm against the macro
+	%ifndef X86_32
+	movsx r1, r1d	; non-32-bit builds only: sign-extend the 32-bit stride to the full register
+	%endif
+    movq    mm0, [r2+ 0]	; mm0..mm3 = four consecutive 8-byte groups at r2 (r2 replaces eax, which held pRs)
+    movq    mm1, [r2+ 8]
+    movq    mm2, [r2+16]
+    movq    mm3, [r2+24]
 
-%define	pushsize	0
-%define pPred       esp+pushsize+4
-%define kiStride     esp+pushsize+8
-%define pRs         esp+pushsize+12
-
-	mov     eax, [pRs   ]
-    mov     edx, [pPred ]
-    mov     ecx, [kiStride]
-    movq    mm0, [eax+ 0]
-    movq    mm1, [eax+ 8]
-    movq    mm2, [eax+16]
-    movq    mm3, [eax+24]
-
 	MMX_Trans4x4W        mm0, mm1, mm2, mm3, mm4
 	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
     MMX_Trans4x4W        mm1, mm3, mm0, mm4, mm2
@@ -115,15 +109,12 @@
     WELS_Zero			mm7
     WELS_DW32			mm6
 
-    MMX_StoreDiff4P    mm3, mm0, mm6, mm7, [edx]
-    MMX_StoreDiff4P    mm4, mm0, mm6, mm7, [edx+ecx]
-    lea     edx, [edx+2*ecx]
-    MMX_StoreDiff4P    mm1, mm0, mm6, mm7, [edx]
-    MMX_StoreDiff4P    mm2, mm0, mm6, mm7, [edx+ecx]
+    MMX_StoreDiff4P    mm3, mm0, mm6, mm7, [r0]	; r0 replaces edx (pPred): row 0
+    MMX_StoreDiff4P    mm4, mm0, mm6, mm7, [r0+r1]	; r1 replaces ecx (kiStride): row 1
+    lea     r0, [r0+2*r1]	; advance the destination pointer by two rows
+    MMX_StoreDiff4P    mm1, mm0, mm6, mm7, [r0]	; row 2
+    MMX_StoreDiff4P    mm2, mm0, mm6, mm7, [r0+r1]	; row 3
 
-%undef	pushsize
-%undef  pPred
-%undef  kiStride
-%undef  pRs
+    ; the emms below empties the MMX state before returning
 	emms
     ret
--- a/codec/decoder/core/asm/deblock.asm
+++ /dev/null
@@ -1,2113 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  deblock.asm
-;*
-;*  Abstract
-;*      edge loop
-;*
-;*  History
-;*      08/07/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-SECTION .text
-
-;********************************************************************************
-;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;                             int32_t iAlpha, int32_t iBeta)
-;********************************************************************************
-WELS_EXTERN   DeblockChromaEq4V_sse2
-
-ALIGN  16
-DeblockChromaEq4V_sse2:
-  push        ebp
-  mov         ebp,esp
-  and         esp,0FFFFFFF0h
-  sub         esp,68h
-  mov         edx,[ebp+10h]      ;  iStride
-  mov         eax,[ebp+8]        ;  pPixCb
-  mov         ecx,[ebp+0Ch]      ;  pPixCr
-  movq        xmm4,[ecx]
-  movq        xmm5,[edx+ecx]
-  push        esi
-  push        edi
-  lea         esi,[edx+edx]
-  mov         edi,eax
-  sub         edi,esi
-  movq        xmm1,[edi]
-  mov         edi,ecx
-  sub         edi,esi
-  movq        xmm2,[edi]
-  punpcklqdq  xmm1,xmm2
-  mov         esi,eax
-  sub         esi,edx
-  movq        xmm2,[esi]
-  mov         edi,ecx
-  sub         edi,edx
-  movq        xmm3,[edi]
-  punpcklqdq  xmm2,xmm3
-  movq        xmm3,[eax]
-  punpcklqdq  xmm3,xmm4
-  movq        xmm4,[edx+eax]
-  mov       edx, [ebp + 14h]
-  punpcklqdq  xmm4,xmm5
-  movd        xmm5,edx
-  mov       edx, [ebp + 18h]
-  pxor        xmm0,xmm0
-  movdqa      xmm6,xmm5
-  punpcklwd   xmm6,xmm5
-  pshufd      xmm5,xmm6,0
-  movd        xmm6,edx
-  movdqa      xmm7,xmm6
-  punpcklwd   xmm7,xmm6
-  pshufd      xmm6,xmm7,0
-  movdqa      xmm7,xmm1
-  punpckhbw   xmm1,xmm0
-  punpcklbw   xmm7,xmm0
-  movdqa      [esp+40h],xmm1
-  movdqa      [esp+60h],xmm7
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm7,xmm0
-  movdqa      [esp+10h],xmm7
-  movdqa      xmm7,xmm3
-  punpcklbw   xmm7,xmm0
-  punpckhbw   xmm3,xmm0
-  movdqa      [esp+50h],xmm7
-  movdqa      xmm7,xmm4
-  punpckhbw   xmm4,xmm0
-  punpckhbw   xmm2,xmm0
-  punpcklbw   xmm7,xmm0
-  movdqa      [esp+30h],xmm3
-  movdqa      xmm3,[esp+10h]
-  movdqa      xmm1,xmm3
-  psubw       xmm1,[esp+50h]
-  pabsw       xmm1,xmm1
-  movdqa      [esp+20h],xmm4
-  movdqa      xmm0,xmm5
-  pcmpgtw     xmm0,xmm1
-  movdqa      xmm1,[esp+60h]
-  psubw       xmm1,xmm3
-  pabsw       xmm1,xmm1
-  movdqa      xmm4,xmm6
-  pcmpgtw     xmm4,xmm1
-  pand        xmm0,xmm4
-  movdqa      xmm1,xmm7
-  psubw       xmm1,[esp+50h]
-  pabsw       xmm1,xmm1
-  movdqa      xmm4,xmm6
-  pcmpgtw     xmm4,xmm1
-  movdqa      xmm1,xmm2
-  psubw       xmm1,[esp+30h]
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm5,xmm1
-  movdqa      xmm1,[esp+40h]
-  pand        xmm0,xmm4
-  psubw       xmm1,xmm2
-  pabsw       xmm1,xmm1
-  movdqa      xmm4,xmm6
-  pcmpgtw     xmm4,xmm1
-  movdqa      xmm1,[esp+20h]
-  psubw       xmm1,[esp+30h]
-  pand        xmm5,xmm4
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm6,xmm1
-  pand        xmm5,xmm6
-  mov         edx,2
-  movsx       edx,dx
-  movd        xmm1,edx
-  movdqa      xmm4,xmm1
-  punpcklwd   xmm4,xmm1
-  pshufd      xmm1,xmm4,0
-  movdqa      xmm4,[esp+60h]
-  movdqa      xmm6,xmm4
-  paddw       xmm6,xmm4
-  paddw       xmm6,xmm3
-  paddw       xmm6,xmm7
-  movdqa      [esp+10h],xmm1
-  paddw       xmm6,[esp+10h]
-  psraw       xmm6,2
-  movdqa      xmm4,xmm0
-  pandn       xmm4,xmm3
-  movdqa      xmm3,[esp+40h]
-  movdqa      xmm1,xmm0
-  pand        xmm1,xmm6
-  por         xmm1,xmm4
-  movdqa      xmm6,xmm3
-  paddw       xmm6,xmm3
-  movdqa      xmm3,[esp+10h]
-  paddw       xmm6,xmm2
-  paddw       xmm6,[esp+20h]
-  paddw       xmm6,xmm3
-  psraw       xmm6,2
-  movdqa      xmm4,xmm5
-  pand        xmm4,xmm6
-  movdqa      xmm6,xmm5
-  pandn       xmm6,xmm2
-  por         xmm4,xmm6
-  packuswb    xmm1,xmm4
-  movdqa      xmm4,[esp+50h]
-  movdqa      xmm6,xmm7
-  paddw       xmm6,xmm7
-  paddw       xmm6,xmm4
-  paddw       xmm6,[esp+60h]
-  paddw       xmm6,xmm3
-  psraw       xmm6,2
-  movdqa      xmm2,xmm0
-  pand        xmm2,xmm6
-  pandn       xmm0,xmm4
-  por         xmm2,xmm0
-  movdqa      xmm0,[esp+20h]
-  movdqa      xmm6,xmm0
-  paddw       xmm6,xmm0
-  movdqa      xmm0,[esp+30h]
-  paddw       xmm6,xmm0
-  paddw       xmm6,[esp+40h]
-  movdqa      xmm4,xmm5
-  paddw       xmm6,xmm3
-  movq        [esi],xmm1
-  psraw       xmm6,2
-  pand        xmm4,xmm6
-  pandn       xmm5,xmm0
-  por         xmm4,xmm5
-  packuswb    xmm2,xmm4
-  movq        [eax],xmm2
-  psrldq      xmm1,8
-  movq        [edi],xmm1
-  pop         edi
-  psrldq      xmm2,8
-  movq        [ecx],xmm2
-  pop         esi
-  mov         esp,ebp
-  pop         ebp
-  ret
-
-;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN  DeblockChromaLt4V_sse2
-
-DeblockChromaLt4V_sse2:
-  push        ebp
-  mov         ebp,esp
-  and         esp,0FFFFFFF0h
-  sub         esp,0E4h
-  push        ebx
-  push        esi
-  mov         esi, [ebp+1Ch]      ;  pTC
-  movsx       ebx, byte [esi+2]
-  push        edi
-  movsx       di,byte [esi+3]
-  mov         word [esp+0Ch],bx
-  movsx       bx,byte  [esi+1]
-  movsx       esi,byte  [esi]
-  mov         word  [esp+0Eh],si
-  movzx       esi,di
-  movd        xmm1,esi
-  movzx       esi,di
-  movd        xmm2,esi
-  mov         si,word  [esp+0Ch]
-  mov         edx, [ebp + 10h]
-  mov         eax, [ebp + 08h]
-  movzx       edi,si
-  movzx       esi,si
-  mov         ecx, [ebp + 0Ch]
-  movd        xmm4,esi
-  movzx       esi,bx
-  movd        xmm5,esi
-  movd        xmm3,edi
-  movzx       esi,bx
-  movd        xmm6,esi
-  mov         si,word [esp+0Eh]
-  movzx       edi,si
-  movzx       esi,si
-  punpcklwd   xmm6,xmm2
-  pxor        xmm0,xmm0
-  movdqa      [esp+40h],xmm0
-  movd        xmm7,edi
-  movd        xmm0,esi
-  lea         esi,[edx+edx]
-  mov         edi,eax
-  sub         edi,esi
-  punpcklwd   xmm5,xmm1
-  movdqa      xmm1,[esp+40h]
-  punpcklwd   xmm0,xmm4
-  movq        xmm4,[edx+ecx]
-  punpcklwd   xmm7,xmm3
-  movq        xmm3,[eax]
-  punpcklwd   xmm0,xmm6
-  movq        xmm6,[edi]
-  punpcklwd   xmm7,xmm5
-  punpcklwd   xmm0,xmm7
-  mov         edi,ecx
-  sub         edi,esi
-  movdqa      xmm2,xmm1
-  psubw       xmm2,xmm0
-  movdqa      [esp+60h],xmm2
-  movq        xmm2, [edi]
-  punpcklqdq  xmm6,xmm2
-  mov         esi,eax
-  sub         esi,edx
-  movq        xmm7,[esi]
-  mov         edi,ecx
-  sub         edi,edx
-  movq        xmm2,[edi]
-  punpcklqdq  xmm7,xmm2
-  movq        xmm2,[ecx]
-  punpcklqdq  xmm3,xmm2
-  movq        xmm2,[edx+eax]
-  movsx       edx,word [ebp + 14h]
-  punpcklqdq  xmm2,xmm4
-  movdqa      [esp+0E0h],xmm2
-  movd        xmm2,edx
-  movsx       edx,word [ebp + 18h]
-  movdqa      xmm4,xmm2
-  punpcklwd   xmm4,xmm2
-  movd        xmm2,edx
-  movdqa      xmm5,xmm2
-  punpcklwd   xmm5,xmm2
-  pshufd      xmm2,xmm5,0
-  movdqa      [esp+50h],xmm2
-  movdqa      xmm2,xmm6
-  punpcklbw   xmm2,xmm1
-  movdqa      [esp+0D0h],xmm3
-  pshufd      xmm4,xmm4,0
-  movdqa      [esp+30h],xmm2
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+80h],xmm6
-  movdqa      xmm6,[esp+0D0h]
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+70h],xmm6
-  movdqa      xmm6, [esp+0E0h]
-  punpckhbw   xmm6,xmm1
-  movdqa     [esp+90h],xmm6
-  movdqa      xmm5, [esp+0E0h]
-  movdqa      xmm2,xmm7
-  punpckhbw   xmm7,xmm1
-  punpcklbw   xmm5,xmm1
-  movdqa       [esp+0A0h],xmm7
-  punpcklbw   xmm3,xmm1
-  mov         edx,4
-  punpcklbw   xmm2,xmm1
-  movsx       edx,dx
-  movd        xmm6,edx
-  movdqa      xmm7,xmm6
-  punpcklwd   xmm7,xmm6
-  pshufd      xmm6,xmm7,0
-  movdqa      xmm7,[esp+30h]
-  movdqa      [esp+20h],xmm6
-  psubw       xmm7,xmm5
-  movdqa      xmm6,xmm0
-  pcmpgtw     xmm6,xmm1
-  movdqa      xmm1,[esp+60h]
-  movdqa      [esp+40h],xmm6
-  movdqa      xmm6,xmm3
-  psubw       xmm6,xmm2
-  psllw       xmm6,2
-  paddw       xmm6,xmm7
-  paddw       xmm6, [esp+20h]
-  movdqa      xmm7, [esp+50h]
-  psraw       xmm6,3
-  pmaxsw      xmm1,xmm6
-  movdqa      [esp+10h],xmm0
-  movdqa      xmm6, [esp+10h]
-  pminsw      xmm6,xmm1
-  movdqa      [esp+10h],xmm6
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm3
-  pabsw       xmm1,xmm1
-  movdqa      xmm6,xmm4
-  pcmpgtw     xmm6,xmm1
-  movdqa      xmm1, [esp+30h]
-  psubw       xmm1,xmm2
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm7,xmm1
-  movdqa      xmm1,[esp+50h]
-  pand        xmm6,xmm7
-  movdqa      xmm7,[esp+50h]
-  psubw       xmm5,xmm3
-  pabsw       xmm5,xmm5
-  pcmpgtw     xmm1,xmm5
-  movdqa      xmm5,[esp+80h]
-  psubw       xmm5,[esp+90h]
-  pand        xmm6,xmm1
-  pand        xmm6,[esp+40h]
-  movdqa      xmm1,[esp+10h]
-  pand        xmm1,xmm6
-  movdqa      xmm6,[esp+70h]
-  movdqa      [esp+30h],xmm1
-  movdqa      xmm1,[esp+0A0h]
-  psubw       xmm6,xmm1
-  psllw       xmm6,2
-  paddw       xmm6,xmm5
-  paddw       xmm6,[esp+20h]
-  movdqa      xmm5,[esp+60h]
-  psraw       xmm6,3
-  pmaxsw      xmm5,xmm6
-  pminsw      xmm0,xmm5
-  movdqa      xmm5,[esp+70h]
-  movdqa      xmm6,xmm1
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm4,xmm6
-  movdqa      xmm6,[esp+80h]
-  psubw       xmm6,xmm1
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+90h]
-  pand        xmm4,xmm7
-  movdqa      xmm7,[esp+50h]
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  pand        xmm4,xmm7
-  pand        xmm4,[esp+40h]
-  pand        xmm0,xmm4
-  movdqa      xmm4,[esp+30h]
-  paddw       xmm2,xmm4
-  paddw       xmm1,xmm0
-  packuswb    xmm2,xmm1
-  movq        [esi],xmm2
-  psubw       xmm3,xmm4
-  psubw       xmm5,xmm0
-  packuswb    xmm3,xmm5
-  movq        [eax],xmm3
-  psrldq      xmm2,8
-  movq        [edi],xmm2
-  pop         edi
-  pop         esi
-  psrldq      xmm3,8
-  movq        [ecx],xmm3
-  pop         ebx
-  mov         esp,ebp
-  pop         ebp
-  ret
-
-;***************************************************************************
-;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;          int32_t iAlpha, int32_t iBeta)
-;***************************************************************************
-
-WELS_EXTERN     DeblockChromaEq4H_sse2
-
-ALIGN  16
-
-DeblockChromaEq4H_sse2:
-  push        ebp
-  mov         ebp,esp
-  and         esp,0FFFFFFF0h
-  sub         esp,0C8h
-  mov         ecx,dword [ebp+8]
-  mov         edx,dword [ebp+0Ch]
-  mov         eax,dword [ebp+10h]
-  sub         ecx,2
-  sub         edx,2
-  push        esi
-  lea         esi,[eax+eax*2]
-  mov         dword [esp+18h],ecx
-  mov         dword [esp+4],edx
-  lea         ecx,[ecx+eax*4]
-  lea         edx,[edx+eax*4]
-  lea         eax,[esp+7Ch]
-  push        edi
-  mov         dword [esp+14h],esi
-  mov         dword [esp+18h],ecx
-  mov         dword [esp+0Ch],edx
-  mov         dword [esp+10h],eax
-  mov         esi,dword [esp+1Ch]
-  mov         ecx,dword [ebp+10h]
-  mov         edx,dword [esp+14h]
-  movd        xmm0,dword [esi]
-  movd        xmm1,dword [esi+ecx]
-  movd        xmm2,dword [esi+ecx*2]
-  movd        xmm3,dword [esi+edx]
-  mov         esi,dword  [esp+8]
-  movd        xmm4,dword [esi]
-  movd        xmm5,dword [esi+ecx]
-  movd        xmm6,dword [esi+ecx*2]
-  movd        xmm7,dword [esi+edx]
-  punpckldq   xmm0,xmm4
-  punpckldq   xmm1,xmm5
-  punpckldq   xmm2,xmm6
-  punpckldq   xmm3,xmm7
-  mov         esi,dword [esp+18h]
-  mov         edi,dword [esp+0Ch]
-  movd        xmm4,dword [esi]
-  movd        xmm5,dword [edi]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm0,xmm4
-  movd        xmm4,dword [esi+ecx]
-  movd        xmm5,dword [edi+ecx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm1,xmm4
-  movd        xmm4,dword [esi+ecx*2]
-  movd        xmm5,dword [edi+ecx*2]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm2,xmm4
-  movd        xmm4,dword [esi+edx]
-  movd        xmm5,dword [edi+edx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm3,xmm4
-  movdqa      xmm6,xmm0
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         edi,dword [esp+10h]
-  movdqa      [edi],xmm0
-  movdqa      [edi+10h],xmm5
-  movdqa      [edi+20h],xmm1
-  movdqa      [edi+30h],xmm6
-  movsx       ecx,word [ebp+14h]
-  movsx       edx,word [ebp+18h]
-  movdqa      xmm6,[esp+80h]
-  movdqa      xmm4,[esp+90h]
-  movdqa      xmm5,[esp+0A0h]
-  movdqa      xmm7,[esp+0B0h]
-  pxor        xmm0,xmm0
-  movd        xmm1,ecx
-  movdqa      xmm2,xmm1
-  punpcklwd   xmm2,xmm1
-  pshufd      xmm1,xmm2,0
-  movd        xmm2,edx
-  movdqa      xmm3,xmm2
-  punpcklwd   xmm3,xmm2
-  pshufd      xmm2,xmm3,0
-  movdqa      xmm3,xmm6
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+60h],xmm6
-  movdqa      xmm6,[esp+90h]
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+30h],xmm6
-  movdqa      xmm6,[esp+0A0h]
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+40h],xmm6
-  movdqa      xmm6,[esp+0B0h]
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+70h],xmm6
-  punpcklbw   xmm7,xmm0
-  punpcklbw   xmm4,xmm0
-  punpcklbw   xmm5,xmm0
-  punpcklbw   xmm3,xmm0
-  movdqa      [esp+50h],xmm7
-  movdqa      xmm6,xmm4
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  movdqa      xmm0,xmm1
-  pcmpgtw     xmm0,xmm6
-  movdqa      xmm6,xmm3
-  psubw       xmm6,xmm4
-  pabsw       xmm6,xmm6
-  movdqa      xmm7,xmm2
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+50h]
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pand        xmm0,xmm7
-  movdqa      xmm7,xmm2
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+30h]
-  psubw       xmm6,[esp+40h]
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm1,xmm6
-  movdqa      xmm6,[esp+60h]
-  psubw       xmm6,[esp+30h]
-  pabsw       xmm6,xmm6
-  pand        xmm0,xmm7
-  movdqa      xmm7,xmm2
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+70h]
-  psubw       xmm6,[esp+40h]
-  pabsw       xmm6,xmm6
-  pand        xmm1,xmm7
-  pcmpgtw     xmm2,xmm6
-  pand        xmm1,xmm2
-  mov         eax,2
-  movsx       ecx,ax
-  movd        xmm2,ecx
-  movdqa      xmm6,xmm2
-  punpcklwd   xmm6,xmm2
-  pshufd      xmm2,xmm6,0
-  movdqa      [esp+20h],xmm2
-  movdqa      xmm2,xmm3
-  paddw       xmm2,xmm3
-  paddw       xmm2,xmm4
-  paddw       xmm2,[esp+50h]
-  paddw       xmm2,[esp+20h]
-  psraw       xmm2,2
-  movdqa      xmm6,xmm0
-  pand        xmm6,xmm2
-  movdqa      xmm2,xmm0
-  pandn       xmm2,xmm4
-  por         xmm6,xmm2
-  movdqa      xmm2,[esp+60h]
-  movdqa      xmm7,xmm2
-  paddw       xmm7,xmm2
-  paddw       xmm7,[esp+30h]
-  paddw       xmm7,[esp+70h]
-  paddw       xmm7,[esp+20h]
-  movdqa      xmm4,xmm1
-  movdqa      xmm2,xmm1
-  pandn       xmm2,[esp+30h]
-  psraw       xmm7,2
-  pand        xmm4,xmm7
-  por         xmm4,xmm2
-  movdqa      xmm2,[esp+50h]
-  packuswb    xmm6,xmm4
-  movdqa      [esp+90h],xmm6
-  movdqa      xmm6,xmm2
-  paddw       xmm6,xmm2
-  movdqa      xmm2,[esp+20h]
-  paddw       xmm6,xmm5
-  paddw       xmm6,xmm3
-  movdqa      xmm4,xmm0
-  pandn       xmm0,xmm5
-  paddw       xmm6,xmm2
-  psraw       xmm6,2
-  pand        xmm4,xmm6
-  por         xmm4,xmm0
-  movdqa      xmm0,[esp+70h]
-  movdqa      xmm5,xmm0
-  paddw       xmm5,xmm0
-  movdqa      xmm0,[esp+40h]
-  paddw       xmm5,xmm0
-  paddw       xmm5,[esp+60h]
-  movdqa      xmm3,xmm1
-  paddw       xmm5,xmm2
-  psraw       xmm5,2
-  pand        xmm3,xmm5
-  pandn       xmm1,xmm0
-  por         xmm3,xmm1
-  packuswb    xmm4,xmm3
-  movdqa      [esp+0A0h],xmm4
-  mov         esi,dword [esp+10h]
-  movdqa      xmm0,[esi]
-  movdqa      xmm1,[esi+10h]
-  movdqa      xmm2,[esi+20h]
-  movdqa      xmm3,[esi+30h]
-  movdqa      xmm6,xmm0
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         esi,dword [esp+1Ch]
-  mov         ecx,dword [ebp+10h]
-  mov         edx,dword [esp+14h]
-  mov         edi,dword [esp+8]
-  movd        dword [esi],xmm0
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         esi,dword [esp+18h]
-  movd        dword [edi],xmm0
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  movd        dword [esi],xmm0
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         edi,dword [esp+0Ch]
-  movd        dword [edi],xmm0
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  pop         edi
-  pop         esi
-  mov         esp,ebp
-  pop         ebp
-  ret
-
-;*******************************************************************************
-;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-; Reads 4 bytes per row (starting 2 bytes left of pPixCb / pPixCr) from 8 rows of each plane, transposes them, adjusts the two middle columns in 16-bit precision, and writes the 4-byte rows back.
-WELS_EXTERN  DeblockChromaLt4H_sse2
-; NOTE(review): the body uses pabsw, which is an SSSE3 instruction, although the symbol is suffixed _sse2 -- confirm that callers gate on SSSE3.
-ALIGN  16
-; Locals are addressed off a 16-byte-aligned esp; the pushes of esi/edi inside the prologue shift every esp-relative offset by 4 each.
-DeblockChromaLt4H_sse2:
-  push        ebp
-  mov         ebp,esp                 ; ebp = frame pointer used for the stack arguments
-  and         esp,0FFFFFFF0h          ; align esp to 16 bytes so movdqa can address locals
-  sub         esp,108h                ; reserve 108h bytes of locals
-  mov         ecx,dword [ebp+8]       ; ecx = pPixCb
-  mov         edx,dword [ebp+0Ch]     ; edx = pPixCr
-  mov         eax,dword [ebp+10h]     ; eax = iStride
-  sub         ecx,2                   ; start 2 bytes to the left of pPixCb
-  sub         edx,2                   ; start 2 bytes to the left of pPixCr
-  push        esi
-  lea         esi,[eax+eax*2]         ; esi = 3*iStride
-  mov         dword [esp+10h],ecx     ; save pPixCb-2
-  mov         dword [esp+4],edx       ; save pPixCr-2
-  lea         ecx,[ecx+eax*4]         ; ecx = pPixCb-2 + 4*iStride (rows 4..7)
-  lea         edx,[edx+eax*4]         ; edx = pPixCr-2 + 4*iStride (rows 4..7)
-  lea         eax,[esp+6Ch]           ; eax = address of the 64-byte transpose buffer (esp+70h after the next push)
-  push        edi
-  mov         dword [esp+0Ch],esi     ; save 3*iStride
-  mov         dword [esp+18h],ecx     ; save Cb base for rows 4..7
-  mov         dword [esp+10h],edx     ; save Cr base for rows 4..7
-  mov         dword [esp+1Ch],eax     ; save transpose buffer pointer
-  mov         esi,dword [esp+14h]     ; esi = pPixCb-2 (slot written as [esp+10h] before push edi)
-  mov         ecx,dword [ebp+10h]     ; ecx = iStride
-  mov         edx,dword [esp+0Ch]     ; edx = 3*iStride
-  movd        xmm0,dword [esi]        ; 4 bytes from Cb rows 0..3
-  movd        xmm1,dword [esi+ecx]
-  movd        xmm2,dword [esi+ecx*2]
-  movd        xmm3,dword [esi+edx]
-  mov         esi,dword [esp+8]       ; esi = pPixCr-2
-  movd        xmm4,dword [esi]        ; 4 bytes from Cr rows 0..3
-  movd        xmm5,dword [esi+ecx]
-  movd        xmm6,dword [esi+ecx*2]
-  movd        xmm7,dword [esi+edx]
-  punpckldq   xmm0,xmm4               ; low qword of xmm0..xmm3 = [Cb row k | Cr row k]
-  punpckldq   xmm1,xmm5
-  punpckldq   xmm2,xmm6
-  punpckldq   xmm3,xmm7
-  mov         esi,dword [esp+18h]     ; esi = Cb base for rows 4..7
-  mov         edi,dword [esp+10h]     ; edi = Cr base for rows 4..7
-  movd        xmm4,dword [esi]
-  movd        xmm5,dword [edi]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm0,xmm4               ; high qword of xmm0 = [Cb row 4 | Cr row 4]
-  movd        xmm4,dword [esi+ecx]
-  movd        xmm5,dword [edi+ecx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm1,xmm4
-  movd        xmm4,dword [esi+ecx*2]
-  movd        xmm5,dword [edi+ecx*2]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm2,xmm4
-  movd        xmm4,dword [esi+edx]
-  movd        xmm5,dword [edi+edx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm3,xmm4
-  movdqa      xmm6,xmm0               ; byte/word/dword/qword unpack ladder: transposes the gathered rows
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         edi,dword [esp+1Ch]     ; edi = transpose buffer (esp+70h)
-  movdqa      [edi],xmm0              ; buffer slot 0 = [esp+70h]
-  movdqa      [edi+10h],xmm5          ; buffer slot 1 = [esp+80h]
-  movdqa      [edi+20h],xmm1          ; buffer slot 2 = [esp+90h]
-  movdqa      [edi+30h],xmm6          ; buffer slot 3 = [esp+0A0h]; slots presumably hold p1,p0,q0,q1 -- TODO confirm
-  mov         eax,dword [ebp+1Ch]     ; eax = pTC
-  movsx       cx,byte [eax+3]         ; sign-extend pTC[3]
-  movsx       dx,byte [eax+2]         ; sign-extend pTC[2]
-  movsx       si,byte [eax+1]         ; sign-extend pTC[1]
-  movsx       ax,byte [eax]           ; sign-extend pTC[0]
-  movzx       edi,cx
-  movzx       ecx,cx
-  movd        xmm2,ecx
-  movzx       ecx,dx
-  movzx       edx,dx
-  movd        xmm3,ecx
-  movd        xmm4,edx
-  movzx       ecx,si
-  movzx       edx,si
-  movd        xmm5,ecx
-  pxor        xmm0,xmm0
-  movd        xmm6,edx
-  movzx       ecx,ax
-  movdqa      [esp+60h],xmm0          ; [esp+60h] = zero vector
-  movzx       edx,ax
-  movsx       eax,word [ebp+14h]      ; low 16 bits of iAlpha
-  punpcklwd   xmm6,xmm2
-  movd        xmm1,edi
-  movd        xmm7,ecx
-  movsx       ecx,word [ebp+18h]      ; low 16 bits of iBeta
-  movd        xmm0,edx
-  punpcklwd   xmm7,xmm3
-  punpcklwd   xmm5,xmm1
-  movdqa      xmm1,[esp+60h]          ; xmm1 = zero, used to widen bytes to words
-  punpcklwd   xmm7,xmm5
-  movdqa      xmm5,[esp+0A0h]         ; xmm5 = buffer slot 3
-  punpcklwd   xmm0,xmm4
-  punpcklwd   xmm0,xmm6
-  movdqa      xmm6, [esp+70h]         ; xmm6 = buffer slot 0
-  punpcklwd   xmm0,xmm7               ; xmm0 = eight words built from the four tc values
-  movdqa      xmm7,[esp+80h]          ; xmm7 = buffer slot 1
-  movdqa      xmm2,xmm1
-  psubw       xmm2,xmm0               ; xmm2 = 0 - tc
-  movdqa      [esp+0D0h],xmm2         ; [esp+0D0h] = -tc (lower clamp bound)
-  movd        xmm2,eax
-  movdqa      xmm3,xmm2
-  punpcklwd   xmm3,xmm2
-  pshufd      xmm4,xmm3,0             ; xmm4 = iAlpha broadcast to 8 words
-  movd        xmm2,ecx
-  movdqa      xmm3,xmm2
-  punpcklwd   xmm3,xmm2
-  pshufd      xmm2,xmm3,0             ; xmm2 = iBeta broadcast to 8 words
-  movdqa      xmm3, [esp+90h]         ; xmm3 = buffer slot 2
-  movdqa      [esp+50h],xmm2          ; [esp+50h] = iBeta vector
-  movdqa      xmm2,xmm6
-  punpcklbw   xmm2,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+40h],xmm2          ; slot 0, low 8 bytes as words
-  movdqa      [esp+0B0h],xmm6         ; slot 0, high 8 bytes as words
-  movdqa      xmm6,[esp+90h]
-  movdqa      xmm2,xmm7
-  punpckhbw   xmm7,xmm1
-  punpckhbw   xmm6,xmm1
-  punpcklbw   xmm2,xmm1               ; xmm2 = slot 1, low half as words
-  punpcklbw   xmm3,xmm1               ; xmm3 = slot 2, low half as words
-  punpcklbw   xmm5,xmm1               ; xmm5 = slot 3, low half as words
-  movdqa      [esp+0F0h],xmm7         ; slot 1, high half as words
-  movdqa      [esp+0C0h],xmm6         ; slot 2, high half as words
-  movdqa      xmm6, [esp+0A0h]
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+0E0h],xmm6         ; slot 3, high half as words
-  mov         edx,4                   ; constant 4 added before the >>3 below
-  movsx       eax,dx
-  movd        xmm6,eax
-  movdqa      xmm7,xmm6
-  punpcklwd   xmm7,xmm6
-  pshufd      xmm6,xmm7,0
-  movdqa      [esp+30h],xmm6          ; [esp+30h] = 4 broadcast to 8 words
-  movdqa      xmm7, [esp+40h]
-  psubw       xmm7,xmm5               ; slot0 - slot3
-  movdqa      xmm6,xmm0
-  pcmpgtw     xmm6,xmm1               ; mask: tc > 0
-  movdqa      [esp+60h],xmm6          ; [esp+60h] now holds the tc > 0 mask
-  movdqa      xmm1, [esp+0D0h]        ; xmm1 = -tc
-  movdqa      xmm6,xmm3
-  psubw       xmm6,xmm2               ; slot2 - slot1
-  psllw       xmm6,2
-  paddw       xmm6,xmm7
-  paddw       xmm6,[esp+30h]
-  psraw       xmm6,3                  ; (((slot2 - slot1) << 2) + (slot0 - slot3) + 4) >> 3
-  pmaxsw      xmm1,xmm6               ; clamp from below with -tc
-  movdqa      xmm7,[esp+50h]
-  movdqa      [esp+20h],xmm0
-  movdqa      xmm6, [esp+20h]
-  pminsw      xmm6,xmm1               ; clamp from above with tc
-  movdqa      [esp+20h],xmm6          ; [esp+20h] = clamped delta (low half)
-  movdqa      xmm6,xmm4
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm3
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm6,xmm1               ; mask: iAlpha > |slot1 - slot2|
-  movdqa      xmm1, [esp+40h]
-  psubw       xmm1,xmm2
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm7,xmm1               ; mask: iBeta > |slot0 - slot1|
-  movdqa      xmm1, [esp+50h]
-  pand        xmm6,xmm7
-  movdqa      xmm7, [esp+50h]
-  psubw       xmm5,xmm3
-  pabsw       xmm5,xmm5
-  pcmpgtw     xmm1,xmm5               ; mask: iBeta > |slot3 - slot2|
-  movdqa      xmm5, [esp+0B0h]
-  psubw       xmm5,[esp+0E0h]         ; high half: slot0 - slot3
-  pand        xmm6,xmm1
-  pand        xmm6, [esp+60h]         ; combine with the tc > 0 mask
-  movdqa      xmm1, [esp+20h]
-  pand        xmm1,xmm6               ; zero delta where any condition fails
-  movdqa      xmm6, [esp+0C0h]
-  movdqa      [esp+40h],xmm1          ; [esp+40h] = masked delta (low half)
-  movdqa      xmm1, [esp+0F0h]
-  psubw       xmm6,xmm1               ; high half: slot2 - slot1
-  psllw       xmm6,2
-  paddw       xmm6,xmm5
-  paddw       xmm6, [esp+30h]
-  movdqa      xmm5, [esp+0D0h]
-  psraw       xmm6,3
-  pmaxsw      xmm5,xmm6
-  pminsw      xmm0,xmm5               ; xmm0 = clamped delta (high half)
-  movdqa      xmm5,[esp+0C0h]
-  movdqa      xmm6,xmm1
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm4,xmm6
-  movdqa      xmm6,[esp+0B0h]
-  psubw       xmm6,xmm1
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6, [esp+0E0h]
-  pand        xmm4,xmm7
-  movdqa      xmm7, [esp+50h]
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  pand        xmm4,xmm7
-  pand        xmm4,[esp+60h]
-  pand        xmm0,xmm4               ; masked delta (high half)
-  movdqa      xmm4, [esp+40h]
-  paddw       xmm2,xmm4               ; slot1 += delta (low half)
-  paddw       xmm1,xmm0               ; slot1 += delta (high half)
-  psubw       xmm3,xmm4               ; slot2 -= delta (low half)
-  psubw       xmm5,xmm0               ; slot2 -= delta (high half)
-  packuswb    xmm2,xmm1               ; narrow back to bytes with unsigned saturation
-  packuswb    xmm3,xmm5
-  movdqa      [esp+80h],xmm2          ; write filtered slot 1 back to the buffer
-  movdqa      [esp+90h],xmm3          ; write filtered slot 2 back to the buffer
-  mov         esi,dword [esp+1Ch]     ; esi = transpose buffer
-  movdqa      xmm0, [esi]
-  movdqa      xmm1, [esi+10h]
-  movdqa      xmm2, [esi+20h]
-  movdqa      xmm3, [esi+30h]
-  movdqa      xmm6,xmm0               ; same unpack ladder as on entry: transposes the buffer back to row order
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         esi,dword [esp+14h]     ; esi = pPixCb-2
-  mov         ecx,dword [ebp+10h]     ; ecx = iStride
-  mov         edx,dword [esp+0Ch]     ; edx = 3*iStride
-  mov         edi,dword [esp+8]       ; edi = pPixCr-2
-  movd        dword [esi],xmm0        ; dword 0 of each vector -> Cb rows 0..3
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4                  ; shift the next dword into the low lane
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         esi,dword [esp+18h]     ; esi = Cb base for rows 4..7
-  movd        dword [edi],xmm0        ; dword 1 -> Cr rows 0..3
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  movd        dword [esi],xmm0        ; dword 2 -> Cb rows 4..7
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         edi,dword [esp+10h]     ; edi = Cr base for rows 4..7
-  movd        dword [edi],xmm0        ; dword 3 -> Cr rows 4..7
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  pop         edi
-  pop         esi
-  mov         esp,ebp                 ; drop the aligned local area
-  pop         ebp
-  ret
-
-
-
-;*******************************************************************************
-;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-;                                 int32_t iBeta, int8_t * pTC)
-;*******************************************************************************
-; Loads six 16-byte rows (pPix-3*iStride .. pPix+2*iStride) with movdqa, filters them in 16-bit precision using iAlpha/iBeta/pTC[0..3], and rewrites the four rows pPix-2*iStride .. pPix+iStride.
-; NOTE(review): the body uses pabsw, which is an SSSE3 instruction, although the symbol is suffixed _sse2 -- confirm that callers gate on SSSE3.
-WELS_EXTERN  DeblockLumaLt4V_sse2
-; All row accesses use movdqa, so every touched row address must be 16-byte aligned.
-ALIGN  16
-; esp-relative offsets are written as (frame size after pushes)-(slot); e.g. 424-384 after one push and 432-384 after three name the same slot.
-DeblockLumaLt4V_sse2:
-    push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H: align esp to 16 bytes for movdqa locals
-	sub	esp, 420				; 000001a4H: reserve 420 bytes of locals
-	mov	eax, dword [ebp+8]	; eax = pPix
-	mov	ecx, dword [ebp+12]	; ecx = iStride
-
-	pxor	xmm0, xmm0
-	push	ebx
-	mov	edx, dword [ebp+24]	; edx = pTC
-	movdqa	[esp+424-384], xmm0	; spill a zero vector
-	push	esi
-
-	lea	esi, [ecx+ecx*2]	; esi = 3*iStride
-	push	edi
-	mov	edi, eax
-	sub	edi, esi
-	movdqa	xmm0, [edi]	; row pPix - 3*iStride
-
-	lea	esi, [ecx+ecx]	; esi = 2*iStride
-	movdqa	[esp+432-208], xmm0
-	mov	edi, eax
-	sub	edi, esi	; edi = pPix - 2*iStride (kept for the final store)
-	movdqa	xmm0, [edi]	; row pPix - 2*iStride
-	movdqa	[esp+448-208], xmm0
-
-	mov	ebx, eax
-	sub	ebx, ecx	; ebx = pPix - iStride
-	movdqa	xmm0, [ebx]	; row pPix - iStride
-	movdqa	[esp+464-208], xmm0
-
-	movdqa	xmm0, [eax]	; row pPix
-
-	add	ecx, eax	; ecx = pPix + iStride
-	movdqa	[esp+480-208], xmm0
-	movdqa	xmm0, [ecx]	; row pPix + iStride
-	mov	dword [esp+432-404], ecx	; save pPix + iStride
-
-	movsx	ecx, word [ebp+16]	; low 16 bits of iAlpha
-	movdqa	[esp+496-208], xmm0
-	movdqa	xmm0, [esi+eax]	; row pPix + 2*iStride
-
-	movsx	si, byte [edx]	; sign-extend pTC[0]
-	movdqa	[esp+512-208], xmm0
-	movd	xmm0, ecx
-	movsx	ecx, word [ebp+20]	; low 16 bits of iBeta
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	pshufd	xmm0, xmm1, 0	; xmm0 = iAlpha broadcast to 8 words
-	movdqa	[esp+432-112], xmm0	; [esp+432-112] = iAlpha vector
-	movd	xmm0, ecx
-	movsx	cx, byte [edx+1]	; sign-extend pTC[1]
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	mov	dword [esp+432-408], ebx	; save pPix - iStride
-	movzx	ebx, cx
-	pshufd	xmm0, xmm1, 0	; xmm0 = iBeta broadcast to 8 words
-	movd	xmm1, ebx
-	movzx	ebx, cx
-	movd	xmm2, ebx
-	movzx	ebx, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, si
-	movd	xmm5, ecx
-	movzx	ecx, si
-	movd	xmm6, ecx
-	movzx	ecx, si
-	movd	xmm7, ecx
-	movzx	ecx, si
-	movdqa	[esp+432-336], xmm0	; [esp+432-336] = iBeta vector
-	movd	xmm0, ecx
-
-	movsx	cx, byte [edx+3]	; sign-extend pTC[3]
-	movsx	dx, byte [edx+2]	; sign-extend pTC[2]
-	movd	xmm3, ebx
-	punpcklwd xmm0, xmm4
-	movzx	esi, cx
-	punpcklwd xmm6, xmm2
-	punpcklwd xmm5, xmm1
-	punpcklwd xmm0, xmm6
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	punpcklwd xmm0, xmm7
-	movdqa	[esp+432-400], xmm0	; eight tc words built from pTC[0] and pTC[1] (used with the low 8 pixels)
-	movd	xmm0, esi
-	movzx	esi, cx
-	movd	xmm2, esi
-	movzx	esi, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, dx
-	movd	xmm3, esi
-	movd	xmm5, ecx
-	punpcklwd xmm5, xmm0
-
-	movdqa	xmm0, [esp+432-384]	; xmm0 = zero vector, used to widen bytes to words
-	movzx	ecx, dx
-	movd	xmm6, ecx
-	movzx	ecx, dx
-	movzx	edx, dx
-	punpcklwd xmm6, xmm2
-	movd	xmm7, ecx
-	movd	xmm1, edx
-
-	movdqa	xmm2, [esp+448-208]
-	punpcklbw xmm2, xmm0	; xmm2 = row pPix-2*iStride, low 8 bytes as words
-
-	mov	ecx, 4	; constant 4 added before the >>3 below
-	movsx	edx, cx
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	movdqa	xmm5, [esp+496-208]
-	movdqa	xmm3, [esp+464-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-240], xmm5	; row pPix+iStride, low half as words
-	movdqa	xmm5, [esp+512-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-352], xmm5	; row pPix+2*iStride, low half as words
-	punpcklwd xmm1, xmm4
-	movdqa	xmm4, [esp+432-208]
-	punpcklwd xmm1, xmm6
-	movdqa	xmm6, [esp+480-208]
-	punpcklwd xmm1, xmm7	; xmm1 = eight tc words built from pTC[2] and pTC[3] (used with the high 8 pixels)
-	punpcklbw xmm6, xmm0	; xmm6 = row pPix, low half as words
-	punpcklbw xmm3, xmm0	; xmm3 = row pPix-iStride, low half as words
-	punpcklbw xmm4, xmm0	; xmm4 = row pPix-3*iStride, low half as words
-	movdqa	xmm7, xmm3
-	psubw	xmm7, xmm4
-	pabsw	xmm7, xmm7
-	movdqa	[esp+432-272], xmm4
-	movdqa	xmm4, [esp+432-336]	; xmm4 = iBeta vector
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7	; mask: iBeta > |row(-1) - row(-3)|
-	movdqa	[esp+432-288], xmm5
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-352]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7	; mask: iBeta > |row(0) - row(+2)|
-	movdqa	[esp+432-256], xmm5
-	movdqa	xmm5, xmm3
-	pavgw	xmm5, xmm6	; rounded average of row(-1) and row(0)
-	movdqa	[esp+432-304], xmm5
-	movdqa	xmm5, [esp+432-400]
-	psubw	xmm5, [esp+432-288]	; masks are 0 or -1, so each subtraction adds 1 to tc where the mask is set
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5
-	movdqa	xmm5, xmm6
-	psubw	xmm5, xmm3
-	movdqa	[esp+432-32], xmm6
-	psubw	xmm6, [esp+432-240]
-	movdqa	xmm7, xmm5
-	movdqa	[esp+432-384], xmm5
-	movdqa	xmm5, [esp+432-112]	; xmm5 = iAlpha vector
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7	; mask: iAlpha > |row(0) - row(-1)|
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6	; mask: iBeta > |row(0) - row(+1)|
-
-	pand	xmm5, xmm7
-	movdqa	xmm6, xmm3
-	psubw	xmm6, xmm2
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6	; mask: iBeta > |row(-1) - row(-2)|
-	movdqa	xmm6, [esp+432-400]
-	pand	xmm5, xmm7
-	movdqa	xmm7, xmm6
-	pcmpeqw	xmm6, xmm0
-	pcmpgtw	xmm7, xmm0
-	por	xmm7, xmm6	; mask: tc >= 0
-	pand	xmm5, xmm7
-	movdqa	[esp+432-320], xmm5	; combined filter-enable mask (low half)
-	movd	xmm5, edx
-	movdqa	xmm6, xmm5
-	punpcklwd xmm6, xmm5
-	pshufd	xmm5, xmm6, 0
-	movdqa	[esp+432-336], xmm5	; slot reused: now holds 4 broadcast to 8 words
-	movdqa	xmm5, [esp+432-224]
-	movdqa	[esp+432-368], xmm5
-	movdqa	xmm6, xmm0
-	psubw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-384]
-	psllw	xmm5, 2
-	movdqa	xmm7, xmm2
-	psubw	xmm7, [esp+432-240]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	psraw	xmm7, 3
-	pmaxsw	xmm6, xmm7	; clamp from below
-	pminsw	xmm5, xmm6	; clamp from above
-
-	pand	xmm5, [esp+432-320]
-	movdqa	xmm6, [esp+432-400]
-	movdqa	[esp+432-64], xmm5
-	movdqa	[esp+432-384], xmm6
-	movdqa	xmm5, xmm0
-	psubw	xmm5, xmm6
-	movdqa	[esp+432-368], xmm5
-	movdqa	xmm6, xmm5
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm2
-	paddw	xmm7, xmm2
-	psubw	xmm5, xmm7
-	psraw	xmm5, 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-384]
-	pminsw	xmm5, xmm6
-
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-288]
-	movdqa	xmm6, [esp+432-240]
-	movdqa	[esp+432-96], xmm5
-	movdqa	xmm5, [esp+432-352]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm6
-	paddw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
-	psubw	xmm5, xmm7
-
-	movdqa	xmm7, [esp+496-208]
-	psraw	xmm5, 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-400]
-	pminsw	xmm5, xmm6
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-256]
-	movdqa	xmm6, [esp+448-208]
-	punpckhbw xmm7, xmm0	; from here on the high 8 bytes of each row are widened and processed
-	movdqa	[esp+432-352], xmm7
-
-	movdqa	xmm7, [esp+512-208]
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-48], xmm5
-	movdqa	xmm5, [esp+432-208]
-	movdqa	[esp+432-368], xmm6
-	movdqa	xmm6, [esp+464-208]
-	punpckhbw xmm7, xmm0
-	punpckhbw xmm5, xmm0
-	movdqa	[esp+432-384], xmm7
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-400], xmm6
-
-	movdqa	xmm7, [esp+432-400]
-	movdqa	xmm6, [esp+480-208]
-	psubw	xmm7, xmm5
-	movdqa	[esp+432-16], xmm5
-	pabsw	xmm7, xmm7
-	punpckhbw xmm6, xmm0
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5
-
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-384]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5
-
-	movdqa	xmm5, [esp+432-400]
-	movdqa	[esp+432-80], xmm6
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5
-
-	movdqa	xmm5, xmm1
-	psubw	xmm5, [esp+432-288]
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5
-	movdqa	xmm5, xmm6
-	psubw	xmm5, [esp+432-400]
-	psubw	xmm6, [esp+432-352]
-	movdqa	[esp+432-272], xmm5
-	movdqa	xmm7, xmm5
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7
-	movdqa	xmm7, xmm4
-	pabsw	xmm6, xmm6
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
-
-	pand	xmm5, xmm7
-	movdqa	xmm7, [esp+432-400]
-	psubw	xmm7, xmm6
-	psubw	xmm6, [esp+432-352]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-	pand	xmm5, xmm4
-
-	paddw	xmm2, [esp+432-96]	; low half of row pPix-2*iStride plus its correction
-	movdqa	xmm4, xmm1
-	pcmpgtw	xmm4, xmm0
-	movdqa	xmm7, xmm1
-	pcmpeqw	xmm7, xmm0
-	por	xmm4, xmm7	; mask: tc >= 0 (high half)
-	pand	xmm5, xmm4
-	movdqa	xmm4, [esp+432-224]
-	movdqa	[esp+432-320], xmm5	; combined filter-enable mask (high half)
-	movdqa	xmm5, [esp+432-272]
-	movdqa	xmm7, xmm0
-	psubw	xmm7, xmm4
-	psubw	xmm0, xmm1	; xmm0 = 0 - tc (zero vector is consumed here)
-	psllw	xmm5, 2
-	paddw	xmm6, xmm5
-	paddw	xmm6, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	movdqa	[esp+432-336], xmm0
-	psraw	xmm6, 3
-	pmaxsw	xmm7, xmm6
-	pminsw	xmm4, xmm7
-	pand	xmm4, [esp+432-320]
-	movdqa	xmm6, xmm0
-	movdqa	xmm0, [esp+432-16]
-	paddw	xmm0, [esp+432-304]
-	movdqa	[esp+432-272], xmm4
-	movdqa	xmm4, [esp+432-368]
-	paddw	xmm4, xmm4
-	psubw	xmm0, xmm4
-
-	movdqa	xmm4, [esp+432-64]
-	psraw	xmm0, 1
-	pmaxsw	xmm6, xmm0
-	movdqa	xmm0, [esp+432-400]
-	movdqa	xmm7, xmm1
-	pminsw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-320]
-	pand	xmm7, xmm6
-	pand	xmm7, [esp+432-288]
-	paddw	xmm5, xmm7
-	packuswb xmm2, xmm5	; xmm2 = new row pPix-2*iStride (bytes, unsigned saturation)
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm0, xmm5
-	paddw	xmm3, xmm4
-	packuswb xmm3, xmm0	; xmm3 = new row pPix-iStride
-
-	movdqa	xmm0, [esp+432-32]
-	psubw	xmm0, xmm4
-	movdqa	xmm4, [esp+432-80]
-	psubw	xmm4, xmm5
-
-	movdqa	xmm5, [esp+432-240]
-	paddw	xmm5, [esp+432-48]
-	packuswb xmm0, xmm4	; xmm0 = new row pPix
-	movdqa	xmm4, [esp+432-384]
-	paddw	xmm4, [esp+432-304]
-	movdqa	[esp+480-208], xmm0
-	movdqa	xmm0, [esp+432-352]
-	movdqa	xmm7, xmm0
-	paddw	xmm0, xmm0
-
-	mov	ecx, dword [esp+432-408]	; ecx = pPix - iStride
-
-	mov	edx, dword [esp+432-404]	; edx = pPix + iStride
-	psubw	xmm4, xmm0
-	movdqa	xmm0, [esp+432-336]
-	movdqa	[edi], xmm2	; store row pPix - 2*iStride
-	psraw	xmm4, 1
-	pmaxsw	xmm0, xmm4
-	pminsw	xmm1, xmm0
-	movdqa	xmm0, [esp+480-208]
-
-	pop	edi
-	pand	xmm1, xmm6
-	pand	xmm1, [esp+428-256]	; same slot as [esp+432-256]; the offset drops by 4 because of the pop above
-	movdqa	[ecx], xmm3	; store row pPix - iStride
-	paddw	xmm7, xmm1
-	pop	esi
-	packuswb xmm5, xmm7	; xmm5 = new row pPix+iStride
-	movdqa	[eax], xmm0	; store row pPix
-	movdqa	[edx], xmm5	; store row pPix + iStride
-	pop	ebx
-	mov	esp, ebp	; drop the aligned local area
-	pop	ebp
-	ret
-
-
-;*******************************************************************************
-;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-;                                 int32_t iBeta)
-;*******************************************************************************
-; Loads eight 16-byte rows (pPix-4*iStride .. pPix+3*iStride) with movdqa, filters them in 16-bit precision using iAlpha/iBeta, and rewrites the six rows pPix-3*iStride .. pPix+2*iStride.
-WELS_EXTERN  DeblockLumaEq4V_sse2
-; NOTE(review): the body uses pabsw, which is an SSSE3 instruction, although the symbol is suffixed _sse2 -- confirm that callers gate on SSSE3.
-ALIGN  16
-; All row accesses use movdqa, so every touched row address must be 16-byte aligned.
-DeblockLumaEq4V_sse2:
-; Each result is a mask-select: (mask AND filtered) OR (NOT mask AND original), built with pand/pandn/por.
-	push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H: align esp to 16 bytes for movdqa locals
-	sub	esp, 628				; 00000274H: reserve 628 bytes of locals
-	mov	eax, dword [ebp+8]	; eax = pPix
-	mov	ecx, dword [ebp+12]	; ecx = iStride
-	push	ebx
-	push	esi
-
-	lea	edx, [ecx*4]	; edx = 4*iStride
-	pxor	xmm0, xmm0
-	movdqa	xmm2, xmm0	; xmm2 = zero, used to widen bytes to words
-
-	movdqa	xmm0, [ecx+eax]	; row pPix + iStride
-	mov	esi, eax
-	sub	esi, edx
-	movdqa	xmm3, [esi]	; row pPix - 4*iStride
-	movdqa	xmm5, [eax]	; row pPix
-	push	edi
-	lea	edi, [ecx+ecx]	; edi = 2*iStride
-	lea	ebx, [ecx+ecx*2]	; ebx = 3*iStride
-	mov	dword [esp+640-600], edi	; save 2*iStride
-	mov	esi, eax
-	sub	esi, edi	; esi = pPix - 2*iStride (kept for the final store)
-	movdqa	xmm1, [esi]	; row pPix - 2*iStride
-	movdqa	 [esp+720-272], xmm0
-	mov	edi, eax
-	sub	edi, ecx	; edi = pPix - iStride (kept for the final store)
-	movdqa	xmm4, [edi]	; row pPix - iStride
-	add	ecx, eax
-	mov	dword [esp+640-596], ecx	; save pPix + iStride
-
-	mov	ecx, dword [esp+640-600]
-	movdqa	xmm0, [ecx+eax]	; row pPix + 2*iStride
-	movdqa	 [esp+736-272], xmm0
-
-	movdqa	xmm0, [eax+ebx]	; row pPix + 3*iStride
-	mov	edx, eax
-	sub	edx, ebx	; edx = pPix - 3*iStride
-
-	movsx	ebx, word [ebp+16]	; low 16 bits of iAlpha
-	movdqa	xmm6, [edx]	; row pPix - 3*iStride
-	add	ecx, eax	; ecx = pPix + 2*iStride (kept for the final store)
-	movdqa	 [esp+752-272], xmm0
-	movd	xmm0, ebx
-
-	movsx	ebx, word [ebp+20]	; low 16 bits of iBeta
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
-	movdqa	 [esp+640-320], xmm0	; [esp+640-320] = iAlpha broadcast to 8 words
-	movd	xmm0, ebx
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
-
-	movdqa	xmm7, [esp+736-272]
-	punpcklbw xmm7, xmm2
-	movdqa	 [esp+640-416], xmm7	; row pPix+2*iStride, low half as words
-	movdqa	 [esp+640-512], xmm0	; [esp+640-512] = iBeta broadcast to 8 words
-	movdqa	xmm0, xmm1
-	movdqa	 [esp+672-272], xmm1
-	movdqa	xmm1, xmm4
-	movdqa	 [esp+704-272], xmm5
-	punpcklbw xmm5, xmm2	; xmm5 = row pPix, low half as words
-	punpcklbw xmm1, xmm2	; xmm1 = row pPix-iStride, low half as words
-
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7	; |row(0) - row(-1)|
-	punpcklbw xmm0, xmm2
-	movdqa	 [esp+688-272], xmm4
-	movdqa	xmm4, [esp+720-272]
-	movdqa	 [esp+640-480], xmm0	; row pPix-2*iStride, low half as words
-
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm0
-
-	movdqa	xmm0, [esp+640-512]
-	pabsw	xmm7, xmm7
-	punpcklbw xmm4, xmm2
-	pcmpgtw	xmm0, xmm7	; mask: iBeta > |row(-1) - row(-2)|
-	movdqa	 [esp+640-384], xmm4	; row pPix+iStride, low half as words
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm4
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+656-272], xmm6
-	punpcklbw xmm6, xmm2
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-48], xmm2	; spill the zero vector
-	movdqa	 [esp+640-368], xmm6	; row pPix-3*iStride, low half as words
-	movdqa	 [esp+640-144], xmm1
-	movdqa	 [esp+640-400], xmm5
-	pcmpgtw	xmm4, xmm7	; mask: iBeta > |row(0) - row(+1)|
-	pand	xmm0, xmm4
-	movdqa	xmm4, [esp+640-320]
-	pcmpgtw	xmm4, [esp+640-560]	; mask: iAlpha > |row(0) - row(-1)|
-	pand	xmm0, xmm4	; xmm0 = combined filter-enable mask (low half)
-
-	mov	ebx, 2	; constant 2
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, [esp+640-320]
-	psraw	xmm4, 2
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm7	; xmm4 = (iAlpha >> 2) + 2
-	movdqa	 [esp+640-576], xmm4
-	pcmpgtw	xmm4, [esp+640-560]	; mask: (iAlpha >> 2) + 2 > |row(0) - row(-1)|
-	movdqa	 [esp+640-560], xmm4
-
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+640-624], xmm7	; [esp+640-624] = 2 broadcast to 8 words
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm6
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7	; mask: iBeta > |row(-1) - row(-3)|
-
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-544], xmm4	; strong-path mask for the rows above pPix (low half)
-	movdqa	xmm4, [esp+640-512]
-	movdqa	xmm7, xmm5
-	psubw	xmm7, [esp+640-416]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7	; mask: iBeta > |row(0) - row(+2)|
-
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4	; strong-path mask for the rows from pPix downward (low half)
-
-	movdqa	xmm4, [esp+640-544]
-	pandn	xmm4, xmm6
-	movdqa	 [esp+640-16], xmm4
-	mov	ebx, 4	; constant 4
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, xmm3
-	punpcklbw xmm4, xmm2
-	psllw	xmm4, 1
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, [esp+640-480]
-
-	movdqa	xmm6, [esp+640-560]
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm1
-	movdqa	 [esp+640-592], xmm7	; [esp+640-592] = 4 broadcast to 8 words
-	paddw	xmm4, xmm5
-	paddw	xmm4, xmm7
-	movdqa	xmm7, [esp+640-416]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-80], xmm6
-	movdqa	xmm6, [esp+752-272]
-	punpcklbw xmm6, xmm2
-	psllw	xmm6, 1
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-384]
-
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm6, xmm5
-	paddw	xmm6, xmm1
-	paddw	xmm6, [esp+640-592]
-	psraw	xmm6, 3
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-112], xmm6
-	movdqa	xmm6, [esp+640-544]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-336], xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-528], xmm6
-	movdqa	xmm6, [esp+640-368]
-	paddw	xmm6, xmm7
-	movdqa	xmm7, xmm1
-	psraw	xmm4, 3
-	pand	xmm4, [esp+640-544]
-	paddw	xmm7, xmm5
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-
-	paddw	xmm5, xmm1
-	psraw	xmm6, 2
-	pand	xmm7, xmm6
-
-	movdqa	xmm6, [esp+640-384]
-	movdqa	 [esp+640-64], xmm7
-	movdqa	xmm7, [esp+640-560]
-	pandn	xmm7, xmm6
-	movdqa	 [esp+640-304], xmm7
-	movdqa	xmm7, [esp+640-560]
-	movdqa	 [esp+640-528], xmm7
-	movdqa	xmm7, [esp+640-416]
-	paddw	xmm7, xmm6
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pand	xmm5, xmm7
-	movdqa	 [esp+640-32], xmm5
-
-	movdqa	xmm5, [esp+640-544]
-	movdqa	 [esp+640-528], xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	xmm7, xmm5
-	paddw	xmm7, xmm5
-	movdqa	xmm5, xmm1
-	paddw	xmm5, xmm6
-	paddw	xmm6, [esp+640-592]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pandn	xmm5, xmm7
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm7, xmm1
-	paddw	xmm7, [esp+640-400]
-	movdqa	xmm1, [esp+640-544]
-	movdqa	 [esp+640-352], xmm5
-	movdqa	xmm5, [esp+640-368]
-	psllw	xmm7, 1
-	paddw	xmm7, xmm6
-	paddw	xmm5, xmm7
-
-	movdqa	xmm7, [esp+640-400]
-	psraw	xmm5, 3
-	pand	xmm1, xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	 [esp+640-96], xmm1
-	movdqa	xmm1, [esp+640-560]
-	movdqa	 [esp+640-528], xmm1
-	movdqa	xmm1, [esp+640-384]
-	movdqa	xmm6, xmm1
-	paddw	xmm6, xmm1
-	paddw	xmm1, [esp+640-400]
-	paddw	xmm1, [esp+640-144]
-	paddw	xmm7, xmm5
-	paddw	xmm5, [esp+640-592]
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-	psraw	xmm6, 2
-	psllw	xmm1, 1
-	paddw	xmm1, xmm5
-
-	movdqa	xmm5, [esp+656-272]
-	pandn	xmm7, xmm6
-	movdqa	xmm6, [esp+640-416]
-	paddw	xmm6, xmm1
-	movdqa	xmm1, [esp+640-560]
-	psraw	xmm6, 3
-	pand	xmm1, xmm6
-
-	movdqa	xmm6, [esp+704-272]
-	movdqa	 [esp+640-128], xmm1
-	movdqa	xmm1, [esp+672-272]
-	punpckhbw xmm1, xmm2	; from here on the high 8 bytes of each row are widened and processed
-	movdqa	 [esp+640-448], xmm1
-	movdqa	xmm1, [esp+688-272]
-	punpckhbw xmm1, xmm2
-	punpckhbw xmm6, xmm2
-	movdqa	 [esp+640-288], xmm7
-	punpckhbw xmm5, xmm2
-	movdqa	 [esp+640-496], xmm1
-	movdqa	 [esp+640-432], xmm6
-
-	movdqa	xmm7, [esp+720-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-464], xmm7
-
-	movdqa	xmm7, [esp+736-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-528], xmm7
-
-	movdqa	xmm7, xmm6
-
-	psubw	xmm6, [esp+640-464]
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7
-	por	xmm4, [esp+640-16]
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm1
-	psubw	xmm7, [esp+640-448]
-
-	movdqa	xmm1, [esp+640-512]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm1, xmm7
-	movdqa	xmm7, [esp+640-512]
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+640-320]
-	pand	xmm1, xmm7
-	movdqa	xmm7, [esp+640-560]
-	pcmpgtw	xmm6, xmm7
-	pand	xmm1, xmm6	; xmm1 = combined filter-enable mask (high half)
-
-	movdqa	xmm6, [esp+640-576]
-	pcmpgtw	xmm6, xmm7
-
-	movdqa	xmm7, [esp+640-496]
-	punpckhbw xmm3, xmm2
-	movdqa	 [esp+640-560], xmm6
-	movdqa	xmm6, [esp+640-512]
-	psubw	xmm7, xmm5
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
-
-	pand	xmm6, [esp+640-560]
-	movdqa	xmm7, [esp+640-432]
-	psubw	xmm7, [esp+640-528]
-
-	psllw	xmm3, 1
-	movdqa	 [esp+640-544], xmm6
-	movdqa	xmm6, [esp+640-512]
-
-	movdqa	xmm2, [esp+640-544]	; xmm2 stops being the zero vector here; the spilled copy at [esp+640-48] is used later
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, [esp+640-448]
-	paddw	xmm3, [esp+640-496]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-560], xmm6
-
-	movdqa	xmm6, xmm0
-	pand	xmm6, xmm4
-	movdqa	xmm4, xmm0
-	pandn	xmm4, [esp+640-368]
-	por	xmm6, xmm4
-	movdqa	xmm4, [esp+640-432]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm3, 3
-	pand	xmm3, xmm2
-	pandn	xmm2, xmm5
-	por	xmm3, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm3
-	movdqa	xmm3, [esp+640-64]
-	por	xmm3, [esp+640-336]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm5
-	por	xmm7, xmm2
-
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-480]
-	por	xmm2, xmm3
-	packuswb xmm6, xmm7
-	movdqa	 [esp+640-336], xmm2
-	movdqa	 [esp+656-272], xmm6	; new row pPix - 3*iStride
-	movdqa	xmm6, [esp+640-544]
-	movdqa	xmm2, xmm5
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm3, xmm1
-	movdqa	xmm7, [esp+640-496]
-	paddw	xmm7, xmm4
-	paddw	xmm2, xmm7
-	paddw	xmm2, [esp+640-624]
-	movdqa	xmm7, [esp+640-544]
-	psraw	xmm2, 2
-	pand	xmm6, xmm2
-	movdqa	xmm2, [esp+640-448]
-	pandn	xmm7, xmm2
-	por	xmm6, xmm7
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm1
-	pandn	xmm6, xmm2
-	paddw	xmm2, [esp+640-496]
-	paddw	xmm2, xmm4
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-336]
-	packuswb xmm6, xmm3
-	psllw	xmm2, 1
-	movdqa	 [esp+672-272], xmm6	; new row pPix - 2*iStride
-	movdqa	xmm6, [esp+640-96]
-	por	xmm6, [esp+640-352]
-
-	movdqa	xmm3, xmm0
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm0
-	pandn	xmm6, [esp+640-144]
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-352], xmm3
-	movdqa	xmm3, [esp+640-464]
-	paddw	xmm3, [esp+640-592]
-	paddw	xmm2, xmm3
-	movdqa	xmm3, [esp+640-448]
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-496]
-	psraw	xmm5, 3
-	pand	xmm6, xmm5
-	movdqa	xmm5, [esp+640-464]
-	paddw	xmm2, xmm5
-	paddw	xmm5, [esp+640-432]
-	movdqa	xmm4, xmm3
-	paddw	xmm4, xmm3
-	paddw	xmm4, xmm2
-	paddw	xmm4, [esp+640-624]
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm4, 2
-	pandn	xmm2, xmm4
-	por	xmm6, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-496]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm6
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-352]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+688-272], xmm2	; new row pPix - iStride
-	movdqa	xmm2, [esp+640-128]
-	por	xmm2, [esp+640-288]
-
-	movdqa	xmm4, xmm0
-	pand	xmm4, xmm2
-	paddw	xmm5, xmm6
-	movdqa	xmm2, xmm0
-	pandn	xmm2, [esp+640-400]
-	por	xmm4, xmm2
-	movdqa	xmm2, [esp+640-528]
-	psllw	xmm5, 1
-	paddw	xmm5, xmm3
-	movdqa	xmm3, [esp+640-560]
-	paddw	xmm2, xmm5
-	psraw	xmm2, 3
-	movdqa	 [esp+640-288], xmm4
-	movdqa	xmm4, [esp+640-560]
-	pand	xmm4, xmm2
-	movdqa	xmm2, [esp+640-464]
-	movdqa	xmm5, xmm2
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-432]
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm7, xmm1
-	paddw	xmm5, xmm2
-	paddw	xmm5, [esp+640-624]
-	movdqa	xmm6, [esp+640-560]
-	psraw	xmm5, 2
-	pandn	xmm3, xmm5
-	por	xmm4, xmm3
-	movdqa	xmm3, [esp+640-32]
-	por	xmm3, [esp+640-304]
-	pand	xmm7, xmm4
-	movdqa	xmm4, [esp+640-432]
-	movdqa	xmm5, [esp+640-464]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm4
-	paddw	xmm4, [esp+640-496]
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-288]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+704-272], xmm2	; new row pPix
-
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-384]
-	por	xmm2, xmm3
-	movdqa	 [esp+640-304], xmm2
-	movdqa	xmm2, [esp+640-528]
-	movdqa	xmm3, xmm2
-	paddw	xmm3, [esp+640-464]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-624]
-	psraw	xmm3, 2
-	pand	xmm6, xmm3
-	movdqa	xmm3, [esp+640-560]
-	movdqa	xmm4, xmm3
-	pandn	xmm4, xmm5
-	por	xmm6, xmm4
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-304]
-	movdqa	xmm4, xmm1
-	pandn	xmm4, xmm5
-	por	xmm7, xmm4
-
-	movdqa	xmm4, xmm0
-	pandn	xmm0, [esp+640-416]
-	packuswb xmm6, xmm7	; xmm6 = new row pPix + iStride
-	movdqa	xmm7, [esp+640-112]
-	por	xmm7, [esp+640-80]
-	pand	xmm4, xmm7
-	por	xmm4, xmm0
-	movdqa	xmm0, [esp+752-272]
-	punpckhbw xmm0, [esp+640-48]	; widen row pPix+3*iStride (high half) with the spilled zero vector
-	psllw	xmm0, 1
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm5
-	paddw	xmm0, [esp+640-432]
-	paddw	xmm0, [esp+640-496]
-	paddw	xmm0, [esp+640-592]
-	psraw	xmm0, 3
-	pand	xmm0, xmm3
-	movdqa	xmm7, xmm1
-	pandn	xmm3, xmm2
-	por	xmm0, xmm3
-	pand	xmm7, xmm0
-
-	movdqa	xmm0, [esp+656-272]
-	movdqa	 [edx], xmm0	; store row pPix - 3*iStride
-
-	movdqa	xmm0, [esp+672-272]
-
-	mov	edx, dword [esp+640-596]	; edx = pPix + iStride
-	movdqa	 [esi], xmm0	; store row pPix - 2*iStride
-	movdqa	xmm0, [esp+688-272]
-	movdqa	 [edi], xmm0	; store row pPix - iStride
-	movdqa	xmm0, [esp+704-272]
-
-	pop	edi
-	pandn	xmm1, xmm2
-	movdqa	 [eax], xmm0	; store row pPix
-	por	xmm7, xmm1
-	pop	esi
-	packuswb xmm4, xmm7	; xmm4 = new row pPix + 2*iStride
-	movdqa	 [edx], xmm6	; store row pPix + iStride
-	movdqa	 [ecx], xmm4	; store row pPix + 2*iStride
-	pop	ebx
-	mov	esp, ebp	; drop the aligned local area
-	pop	ebp
-	ret
-
-
-;********************************************************************************
-;
-;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
-; Gathers 8 bytes from each of 16 consecutive rows of pPixY, transposes them, and stores eight 16-byte vectors to pDst (written with movdqa, so pDst must be 16-byte aligned).
-;********************************************************************************
-; Two registers are pushed before ebp is set, so the stack arguments sit at [ebp+0Ch], [ebp+10h] and [ebp+14h].
-WELS_EXTERN  DeblockLumaTransposeH2V_sse2
-
-ALIGN  16
-
-DeblockLumaTransposeH2V_sse2:
-    push    ebp
-    push    ebx
-    mov     ebp,   esp
-    and     esp,0FFFFFFF0h          ; align esp to 16 bytes for the movdqa scratch slot
-    sub     esp,   10h              ; one 16-byte scratch slot at [esp]
-
-    mov     eax,   [ebp + 0Ch]      ; eax = pPixY
-    mov     ecx,   [ebp + 10h]      ; ecx = iStride
-    lea     edx,   [eax + ecx * 8]  ; edx = pPixY + 8*iStride (rows 8..15)
-    lea     ebx,   [ecx*3]          ; ebx = 3*iStride
-
-    movq    xmm0,  [eax]            ; low qword = row 0
-    movq    xmm7,  [edx]
-    punpcklqdq   xmm0,  xmm7        ; xmm0 = [row 0 | row 8]
-    movq    xmm1,  [eax + ecx]
-    movq    xmm7,  [edx + ecx]
-    punpcklqdq   xmm1,  xmm7        ; xmm1 = [row 1 | row 9]
-    movq    xmm2,  [eax + ecx*2]
-    movq    xmm7,  [edx + ecx*2]
-    punpcklqdq   xmm2,  xmm7        ; xmm2 = [row 2 | row 10]
-    movq    xmm3,  [eax + ebx]
-    movq    xmm7,  [edx + ebx]
-    punpcklqdq   xmm3,  xmm7        ; xmm3 = [row 3 | row 11]
-
-    lea     eax,   [eax + ecx * 4]  ; advance both bases by 4 rows
-    lea     edx,   [edx + ecx * 4]
-    movq    xmm4,  [eax]
-    movq    xmm7,  [edx]
-    punpcklqdq   xmm4,  xmm7        ; xmm4 = [row 4 | row 12]
-    movq    xmm5,  [eax + ecx]
-    movq    xmm7,  [edx + ecx]
-    punpcklqdq   xmm5,  xmm7        ; xmm5 = [row 5 | row 13]
-    movq    xmm6,  [eax + ecx*2]
-    movq    xmm7,  [edx + ecx*2]
-    punpcklqdq   xmm6,  xmm7        ; xmm6 = [row 6 | row 14]
-
-    movdqa  [esp],   xmm0           ; spill xmm0 so it can be used as the temporary for row 15
-    movq    xmm7,  [eax + ebx]
-    movq    xmm0,  [edx + ebx]
-    punpcklqdq   xmm7,  xmm0        ; xmm7 = [row 7 | row 15]
-    movdqa  xmm0,   [esp]           ; restore xmm0
-
-    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]   ; macro defined outside this chunk; [esp] is passed as its scratch slot
-    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1  (1-based: mN is xmm(N-1), i.e. xmm4, xmm2, xmm3, xmm7, xmm5, xmm1, xmm6, xmm0 -- the store order below)
-
-    mov    eax,   [ebp + 14h]       ; eax = pDst
-    movdqa  [eax],    xmm4
-    movdqa  [eax + 10h],  xmm2
-    movdqa  [eax + 20h],  xmm3
-    movdqa  [eax + 30h],  xmm7
-    movdqa  [eax + 40h],  xmm5
-    movdqa  [eax + 50h],  xmm1
-    movdqa  [eax + 60h],  xmm6
-    movdqa  [eax + 70h],  xmm0
-
-    mov     esp,   ebp              ; drop the aligned scratch area
-    pop     ebx
-    pop     ebp
-    ret
-
-
-
-;*******************************************************************************************
-;
-;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
-; Loads eight 16-byte vectors from pSrc (read with movdqa, so pSrc must be 16-byte aligned), transposes them, and writes 8 bytes to each of 16 consecutive rows of pPixY.
-;*******************************************************************************************
-
-WELS_EXTERN   DeblockLumaTransposeV2H_sse2
-
-ALIGN  16
-
-DeblockLumaTransposeV2H_sse2:
-    push     ebp
-    mov      ebp,   esp
-
-    and     esp,  0FFFFFFF0h        ; align esp to 16 bytes for the scratch slot
-    sub     esp,   10h              ; one 16-byte scratch slot at [esp]
-
-    mov      eax,   [ebp + 10h]     ; eax = pSrc
-    mov      ecx,   [ebp + 0Ch]     ; ecx = iStride
-    mov      edx,   [ebp + 08h]     ; edx = pPixY
-
-    movdqa   xmm0,  [eax]
-    movdqa   xmm1,  [eax + 10h]
-    movdqa   xmm2,  [eax + 20h]
-    movdqa   xmm3,	[eax + 30h]
-    movdqa   xmm4,	[eax + 40h]
-    movdqa   xmm5,	[eax + 50h]
-    movdqa   xmm6,	[eax + 60h]
-    movdqa   xmm7,	[eax + 70h]
-
-    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]   ; macro defined outside this chunk; [esp] is passed as its scratch slot
-    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1  (1-based: mN is xmm(N-1), i.e. xmm4, xmm2, xmm3, xmm7, xmm5, xmm1, xmm6, xmm0 -- the store order below)
-
-    lea      eax,   [ecx * 3]       ; eax = 3*iStride
-
-    movq     [edx],  xmm4           ; low qwords -> rows 0..3
-    movq     [edx + ecx],  xmm2
-    movq     [edx + ecx*2],  xmm3
-    movq     [edx + eax],  xmm7
-
-    lea      edx,   [edx + ecx*4]
-    movq     [edx],  xmm5           ; low qwords -> rows 4..7
-    movq     [edx + ecx],  xmm1
-    movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0
-
-    psrldq    xmm4,   8             ; move each high qword into the low lane
-    psrldq    xmm2,   8
-    psrldq    xmm3,   8
-    psrldq    xmm7,   8
-    psrldq    xmm5,   8
-    psrldq    xmm1,   8
-    psrldq    xmm6,   8
-    psrldq    xmm0,   8
-
-    lea       edx,  [edx + ecx*4]
-    movq     [edx],  xmm4           ; former high qwords -> rows 8..11
-    movq     [edx + ecx],  xmm2
-    movq     [edx + ecx*2],  xmm3
-    movq     [edx + eax],  xmm7
-
-    lea      edx,   [edx + ecx*4]
-    movq     [edx],  xmm5           ; former high qwords -> rows 12..15
-    movq     [edx + ecx],  xmm1
-    movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0
-
-
-    mov      esp,   ebp             ; drop the aligned scratch area
-    pop      ebp
-    ret
\ No newline at end of file
--- a/codec/decoder/core/asm/expand_picture.asm
+++ /dev/null
@@ -1,655 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  expand_picture.asm
-;*
-;*  Abstract
-;*      mmxext/sse for expand_frame
-;*
-;*  History
-;*      09/25/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-;SECTION .rodata pData align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-;%define PADDING_SIZE_ASM 	32 	; PADDING_LENGTH
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-
-
-SECTION .text
-
-;WELS_EXTERN expand_picture_luma_mmx
-;WELS_EXTERN expand_picture_chroma_mmx
-WELS_EXTERN ExpandPictureLuma_sse2
-WELS_EXTERN ExpandPictureChromaAlign_sse2	; for chroma alignment
-WELS_EXTERN ExpandPictureChromaUnalign_sse2	; for chroma unalignment
-
-;;;;;;;expanding result;;;;;;;
-
-;aaaa|attttttttttttttttb|bbbb
-;aaaa|attttttttttttttttb|bbbb
-;aaaa|attttttttttttttttb|bbbb
-;aaaa|attttttttttttttttb|bbbb
-;----------------------------
-;aaaa|attttttttttttttttb|bbbb
-;llll|l                r|rrrr
-;llll|l                r|rrrr
-;llll|l                r|rrrr
-;llll|l                r|rrrr
-;llll|l                r|rrrr
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;----------------------------
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-
-%macro mov_line_8x4_mmx		3	; dst, stride, mm?
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+2*%2]
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+2*%2]
-%endmacro
-
-%macro mov_line_end8x4_mmx		3	; dst, stride, mm?
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+2*%2]
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+%2]
-%endmacro
-
-%macro mov_line_16x4_sse2	4	; dst, stride, xmm?, u/a
-	movdq%4 [%1], %3 		; top(bottom)_0
-	movdq%4 [%1+%2], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdq%4 [%1], %3 		; top(bottom)_2
-	movdq%4 [%1+%2], %3		; top(bottom)_3
-	lea %1, [%1+2*%2]
-%endmacro
-
-%macro mov_line_end16x4_sse2	4	; dst, stride, xmm?, u/a
-	movdq%4 [%1], %3 		; top(bottom)_0
-	movdq%4 [%1+%2], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdq%4 [%1], %3 		; top(bottom)_2
-	movdq%4 [%1+%2], %3		; top(bottom)_3
-	lea %1, [%1+%2]
-%endmacro
-
-%macro mov_line_32x4_sse2	3	; dst, stride, xmm?
-	movdqa [%1], %3 		; top(bottom)_0
-	movdqa [%1+16], %3 		; top(bottom)_0
-	movdqa [%1+%2], %3		; top(bottom)_1
-	movdqa [%1+%2+16], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdqa [%1], %3 		; top(bottom)_2
-	movdqa [%1+16], %3 		; top(bottom)_2
-	movdqa [%1+%2], %3		; top(bottom)_3
-	movdqa [%1+%2+16], %3		; top(bottom)_3
-	lea %1, [%1+2*%2]
-%endmacro
-
-%macro mov_line_end32x4_sse2	3	; dst, stride, xmm?
-	movdqa [%1], %3 		; top(bottom)_0
-	movdqa [%1+16], %3 		; top(bottom)_0
-	movdqa [%1+%2], %3		; top(bottom)_1
-	movdqa [%1+%2+16], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdqa [%1], %3 		; top(bottom)_2
-	movdqa [%1+16], %3 		; top(bottom)_2
-	movdqa [%1+%2], %3		; top(bottom)_3
-	movdqa [%1+%2+16], %3		; top(bottom)_3
-	lea %1, [%1+%2]
-%endmacro
-
-%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]
-	; ebx [width/16(8)]
-	; esi [pSrc+0], edi [pSrc-1], ecx [-stride], 32(16)		; top
-	; eax [pSrc+(h-1)*stride], ebp [pSrc+(h+31)*stride], 32(16)	; bottom
-
-%if %1 == 32		; for luma
-	sar ebx, 04h 	; width / 16(8) pixels
-.top_bottom_loops:
-	; top
-	movdqa xmm0, [esi]		; first line of picture pData
-	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_end16x4_sse2 edi, ecx, xmm0, a
-
-	; bottom
-	movdqa xmm1, [eax] 		; last line of picture pData
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-
-	lea esi, [esi+16]		; top pSrc
-	lea edi, [edi+16]		; top dst
-	lea eax, [eax+16]		; bottom pSrc
-	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?
-
-	dec ebx
-	jnz near .top_bottom_loops
-%elif %1 == 16	; for chroma ??
-	mov edx, ebx
-	sar ebx, 04h 	; (width / 16) pixels
-.top_bottom_loops:
-	; top
-	movdqa xmm0, [esi]		; first line of picture pData
-	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_end16x4_sse2 edi, ecx, xmm0, a
-
-	; bottom
-	movdqa xmm1, [eax] 		; last line of picture pData
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-
-	lea esi, [esi+16]		; top pSrc
-	lea edi, [edi+16]		; top dst
-	lea eax, [eax+16]		; bottom pSrc
-	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?
-
-	dec ebx
-	jnz near .top_bottom_loops
-
-	; for remaining 8 bytes
-	and edx, 0fh		; any 8 bytes left?
-	test edx, edx
-	jz near .to_be_continued	; no left to exit here
-
-	; top
-	movq mm0, [esi]		; remained 8 byte
-	mov_line_8x4_mmx edi, ecx, mm0	; dst, stride, mm?
-	mov_line_8x4_mmx edi, ecx, mm0	; dst, stride, mm?
-	mov_line_8x4_mmx edi, ecx, mm0	; dst, stride, mm?
-	mov_line_end8x4_mmx edi, ecx, mm0	; dst, stride, mm?
-	; bottom
-	movq mm1, [eax]
-	mov_line_8x4_mmx ebp, ecx, mm1	; dst, stride, mm?
-	mov_line_8x4_mmx ebp, ecx, mm1	; dst, stride, mm?
-	mov_line_8x4_mmx ebp, ecx, mm1	; dst, stride, mm?
-	mov_line_end8x4_mmx ebp, ecx, mm1	; dst, stride, mm?
-	WELSEMMS
-
-.to_be_continued:
-%endif
-%endmacro
-
-%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
-	; ecx [height]
-	; esi [pSrc+0], 	   edi [pSrc-32], edx [stride], 32(16)	; left
-	; ebx [pSrc+(w-1)], ebp [pSrc+w], 32(16)			; right
-;	xor eax, eax 	; for pixel pData (uint8_t)		; make sure eax=0 at least high 24 bits of eax = 0
-
-%if %1 == 32		; for luma
-.left_right_loops:
-	; left
-	mov al, byte [esi]		; pixel pData for left border
-	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [edi], xmm0
-	movdqa [edi+16], xmm0
-
-	; right
-	mov al, byte [ebx]
-	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [ebp], xmm1
-	movdqa [ebp+16], xmm1
-
-	lea esi, [esi+edx]		; left pSrc
-	lea edi, [edi+edx]		; left dst
-	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst
-
-	dec ecx
-	jnz near .left_right_loops
-%elif %1 == 16	; for chroma ??
-.left_right_loops:
-	; left
-	mov al, byte [esi]		; pixel pData for left border
-	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [edi], xmm0
-
-	; right
-	mov al, byte [ebx]
-	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdq%2 [ebp], xmm1								; might not be aligned 16 bytes in case chroma planes
-
-	lea esi, [esi+edx]		; left pSrc
-	lea edi, [edi+edx]		; left dst
-	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst
-
-	dec ecx
-	jnz near .left_right_loops
-%endif
-%endmacro
-
-%macro exp_cross_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
-	; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
-	; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride
-%if %1 == 32		; luma
-	; TL
-	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
-	mov_line_end32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
-
-	; TR
-	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
-	mov_line_end32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
-
-	; BL
-	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
-	mov_line_end32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
-
-	; BR
-	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
-	mov_line_end32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
-%elif %1 == 16	; chroma
-	; TL
-	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
-	mov_line_end16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
-
-	; TR
-	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
-	mov_line_end16x4_sse2 ebp, ecx, xmm4, %2	; dst, stride, xmm?
-
-	; BL
-	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
-	mov_line_end16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
-
-	; BR
-	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
-	mov_line_end16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
-%endif
-%endmacro
-
-ALIGN 16
-;***********************************************************************----------------
-; void ExpandPictureLuma_sse2(	uint8_t *pDst,
-;									const int32_t kiStride,
-;									const int32_t kiWidth,
-;									const int32_t kiHeight	);
-;***********************************************************************----------------
-ExpandPictureLuma_sse2:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	; for both top and bottom border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	mov esi, [esp+24]						; pDst
-	mov edx, [esp+28]						; kiStride
-	mov ebx, [esp+32]						; kiWidth
-	mov eax, [esp+36]						; kiHeight
-	; also prepare for cross border pData top-left: xmm3
-;	xor ecx, ecx
-	mov cl, byte [esi]
-	butterfly_1to16_sse xmm3, xmm4, c		; pDst, tmp, pSrc [generic register name: a/b/c/d]
-	; load top border
-	mov ecx, edx							; kiStride
-	neg ecx 								; -kiStride
-	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border
-	dec eax									; h-1
-	imul eax, edx 							; (h-1)*kiStride
-	lea eax, [esi+eax]						; last line of picture pData
-	sal edx, 05h							; 32*kiStride
-	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 32 * stride
-	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
-	dec ebx									; kiWidth-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]
-;	xor edx, edx
-	mov dl, byte [eax]						; bottom-left
-	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	mov dl, byte [ebx]						; bottom-right
-	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding
-	mov ebx, [esp+32]						; kiWidth
-	exp_top_bottom_sse2	32
-
-	; for both left and right border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	mov esi, [esp+24]						; p_dst: left border pSrc
-	mov edx, [esp+28]						; kiStride
-	mov ebx, [esp+32]						; kiWidth
-	mov ecx, [esp+36]						; kiHeight
-	; load left border
-	mov eax, -32 							; luma=-32, chroma=-16
-	lea edi, [esi+eax]						; left border dst
-	dec ebx
-	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst
-	; prepare for cross border pData: top-right with xmm4
-;	xor eax, eax
-	mov al, byte [ebx]						; top-right
-	butterfly_1to16_sse xmm4, xmm0, a		; pDst, tmp, pSrc [generic register name: a/b/c/d]
-	; for left & right border expanding
-	exp_left_right_sse2	32, a
-
-	; for cross border [top-left, top-right, bottom-left, bottom-right]
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	mov esi, [esp+24]						; pDst
-	mov ecx, [esp+28]						; kiStride
-	mov ebx, [esp+32]						; kiWidth
-	mov edx, [esp+36]						; kiHeight
-	; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
-	mov eax, -32							; luma=-32, chroma=-16
-	neg ecx										; -stride
-	lea edi, [esi+eax]
-	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]
-	lea ebp, [ebp+ecx]				; last line of top-right border
-	add edx, 32								; height+32(16), luma=32, chroma=16
-	mov ecx, [esp+28]					; kiStride
-	imul edx, ecx							; (height+32(16)) * stride
-	lea eax, [edi+edx]						; last line of bottom-left border
-	lea ebx, [ebp+edx]						; last line of bottom-right border
-	neg ecx										; -kiStride
-	; for left & right border expanding
-	exp_cross_sse2		32, a
-
-;	sfence									; commit cache write back memory
-
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-
-	ret
-
-ALIGN 16
-;***********************************************************************----------------
-; void ExpandPictureChromaAlign_sse2(	uint8_t *pDst,
-;										const int32_t kiStride,
-;										const int32_t kiWidth,
-;										const int32_t kiHeight	);
-;***********************************************************************----------------
-ExpandPictureChromaAlign_sse2:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	; for both top and bottom border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	mov esi, [esp+24]						; pDst
-	mov edx, [esp+28]						; kiStride
-	mov ebx, [esp+32]						; kiWidth
-	mov eax, [esp+36]						; kiHeight
-	; also prepare for cross border pData top-left: xmm3
-;	xor ecx, ecx
-	mov cl, byte [esi]
-	butterfly_1to16_sse xmm3, xmm4, c		; pDst, tmp, pSrc [generic register name: a/b/c/d]
-	; load top border
-	mov ecx, edx							; kiStride
-	neg ecx 								; -kiStride
-	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border
-	dec eax									; h-1
-	imul eax, edx 							; (h-1)*kiStride
-	lea eax, [esi+eax]						; last line of picture pData
-	sal edx, 04h							; 16*kiStride
-	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*kiStride + 16 * kiStride
-	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
-	dec ebx									; kiWidth-1
-	lea ebx, [eax+ebx]						; pDst[w-1][h-1]
-;	xor edx, edx
-	mov dl, byte [eax]						; bottom-left
-	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	mov dl, byte [ebx]						; bottom-right
-	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding
-	mov ebx, [esp+32]						; kiWidth
-	exp_top_bottom_sse2	16
-
-	; for both left and right border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	mov esi, [esp+24]						; pDst: left border pSrc
-	mov edx, [esp+28]						; kiStride
-	mov ebx, [esp+32]						; kiWidth
-	mov ecx, [esp+36]						; kiHeight
-	; load left border
-	mov eax, -16 							; luma=-32, chroma=-16
-	lea edi, [esi+eax]						; left border dst
-	dec ebx
-	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst
-	; prepare for cross border pData: top-right with xmm4
-;	xor eax, eax
-	mov al, byte [ebx]						; top-right
-	butterfly_1to16_sse xmm4, xmm0, a		; pDst, tmp, pSrc [generic register name: a/b/c/d]
-	; for left & right border expanding
-	exp_left_right_sse2	16, a
-
-	; for cross border [top-left, top-right, bottom-left, bottom-right]
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	mov esi, [esp+24]						; pDst
-	mov ecx, [esp+28]						; kiStride
-	mov ebx, [esp+32]						; kiWidth
-	mov edx, [esp+36]						; kiHeight
-	; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
-	mov eax, -16							; chroma=-16
-	neg ecx										; -stride
-	lea edi, [esi+eax]
-	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]
-	lea ebp, [ebp+ecx]				; last line of top-right border
-	mov ecx, [esp+28]						; kiStride
-	add edx, 16							; height+16, luma=32, chroma=16
-	imul edx, ecx							; (kiHeight+16) * kiStride
-	lea eax, [edi+edx]						; last line of bottom-left border
-	lea ebx, [ebp+edx]						; last line of bottom-right border
-	neg ecx										; -kiStride
-	; for left & right border expanding
-	exp_cross_sse2		16, a
-
-;	sfence									; commit cache write back memory
-
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-
-	ret
-
-ALIGN 16
-;***********************************************************************----------------
-; void ExpandPictureChromaUnalign_sse2(	uint8_t *pDst,
-;										const int32_t kiStride,
-;										const int32_t kiWidth,
-;										const int32_t kiHeight	);
-;***********************************************************************----------------
-ExpandPictureChromaUnalign_sse2:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	; for both top and bottom border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	mov esi, [esp+24]						; pDst
-	mov edx, [esp+28]						; kiStride
-	mov ebx, [esp+32]						; kiWidth
-	mov eax, [esp+36]						; kiHeight
-	; also prepare for cross border pData top-left: xmm3
-;	xor ecx, ecx
-	mov cl, byte [esi]
-	butterfly_1to16_sse xmm3, xmm4, c		; pDst, tmp, pSrc [generic register name: a/b/c/d]
-	; load top border
-	mov ecx, edx							; kiStride
-	neg ecx 								; -kiStride
-	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border
-	dec eax									; h-1
-	imul eax, edx 							; (h-1)*kiStride
-	lea eax, [esi+eax]						; last line of picture pData
-	sal edx, 04h							; 16*kiStride
-	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*kiStride + 16 * kiStride
-	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
-	dec ebx									; kiWidth-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]
-;	xor edx, edx
-	mov dl, byte [eax]						; bottom-left
-	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	mov dl, byte [ebx]						; bottom-right
-	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding
-	mov ebx, [esp+32]						; kiWidth
-	exp_top_bottom_sse2	16
-
-	; for both left and right border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	mov esi, [esp+24]						; p_dst: left border pSrc
-	mov edx, [esp+28]						; kiStride
-	mov ebx, [esp+32]						; kiWidth
-	mov ecx, [esp+36]						; kiHeight
-	; load left border
-	mov eax, -16 							; luma=-32, chroma=-16
-	lea edi, [esi+eax]						; left border dst
-	dec ebx
-	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst
-	; prepare for cross border pData: top-right with xmm4
-;	xor eax, eax
-	mov al, byte [ebx]						; top-right
-	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for left & right border expanding
-	exp_left_right_sse2	16, u
-
-	; for cross border [top-left, top-right, bottom-left, bottom-right]
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	mov esi, [esp+24]						; p_dst
-	mov ecx, [esp+28]						; kiStride
-	mov ebx, [esp+32]						; kiWidth
-	mov edx, [esp+36]						; kiHeight
-	; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
-	neg ecx									; -kiStride
-	mov eax, -16							; chroma=-16
-	lea edi, [esi+eax]
-	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]
-	lea ebp, [ebp+ecx]				; last line of top-right border
-	mov ecx, [esp+28]						; kiStride
-	add edx, 16							; kiHeight+16, luma=32, chroma=16
-	imul edx, ecx							; (kiHeight+16) * kiStride
-	lea eax, [edi+edx]						; last line of bottom-left border
-	lea ebx, [ebp+edx]						; last line of bottom-right border
-	neg ecx									; -kiStride
-	; for left & right border expanding
-	exp_cross_sse2		16, u
-
-;	sfence									; commit cache write back memory
-
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-
-	ret
-
--- a/codec/decoder/core/asm/intra_pred.asm
+++ b/codec/decoder/core/asm/intra_pred.asm
@@ -45,7 +45,6 @@
 ;*************************************************************************/
 
 %include "asm_inc.asm"
-BITS 32
 ;*******************************************************************************
 ; Local Data (Read Only)
 ;*******************************************************************************
@@ -166,11 +165,11 @@
 %endmacro
 
 %macro LOAD_2_LEFT_AND_ADD 0
-        lea         eax, [eax+2*ecx]
-        movzx		edx, byte [eax-0x01]
-        add			ebx, edx
-        movzx		edx, byte [eax+ecx-0x01]
-        add			ebx, edx
+        lea         r0, [r0+2*r1]	; advance r0 by two rows (r1 is presumably the row stride)
+        movzx		r3, byte [r0-0x01]	; r3 = byte immediately left of r0
+        add			r2, r3	; accumulate it into the running sum in r2
+        movzx		r3, byte [r0+r1-0x01]	; r3 = byte immediately left of the next row
+        add			r2, r3	; accumulate it as well
 %endmacro
 
 ;*******************************************************************************
@@ -190,32 +189,37 @@
 ;	pPred must align to 16
 ;*******************************************************************************
 WelsI4x4LumaPredH_sse2:
-	mov			eax,	[esp+4]			;pPred
-	mov			ecx,	[esp+8]			;kiStride
+	%assign push_num 0
+	LOAD_2_PARA
+	%ifndef X86_32
+	movsx r1, r1d	; sign-extend the 32-bit stride to the full register width
+	%endif
+	; From here r0 = pPred and r1 = kiStride (presumably loaded by LOAD_2_PARA;
+	; they replace the old eax/ecx loads from [esp+4] and [esp+8]).
 
-	movzx		edx,	byte [eax-1]
-	movd		xmm0,	edx
+	movzx		r2,	byte [r0-1]	; left neighbour of row 0
+	movd		xmm0,	r2d	; low dword of xmm0 = that byte, zero-extended
 	pmuludq		xmm0,	[mmx_01bytes]
 
-	movzx		edx,	byte [eax+ecx-1]
-	movd		xmm1,	edx
+	movzx		r2,	byte [r0+r1-1]	; left neighbour of row 1
+	movd		xmm1,	r2d
 	pmuludq		xmm1,	[mmx_01bytes]
 
-	lea			eax,	[eax+ecx]
-	movzx		edx,	byte [eax+ecx-1]
-	movd		xmm2,	edx
+	lea			r0,	[r0+r1]	; r0 -> row 1
+	movzx		r2,	byte [r0+r1-1]	; left neighbour of row 2
+	movd		xmm2,	r2d
 	pmuludq		xmm2,	[mmx_01bytes]
 
-	movzx		edx,	byte [eax+2*ecx-1]
-	movd		xmm3,	edx
+	movzx		r2,	byte [r0+2*r1-1]	; left neighbour of row 3
+	movd		xmm3,	r2d
 	pmuludq		xmm3,	[mmx_01bytes]
 
-	sub         eax,    ecx
-	movd        [eax], xmm0
-	movd        [eax+ecx], xmm1
-	lea         eax, [eax+2*ecx]
-	movd        [eax], xmm2
-	movd        [eax+ecx], xmm3
+	sub         r0,    r1	; r0 -> row 0 again
+	movd        [r0], xmm0	; write 4 bytes of row 0
+	movd        [r0+r1], xmm1	; write 4 bytes of row 1
+	lea         r0, [r0+2*r1]	; r0 -> row 2
+	movd        [r0], xmm2	; write 4 bytes of row 2
+	movd        [r0+r1], xmm3	; write 4 bytes of row 3
 
 	ret
 
@@ -223,20 +227,28 @@
 ; void_t WelsI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
 WelsI16x16LumaPredPlane_sse2:
-%define pushsize	4
-		push	esi
-		mov		esi,	[esp + pushsize + 4]
-		mov		ecx,	[esp + pushsize + 8]
-		sub		esi,	1
-		sub		esi,	ecx
+		; r3 and r4 are saved here and restored just before ret.
+		push r3	; r3 accumulates the plane terms a and s below
+		push r4	; r4 holds a copy of pPred
+		%assign push_num 2
+		LOAD_2_PARA
+		%ifndef X86_32
+		movsx r1, r1d	; sign-extend the 32-bit stride to the full register width
+		%endif
+		mov r4, r0 ; keep pPred: r0 is modified below and restored from r4 later
+		; Register roles after this change (old 32-bit names in brackets):
+		;   r0 = working pointer [esi], r1 = kiStride [ecx],
+		;   r2 = scratch [eax], r3 = a / s [edx], r4 = saved pPred.
+		sub		r0,	1
+		sub		r0,	r1	; r0 = pPred - 1 - kiStride (one row up, one byte left)
 
 		;for H
 		pxor	xmm7,	xmm7
-		movq	xmm0,	[esi]
+		movq	xmm0,	[r0]	; 8 bytes top[-1..6] (r0 is one byte left of the top row)
 		movdqa	xmm5,	[sse2_plane_dec]
 		punpcklbw xmm0,	xmm7
 		pmullw	xmm0,	xmm5
-		movq	xmm1,	[esi + 9]
+		movq	xmm1,	[r0 + 9]	; 8 bytes top[8..15]
 		movdqa	xmm6,	[sse2_plane_inc]
 		punpcklbw xmm1,	xmm7
 		pmullw	xmm1,	xmm6
@@ -243,25 +255,25 @@
 		psubw	xmm1,	xmm0
 
 		SUMW_HORIZON	xmm1,xmm0,xmm2
-		movd    eax,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
-		movsx	eax,	ax
-		imul	eax,	5
-		add		eax,	32
-		sar		eax,	6			; b = (5 * H + 32) >> 6;
-		SSE2_Copy8Times	xmm1, eax	; xmm1 = b,b,b,b,b,b,b,b
+		movd    r2d,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
+		movsx	r2,	r2w	; keep the low 16-bit lane as a signed value
+		imul	r2,	5
+		add		r2,	32
+		sar		r2,	6			; b = (5 * H + 32) >> 6;
+		SSE2_Copy8Times	xmm1, r2d	; xmm1 = b,b,b,b,b,b,b,b
 
-		movzx	edx,	BYTE [esi+16]
-		sub	esi, 3
-		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, esi, ecx
+		movzx	r3,	BYTE [r0+16]	; r3 = top[15] (r0 still points at top[-1])
+		sub	r0, 3	; pointer bias applied before LOAD_COLUMN (macro defined elsewhere)
+		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, r0, r1
 
-		add		esi,	3
-		movzx	eax,	BYTE [esi+8*ecx]
-		add		edx,	eax
-		shl		edx,	4			;	a = (left[15*kiStride] + top[15]) << 4;
+		add		r0,	3	; undo the bias
+		movzx	r2,	BYTE [r0+8*r1]	; r2 = the left[...] term of a (formula below)
+		add		r3,	r2
+		shl		r3,	4			;	a = (left[15*kiStride] + top[15]) << 4;
 
-		sub	esi, 3
-		add		esi,	ecx
-		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, esi, ecx
+		sub	r0, 3	; re-apply the bias
+		add		r0,	r1	; and step one row down for the second LOAD_COLUMN
+		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, r0, r1
 		pxor	xmm4,	xmm4
 		punpckhbw xmm0,	xmm4
 		pmullw	xmm0,	xmm5
@@ -270,21 +282,22 @@
 		psubw	xmm7,	xmm0
 
 		SUMW_HORIZON   xmm7,xmm0,xmm2
-		movd    eax,   xmm7			; V
-		movsx	eax,	ax
+		movd    r2d,   xmm7			; V
+		movsx	r2,	r2w	; keep the low 16-bit lane as a signed value
 
-		imul	eax,	5
-		add		eax,	32
-		sar		eax,	6				; c = (5 * V + 32) >> 6;
-		SSE2_Copy8Times	xmm4, eax		; xmm4 = c,c,c,c,c,c,c,c
+		imul	r2,	5
+		add		r2,	32
+		sar		r2,	6				; c = (5 * V + 32) >> 6;
+		SSE2_Copy8Times	xmm4, r2d		; xmm4 = c,c,c,c,c,c,c,c
 
-		mov		esi,	[esp + pushsize + 4]
-		add		edx,	16
-		imul	eax,	-7
-		add		edx,	eax				; s = a + 16 + (-7)*c
-		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s
+		; (pPred used to be reloaded from the stack at this point)
+		mov r0, r4	; r0 = pPred again, from the copy kept in r4
+		add		r3,	16
+		imul	r2,	-7	; r2 = -7 * c
+		add		r3,	r2		; s = a + 16 + (-7)*c
+		SSE2_Copy8Times	xmm0, r3d		; xmm0 = s,s,s,s,s,s,s,s
 
-		xor		eax,	eax
+		xor		r2,	r2	; r2 = row counter, starting at 0
 		movdqa	xmm5,	[sse2_plane_inc_minus]
 
 get_i16x16_luma_pred_plane_sse2_1:
@@ -297,14 +310,16 @@
 		paddw	xmm3,	xmm0
 		psraw	xmm3,	5
 		packuswb xmm2,	xmm3
-		movdqa	[esi],	xmm2
+		movdqa	[r0],	xmm2	; aligned 16-byte store of the current row
 		paddw	xmm0,	xmm4
-		add		esi,	ecx
-		inc		eax
-		cmp		eax,	16
+		add		r0,	r1	; next row
+		inc		r2
+		cmp		r2,	16	; loop until 16 rows have been written
 		jnz get_i16x16_luma_pred_plane_sse2_1
 
-		pop		esi
+		; restore the registers pushed at entry, in reverse order
+		pop r4
+		pop r3
 		ret
 
 
@@ -313,32 +328,37 @@
 ; void_t WelsI16x16LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
 
-%macro SSE2_PRED_H_16X16_TWO_LINE_DEC 0
-    lea     eax,	[eax+ecx*2]
+%macro SSE2_PRED_H_16X16_TWO_LINE_DEC 2	; arg 1 = row pointer register, arg 2 = stride register
+    lea     %1,	[%1+%2*2]	; advance the row pointer by two rows
 
-    COPY_16_TIMES eax,	xmm0
-    movdqa  [eax],	xmm0
-    COPY_16_TIMESS eax,	xmm0,	ecx
-    movdqa  [eax+ecx],	xmm0
+    COPY_16_TIMES %1,	xmm0	; fill xmm0 for the first row (macro defined elsewhere)
+    movdqa  [%1],	xmm0	; aligned 16-byte store of the first row
+    COPY_16_TIMESS %1,	xmm0,	%2	; fill xmm0 for the second row (macro defined elsewhere)
+    movdqa  [%1+%2],	xmm0	; aligned 16-byte store of the second row
 %endmacro
 
 WELS_EXTERN WelsI16x16LumaPredH_sse2
 WelsI16x16LumaPredH_sse2:
-    mov     eax, [esp+4]    ; pPred
-    mov     ecx, [esp+8]    ; kiStride
+	%assign push_num 0
+	LOAD_2_PARA
+	%ifndef X86_32
+	movsx r1, r1d	; sign-extend the 32-bit stride to the full register width
+	%endif
+    ; From here r0 = pPred and r1 = kiStride (presumably loaded by LOAD_2_PARA;
+    ; they replace the old eax/ecx loads from [esp+4] and [esp+8]).
 
-    COPY_16_TIMES eax,	xmm0
-    movdqa  [eax],		xmm0
-    COPY_16_TIMESS eax,	xmm0,	ecx
-    movdqa  [eax+ecx],	xmm0
+    COPY_16_TIMES r0,	xmm0	; fill xmm0 for row 0 (macro defined elsewhere)
+    movdqa  [r0],		xmm0	; row 0, aligned 16-byte store
+    COPY_16_TIMESS r0,	xmm0,	r1	; fill xmm0 for row 1 (macro defined elsewhere)
+    movdqa  [r0+r1],	xmm0	; row 1
 
-	SSE2_PRED_H_16X16_TWO_LINE_DEC
-	SSE2_PRED_H_16X16_TWO_LINE_DEC
-	SSE2_PRED_H_16X16_TWO_LINE_DEC
-	SSE2_PRED_H_16X16_TWO_LINE_DEC
-	SSE2_PRED_H_16X16_TWO_LINE_DEC
-	SSE2_PRED_H_16X16_TWO_LINE_DEC
-	SSE2_PRED_H_16X16_TWO_LINE_DEC
+	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1	; rows 2-3
+	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1	; rows 4-5
+	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1	; rows 6-7
+	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1	; rows 8-9
+	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1	; rows 10-11
+	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1	; rows 12-13
+	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1	; rows 14-15
 
     ret
 
@@ -347,36 +367,41 @@
 ;*******************************************************************************
 WELS_EXTERN WelsI16x16LumaPredV_sse2
 WelsI16x16LumaPredV_sse2:
-    mov     edx, [esp+4]    ; pPred
-    mov     ecx, [esp+8]    ; kiStride
+	%assign push_num 0
+	LOAD_2_PARA
+	%ifndef X86_32
+	movsx r1, r1d	; sign-extend the 32-bit stride to the full register width
+	%endif
+    ; From here r0 = pPred and r1 = kiStride (presumably loaded by LOAD_2_PARA;
+    ; they replace the old edx/ecx loads from [esp+4] and [esp+8]).
 
-    sub     edx, ecx
-    movdqa  xmm0, [edx]
+    sub     r0, r1	; r0 -> the row directly above the block
+    movdqa  xmm0, [r0]	; xmm0 = the 16 bytes of that row (aligned load)
 
-    movdqa  [edx+ecx], xmm0
-    lea     edx, [edx+2*ecx]
-    movdqa  [edx],     xmm0
-    movdqa  [edx+ecx], xmm0
-    lea     edx, [edx+2*ecx]
-    movdqa  [edx],     xmm0
-    movdqa  [edx+ecx], xmm0
-    lea     edx, [edx+2*ecx]
-    movdqa  [edx],     xmm0
-    movdqa  [edx+ecx], xmm0
-    lea     edx, [edx+2*ecx]
-    movdqa  [edx],     xmm0
-    movdqa  [edx+ecx], xmm0
-    lea     edx, [edx+2*ecx]
-    movdqa  [edx],     xmm0
-    movdqa  [edx+ecx], xmm0
-    lea     edx, [edx+2*ecx]
-    movdqa  [edx],     xmm0
-    movdqa  [edx+ecx], xmm0
-    lea     edx, [edx+2*ecx]
-    movdqa  [edx],     xmm0
-    movdqa  [edx+ecx], xmm0
-    lea     edx, [edx+2*ecx]
-    movdqa  [edx],     xmm0
+    movdqa  [r0+r1], xmm0	; row 0
+    lea     r0, [r0+2*r1]	; r0 -> row 1
+    movdqa  [r0],     xmm0	; row 1
+    movdqa  [r0+r1], xmm0	; row 2
+    lea     r0, [r0+2*r1]	; r0 -> row 3
+    movdqa  [r0],     xmm0	; row 3
+    movdqa  [r0+r1], xmm0	; row 4
+    lea     r0, [r0+2*r1]	; r0 -> row 5
+    movdqa  [r0],     xmm0	; row 5
+    movdqa  [r0+r1], xmm0	; row 6
+    lea     r0, [r0+2*r1]	; r0 -> row 7
+    movdqa  [r0],     xmm0	; row 7
+    movdqa  [r0+r1], xmm0	; row 8
+    lea     r0, [r0+2*r1]	; r0 -> row 9
+    movdqa  [r0],     xmm0	; row 9
+    movdqa  [r0+r1], xmm0	; row 10
+    lea     r0, [r0+2*r1]	; r0 -> row 11
+    movdqa  [r0],     xmm0	; row 11
+    movdqa  [r0+r1], xmm0	; row 12
+    lea     r0, [r0+2*r1]	; r0 -> row 13
+    movdqa  [r0],     xmm0	; row 13
+    movdqa  [r0+r1], xmm0	; row 14
+    lea     r0, [r0+2*r1]	; r0 -> row 15
+    movdqa  [r0],     xmm0	; row 15
 
     ret
 
@@ -385,19 +410,27 @@
 ;*******************************************************************************
 WELS_EXTERN WelsIChromaPredPlane_sse2
 WelsIChromaPredPlane_sse2:
-%define pushsize	4
-		push	esi
-		mov		esi,	[esp + pushsize + 4]	;pPred
-		mov		ecx,	[esp + pushsize + 8]	;kiStride
-		sub		esi,	1
-		sub		esi,	ecx
+		; r3 holds 'a' and r4 holds the saved pPred below, so both are preserved here
+		push r3
+		push r4
+		%assign push_num 2			; two registers were pushed above
+		LOAD_2_PARA
+		%ifndef X86_32
+		movsx r1, r1d				; widen the 32-bit kiStride to the full register
+		%endif
+		mov r4, r0					; keep pPred; r0 is moved around below
+		; r0 = pPred    (replaces esi, formerly [esp + pushsize + 4])
+		; r1 = kiStride (replaces ecx, formerly [esp + pushsize + 8])
+		; r2 / r3 replace eax / edx as scratch registers
+		sub		r0,	1
+		sub		r0,	r1				; r0 = pPred - kiStride - 1
 
 		pxor	mm7,	mm7
-		movq	mm0,	[esi]
+		movq	mm0,	[r0]
 		movq	mm5,	[sse2_plane_dec_c]
 		punpcklbw mm0,	mm7
 		pmullw	mm0,	mm5
-		movq	mm1,	[esi + 5]
+		movq	mm1,	[r0 + 5]
 		movq	mm6,	[sse2_plane_inc_c]
 		punpcklbw mm1,	mm7
 		pmullw	mm1,	mm6
@@ -406,25 +439,25 @@
 		movq2dq xmm1,   mm1
 		pxor    xmm2,   xmm2
 		SUMW_HORIZON	xmm1,xmm0,xmm2
-		movd    eax,	xmm1
-		movsx	eax,	ax
-		imul	eax,	17
-		add		eax,	16
-		sar		eax,	5			; b = (17 * H + 16) >> 5;
-		SSE2_Copy8Times	xmm1, eax	; mm1 = b,b,b,b,b,b,b,b
+		movd    r2d,	xmm1
+		movsx	r2,	r2w			; H is a signed 16-bit value
+		imul	r2,	17
+		add		r2,	16
+		sar		r2,	5			; b = (17 * H + 16) >> 5;
+		SSE2_Copy8Times	xmm1, r2d	; xmm1 = b,b,b,b,b,b,b,b
 
-		movzx	edx,	BYTE [esi+8]
-		sub	esi, 3
-		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, esi, ecx
+		movzx	r3,	BYTE [r0+8]	; r3 = top[7]
+		sub	r0, 3
+		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, r0, r1
 
-		add		esi,	3
-		movzx	eax,	BYTE [esi+4*ecx]
-		add		edx,	eax
-		shl		edx,	4			; a = (left[7*kiStride] + top[7]) << 4;
+		add		r0,	3
+		movzx	r2,	BYTE [r0+4*r1]
+		add		r3,	r2
+		shl		r3,	4			; a = (left[7*kiStride] + top[7]) << 4;
 
-		sub	esi, 3
-		add		esi,	ecx
-		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, esi, ecx
+		sub	r0, 3
+		add		r0,	r1
+		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, r0, r1
 		pxor	mm4,	mm4
 		punpckhbw mm0,	mm4
 		pmullw	mm0,	mm5
@@ -435,21 +468,22 @@
 		movq2dq xmm7,   mm7
 		pxor    xmm2,   xmm2
 		SUMW_HORIZON	xmm7,xmm0,xmm2
-		movd    eax,    xmm7			; V
-		movsx	eax,	ax
+		movd    r2d,    xmm7			; V
+		movsx	r2,	r2w				; V is a signed 16-bit value
 
-		imul	eax,	17
-		add		eax,	16
-		sar		eax,	5				; c = (17 * V + 16) >> 5;
-		SSE2_Copy8Times	xmm4, eax		; mm4 = c,c,c,c,c,c,c,c
+		imul	r2,	17
+		add		r2,	16
+		sar		r2,	5				; c = (17 * V + 16) >> 5;
+		SSE2_Copy8Times	xmm4, r2d		; xmm4 = c,c,c,c,c,c,c,c
 
-		mov		esi,	[esp + pushsize + 4]
-		add		edx,	16
-		imul	eax,	-3
-		add		edx,	eax				; s = a + 16 + (-3)*c
-		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s
+		; pPred is taken back from r4 instead of being reloaded from the stack
+		mov 	r0, r4 				; r0 = pPred again for the store loop
+		add		r3,	16
+		imul	r2,	-3
+		add		r3,	r2				; s = a + 16 + (-3)*c
+		SSE2_Copy8Times	xmm0, r3d		; xmm0 = s,s,s,s,s,s,s,s
 
-		xor		eax,	eax
+		xor		r2,	r2				; r2 = row counter for the loop below
 		movdqa	xmm5,	[sse2_plane_mul_b_c]
 
 get_i_chroma_pred_plane_sse2_1:
@@ -458,14 +492,16 @@
 		paddw	xmm2,	xmm0
 		psraw	xmm2,	5
 		packuswb xmm2,	xmm2
-		movq	[esi],	xmm2
+		movq	[r0],	xmm2			; write one 8-pixel row
 		paddw	xmm0,	xmm4
-		add		esi,	ecx
-		inc		eax
-		cmp		eax,	8
+		add		r0,	r1				; next row
+		inc		r2
+		cmp		r2,	8				; 8 rows in total
 		jnz get_i_chroma_pred_plane_sse2_1
 
-		pop		esi
+		; restore in reverse order of the pushes at the top
+		pop r4
+		pop r3
 		WELSEMMS
 		ret
 
@@ -483,27 +519,33 @@
 ;
 ;*******************************************************************************
 WelsI4x4LumaPredDDR_mmx:
-	mov			edx,[esp+4]			;pPred
-	mov         eax,edx
-	mov			ecx,[esp+8]		;kiStride
+	%assign push_num 0			; no registers are pushed before LOAD_2_PARA
+	LOAD_2_PARA
+	%ifndef X86_32
+	movsx r1, r1d				; widen the 32-bit kiStride to the full register
+	%endif
+	mov r2, r0					; r2 walks the neighbours, r0 is kept for the stores
+	; r0 = pPred                 (replaces edx)
+	; r2 = working copy of pPred (replaces eax)
+	; r1 = kiStride              (replaces ecx)
 
-	movq        mm1,[eax+ecx-8]		;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
-	movq        mm2,[eax-8]			;get value of 6 mm2[8] = 6
-	sub			eax, ecx			;mov eax to above line of current block(postion of 1)
-	punpckhbw   mm2,[eax-8]			;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
-	movd        mm3,[eax]			;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
+	movq        mm1,[r2+r1-8]		;get value of 11; reading from offset -8 puts it in the top byte: mm1[8] = 11
+	movq        mm2,[r2-8]			;get value of 6 mm2[8] = 6
+	sub		r2, r1			;move r2 to the line above the current block (position of 1)
+	punpckhbw   mm2,[r2-8]			;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
+	movd        mm3,[r2]			;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
 	punpckhwd   mm1,mm2				;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
 	psllq       mm3,18h				;mm3[5]=[1]
 	psrlq       mm1,28h				;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
 	por         mm3,mm1				;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
 	movq        mm1,mm3				;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
-	lea			eax,[eax+ecx*2-8h]		;set eax point to 12
-	movq        mm4,[eax+ecx]		;get value of 16, mm4[8]=[16]
+	lea 		r2,[r2+r1*2-8h]		;set r2 to point to 12
+	movq        mm4,[r2+r1]		;get value of 16, mm4[8]=[16]
 	psllq       mm3,8				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
 	psrlq       mm4,38h				;mm4[1]=[16]
 	por         mm3,mm4				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
 	movq        mm2,mm3				;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
-	movq        mm4,[eax+ecx*2]		;mm4[8]=[21]
+	movq        mm4,[r2+r1*2]		;mm4[8]=[21]
 	psllq       mm3,8				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
 	psrlq       mm4,38h				;mm4[1]=[21]
 	por         mm3,mm4				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
@@ -514,15 +556,15 @@
 	psubusb     mm3,mm1				;decrease 1 from odd bytes
 	pavgb       mm2,mm3				;mm2=(([11]+[21]+1)/2+1+[16])/2
 
-	lea         edx,[edx+ecx]
-	movd        [edx+2*ecx],mm2
-	sub         edx,ecx
+	lea         r0,[r0+r1]
+	movd        [r0+2*r1],mm2		; row 3 (r0 is temporarily pPred + kiStride)
+	sub         r0,r1				; back to pPred
 	psrlq       mm2,8
-	movd        [edx+2*ecx],mm2
+	movd        [r0+2*r1],mm2		; row 2
 	psrlq       mm2,8
-	movd        [edx+ecx],mm2
+	movd        [r0+r1],mm2		; row 1
 	psrlq       mm2,8
-	movd        [edx],mm2
+	movd        [r0],mm2			; row 0
 	WELSEMMS
 	ret
 
@@ -540,41 +582,52 @@
 ;
 ;*******************************************************************************
 WelsI4x4LumaPredDc_sse2:
-	mov         eax,[esp+4]			;pPred
-	mov			ecx,[esp+8]			;kiStride
-	push		ebx
+	push r3
+	push r4
+	%assign push_num 2			; two registers were pushed above
+	LOAD_2_PARA
+	%ifndef X86_32
+	movsx r1, r1d				; widen the 32-bit kiStride to the full register
+	%endif
+	mov r4, r0					; keep pPred for the stores at the end
+	; r0 = pPred (replaces eax), r1 = kiStride (replaces ecx)
+	; r2 = one neighbour byte at a time (replaces edx)
+	; r3 = running sum (replaces ebx)
 
-	movzx		edx,	byte [eax-1h]
+	movzx		r2,	byte [r0-1h]		; left neighbour of row 0
 
-	sub			eax,	ecx
-	movd		xmm0,	[eax]
+	sub			r0,	r1				; r0 = pPred - kiStride
+	movd		xmm0,	[r0]			; the 4 top neighbours
 	pxor		xmm1,	xmm1
 	psadbw		xmm0,	xmm1
 
-	movd		ebx,	xmm0
-	add			ebx,	edx
+	movd		r3d,	xmm0			; r3 = sum of the 4 top neighbours
+	add			r3,	r2
 
-	movzx		edx,	byte [eax+ecx*2-1h]
-	add			ebx,	edx
+	movzx		r2,	byte [r0+r1*2-1h]	; left neighbour of row 1
+	add			r3,	r2
 
-	lea			eax,	[eax+ecx*2-1]
-	movzx		edx,	byte [eax+ecx]
-	add			ebx,	edx
+	lea			r0,	[r0+r1*2-1]
+	movzx		r2,	byte [r0+r1]		; left neighbour of row 2
+	add			r3,	r2
 
-	movzx		edx,	byte [eax+ecx*2]
-	add			ebx,	edx
-	add			ebx,	4
-	sar			ebx,	3
-	imul		ebx,	0x01010101
+	movzx		r2,	byte [r0+r1*2]		; left neighbour of row 3
+	add			r3,	r2
+	add			r3,	4
+	sar			r3,	3					; dc = (sum of 8 neighbours + 4) >> 3
+	imul		r3,	0x01010101			; replicate dc into all 4 bytes of r3d
 
-	mov			edx,	[esp+8]			;pPred
-	mov         [edx],       ebx
-	mov         [edx+ecx],   ebx
-	mov         [edx+2*ecx], ebx
-	lea         edx, [edx+2*ecx]
-	mov         [edx+ecx],   ebx
+	; pPred comes back from r4 instead of being reloaded from the stack
+	mov			r0, r4
+	mov         [r0],       r3d
+	mov         [r0+r1],   r3d
+	mov         [r0+2*r1], r3d
+	lea         r0, [r0+2*r1]
+	mov         [r0+r1],   r3d
 
-	pop ebx
+	; restore in reverse order of the pushes at the top
+	pop r4
+	pop r3
 	ret
 
 ALIGN 16
@@ -592,7 +645,7 @@
 %endmacro
 
 %macro MMX_PRED_H_8X8_ONE_LINEE 4
-	movq		%1,		[%3+ecx-8]
+	movq		%1,		[%3+r1-8]
 	psrlq		%1,		38h
 
 	pmullw		%1,		[mmx_01bytes]
@@ -602,60 +655,47 @@
 
 WELS_EXTERN WelsIChromaPredH_mmx
 WelsIChromaPredH_mmx:
-	mov			edx,	[esp+4]			;pPred
-	mov         eax,	edx
-	mov			ecx,	[esp+8]			;kiStride
+	%assign push_num 0			; no registers are pushed before LOAD_2_PARA
+	LOAD_2_PARA
+	%ifndef X86_32
+	movsx r1, r1d				; widen the 32-bit kiStride to the full register
+	%endif
+	mov r2, r0					; r2 = read pointer, r0 = write pointer
+	; r0 = pPred                 (replaces edx)
+	; r2 = working copy of pPred (replaces eax)
+	; r1 = kiStride              (replaces ecx; also used inside MMX_PRED_H_8X8_ONE_LINEE)
 
-	movq		mm0,	[eax-8]
+	movq		mm0,	[r2-8]			; the left neighbour of row 0 ends up in the top byte
 	psrlq		mm0,	38h
 
 	pmullw		mm0,		[mmx_01bytes]
 	pshufw		mm0,	mm0,	0
-	movq		[edx],	mm0
+	movq		[r0],	mm0
 
-	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
+	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
 
-	lea			eax, [eax+ecx*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax, edx+2*ecx
+	lea			r2, [r2+r1*2]
+	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r2, r0+2*r1
 
-	lea         edx, [edx+2*ecx]
-	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
+	lea         r0, [r0+2*r1]
+	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
 
-	lea			eax, [eax+ecx*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax, edx+2*ecx
+	lea			r2, [r2+r1*2]
+	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r2, r0+2*r1
 
-	lea         edx, [edx+2*ecx]
-	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
+	lea         r0, [r0+2*r1]
+	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
 
-	lea			eax, [eax+ecx*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax, edx+2*ecx
+	lea			r2, [r2+r1*2]
+	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r2, r0+2*r1
 
-    lea         edx, [edx+2*ecx]
-	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
+    	lea         r0, [r0+2*r1]
+	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
 
 	WELSEMMS
 	ret
 
-ALIGN 16
-;*******************************************************************************
-;	void_t __cdecl get_i4x4_luma_pred_v_asm(uint8_t *pPred, const int32_t kiStride)
-;   copy pixels from top 4 pixels
-;*******************************************************************************
-WELS_EXTERN get_i4x4_luma_pred_v_asm
-get_i4x4_luma_pred_v_asm:
-	mov			eax,	[esp+4]        ;pPred
-	mov			ecx,	[esp+8]        ;kiStride
 
-	sub			eax,	ecx
-	mov         edx,    [eax]
-	mov		    [eax+ecx],	 edx
-	mov			[eax+2*ecx], edx
-	lea         eax, [eax+2*ecx]
-	mov			[eax+ecx],	 edx
-	mov			[eax+2*ecx], edx
-
-	ret
-
 ALIGN 16
 ;*******************************************************************************
 ;	void_t __cdecl WelsIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -663,23 +703,28 @@
 ;*******************************************************************************
 WELS_EXTERN WelsIChromaPredV_mmx
 WelsIChromaPredV_mmx:
-	mov			eax,		[esp+4]    ;pPred
-	mov			ecx,		[esp+8]    ;kiStride
+	%assign push_num 0			; no registers are pushed before LOAD_2_PARA
+	LOAD_2_PARA
+	%ifndef X86_32
+	movsx r1, r1d				; widen the 32-bit kiStride to the full register
+	%endif
+	; r0 = pPred    (replaces eax, formerly loaded from [esp+4])
+	; r1 = kiStride (replaces ecx, formerly loaded from [esp+8])
 
-	sub			eax,		ecx
-	movq		mm0,		[eax]
+	sub			r0,		r1			; r0 -> row above the block
+	movq		mm0,		[r0]		; mm0 = the 8 bytes of that row
 
-	movq		[eax+ecx],		mm0
-	movq		[eax+2*ecx],	mm0
-	lea         eax, [eax+2*ecx]
-	movq		[eax+ecx],      mm0
-	movq		[eax+2*ecx],    mm0
-	lea         eax, [eax+2*ecx]
-	movq		[eax+ecx],      mm0
-	movq		[eax+2*ecx],    mm0
-	lea         eax, [eax+2*ecx]
-	movq		[eax+ecx],      mm0
-	movq		[eax+2*ecx],    mm0
+	movq		[r0+r1],		mm0		; first of 8 stores: copy the top row into every row of the block
+	movq		[r0+2*r1],	mm0
+	lea         r0, [r0+2*r1]
+	movq		[r0+r1],      mm0
+	movq		[r0+2*r1],    mm0
+	lea         r0, [r0+2*r1]
+	movq		[r0+r1],      mm0
+	movq		[r0+2*r1],    mm0
+	lea         r0, [r0+2*r1]
+	movq		[r0+r1],      mm0
+	movq		[r0+2*r1],    mm0		; 8th and last row
 
 	WELSEMMS
 	ret
@@ -717,18 +762,24 @@
 ;*******************************************************************************
 WELS_EXTERN WelsI4x4LumaPredHD_mmx
 WelsI4x4LumaPredHD_mmx:
-	mov			edx, [esp+4]			; pPred
-	mov         eax, edx
-	mov			ecx, [esp+8]            ; kiStride
-	sub         eax, ecx
-	movd        mm0, [eax-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
+	%assign push_num 0			; no registers are pushed before LOAD_2_PARA
+	LOAD_2_PARA
+	%ifndef X86_32
+	movsx r1, r1d				; widen the 32-bit kiStride to the full register
+	%endif
+	mov r2, r0					; r2 walks the neighbours, r0 is kept for the stores
+	; r0 = pPred                 (replaces edx)
+	; r2 = working copy of pPred (replaces eax)
+	; r1 = kiStride              (replaces ecx)
+	sub         r2, r1                 ; r2 -> row above the block
+	movd        mm0, [r2-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
 	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
 
-	movd        mm1, [eax+2*ecx-4]
-	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1
-	lea         eax, [eax+2*ecx]
-	movd        mm2, [eax+2*ecx-4]
-	punpcklbw   mm2, [eax+ecx-4]        ; mm2[7] = l2, mm2[6] = l3
+	movd        mm1, [r2+2*r1-4]
+	punpcklbw   mm1, [r2+r1-4]        ; mm1[7] = l0, mm1[6] = l1
+	lea         r2, [r2+2*r1]
+	movd        mm2, [r2+2*r1-4]
+	punpcklbw   mm2, [r2+r1-4]        ; mm2[7] = l2, mm2[6] = l3
 	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
 	psrlq       mm2, 20h
 	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
@@ -758,14 +809,14 @@
 	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
 	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
 
-	movd        [edx], mm2
-	lea         edx, [edx+ecx]
-	movd        [edx+2*ecx], mm3
-	sub         edx, ecx
+	movd        [r0], mm2              ; row 0
+	lea         r0, [r0+r1]
+	movd        [r0+2*r1], mm3         ; row 3 (r0 is temporarily pPred + kiStride)
+	sub         r0, r1                 ; back to pPred
 	psrlq       mm3, 10h
-	movd        [edx+2*ecx], mm3
+	movd        [r0+2*r1], mm3         ; row 2
 	psrlq       mm3, 10h
-	movd        [edx+ecx], mm3
+	movd        [r0+r1], mm3           ; row 1
 	WELSEMMS
 	ret
 
@@ -800,15 +851,21 @@
 ;*******************************************************************************
 WELS_EXTERN WelsI4x4LumaPredHU_mmx
 WelsI4x4LumaPredHU_mmx:
-	mov			edx, [esp+4]			; pPred
-	mov         eax, edx
-	mov			ecx, [esp+8]            ; kiStride
+	%assign push_num 0			; no registers are pushed before LOAD_2_PARA
+	LOAD_2_PARA
+	%ifndef X86_32
+	movsx r1, r1d				; widen the 32-bit kiStride to the full register
+	%endif
+	mov r2, r0					; r2 walks the left neighbours, r0 is kept for the stores
+	; r0 = pPred                 (replaces edx)
+	; r2 = working copy of pPred (replaces eax)
+	; r1 = kiStride              (replaces ecx)
 
-	movd        mm0, [eax-4]            ; mm0[3] = l0
-	punpcklbw   mm0, [eax+ecx-4]        ; mm0[7] = l1, mm0[6] = l0
-	lea         eax, [eax+2*ecx]
-	movd        mm2, [eax-4]            ; mm2[3] = l2
-	movd        mm4, [eax+ecx-4]        ; mm4[3] = l3
+	movd        mm0, [r2-4]            ; mm0[3] = l0
+	punpcklbw   mm0, [r2+r1-4]        ; mm0[7] = l1, mm0[6] = l0
+	lea         r2, [r2+2*r1]
+	movd        mm2, [r2-4]            ; mm2[3] = l2
+	movd        mm4, [r2+r1-4]        ; mm4[3] = l3
 	punpcklbw   mm2, mm4
 	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
 
@@ -841,15 +898,15 @@
 	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
 
 	psrlq       mm4, 20h
-	lea         edx, [edx+ecx]
-	movd        [edx+2*ecx], mm4
+	lea         r0, [r0+r1]
+	movd        [r0+2*r1], mm4         ; row 3 (r0 is temporarily pPred + kiStride)
 
-	sub         edx, ecx
-	movd        [edx], mm1
+	sub         r0, r1                 ; back to pPred
+	movd        [r0], mm1              ; row 0
 	psrlq       mm1, 10h
-	movd        [edx+ecx], mm1
+	movd        [r0+r1], mm1           ; row 1
 	psrlq       mm1, 10h
-	movd        [edx+2*ecx], mm1
+	movd        [r0+2*r1], mm1         ; row 2
 	WELSEMMS
 	ret
 
@@ -886,17 +943,23 @@
 ;*******************************************************************************
 WELS_EXTERN WelsI4x4LumaPredVR_mmx
 WelsI4x4LumaPredVR_mmx:
-	mov			edx, [esp+4]			; pPred
-	mov         eax, edx
-	mov			ecx, [esp+8]            ; kiStride
-	sub         eax, ecx
-	movq        mm0, [eax-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
+	%assign push_num 0			; no registers are pushed before LOAD_2_PARA
+	LOAD_2_PARA
+	%ifndef X86_32
+	movsx r1, r1d				; widen the 32-bit kiStride to the full register
+	%endif
+	mov r2, r0					; r2 walks the neighbours, r0 is kept for the stores
+	; r0 = pPred                 (replaces edx)
+	; r2 = working copy of pPred (replaces eax)
+	; r1 = kiStride              (replaces ecx)
+	sub         r2, r1                 ; r2 -> row above the block
+	movq        mm0, [r2-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
 	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
 
-	movd        mm1, [eax+2*ecx-4]
-	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1
-	lea         eax, [eax+2*ecx]
-	movq        mm2, [eax+ecx-8]        ; mm2[7] = l2
+	movd        mm1, [r2+2*r1-4]
+	punpcklbw   mm1, [r2+r1-4]        ; mm1[7] = l0, mm1[6] = l1
+	lea         r2, [r2+2*r1]
+	movq        mm2, [r2+r1-8]        ; mm2[7] = l2
 	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
 	psrlq       mm2, 28h
 	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
@@ -920,10 +983,10 @@
 	movq        mm2, mm3
 
 	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
-	movd        [edx], mm1
+	movd        [r0], mm1              ; row 0
 
 	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
-	movd        [edx+ecx], mm2
+	movd        [r0+r1], mm2           ; row 1
 
 	movq        mm4, mm3
 	psllq       mm4, 20h
@@ -935,12 +998,12 @@
 
 	psllq       mm1, 8h
 	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
-	movd        [edx+2*ecx], mm4
+	movd        [r0+2*r1], mm4         ; row 2
 
 	psllq       mm2, 8h
 	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
-	lea         edx, [edx+2*ecx]
-	movd        [edx+ecx], mm5
+	lea         r0, [r0+2*r1]
+	movd        [r0+r1], mm5           ; row 3
 	WELSEMMS
 	ret
 
@@ -973,11 +1036,17 @@
 ;*******************************************************************************
 WELS_EXTERN WelsI4x4LumaPredDDL_mmx
 WelsI4x4LumaPredDDL_mmx:
-	mov			edx, [esp+4]			; pPred
-	mov         eax, edx
-	mov			ecx, [esp+8]            ; kiStride
-	sub         eax, ecx
-	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+	%assign push_num 0			; no registers are pushed before LOAD_2_PARA
+	LOAD_2_PARA
+	%ifndef X86_32
+	movsx r1, r1d				; widen the 32-bit kiStride to the full register
+	%endif
+	mov r2, r0					; r2 reads the top row, r0 is kept for the stores
+	; r0 = pPred                 (replaces edx)
+	; r2 = working copy of pPred (replaces eax)
+	; r1 = kiStride              (replaces ecx)
+	sub         r2, r1                 ; r2 -> row above the block
+	movq        mm0, [r2]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
 	movq        mm1, mm0
 	movq        mm2, mm0
 
@@ -998,14 +1067,14 @@
 	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
 
 	psrlq       mm0, 8h
-	movd        [edx], mm0
+	movd        [r0], mm0              ; row 0
 	psrlq       mm0, 8h
-	movd        [edx+ecx], mm0
+	movd        [r0+r1], mm0           ; row 1
 	psrlq       mm0, 8h
-	movd        [edx+2*ecx], mm0
+	movd        [r0+2*r1], mm0         ; row 2
 	psrlq       mm0, 8h
-	lea         edx, [edx+2*ecx]
-	movd        [edx+ecx], mm0
+	lea         r0, [r0+2*r1]
+	movd        [r0+r1], mm0           ; row 3
 	WELSEMMS
 	ret
 
@@ -1042,12 +1111,18 @@
 ;*******************************************************************************
 WELS_EXTERN WelsI4x4LumaPredVL_mmx
 WelsI4x4LumaPredVL_mmx:
-	mov			edx, [esp+4]			; pPred
-	mov         eax, edx
-	mov			ecx, [esp+8]            ; kiStride
+	%assign push_num 0			; no registers are pushed before LOAD_2_PARA
+	LOAD_2_PARA
+	%ifndef X86_32
+	movsx r1, r1d				; widen the 32-bit kiStride to the full register
+	%endif
+	mov r2, r0					; r2 reads the top row, r0 is kept for the stores
+	; r0 = pPred                 (replaces edx)
+	; r2 = working copy of pPred (replaces eax)
+	; r1 = kiStride              (replaces ecx)
 
-	sub         eax, ecx
-	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+	sub         r2, r1                 ; r2 -> row above the block
+	movq        mm0, [r2]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
 	movq        mm1, mm0
 	movq        mm2, mm0
 
@@ -1065,14 +1140,14 @@
 
 	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
 
-	movd        [edx], mm3
+	movd        [r0], mm3              ; row 0
 	psrlq       mm3, 8h
-	movd        [edx+2*ecx], mm3
+	movd        [r0+2*r1], mm3         ; row 2
 
-	movd        [edx+ecx], mm2
+	movd        [r0+r1], mm2           ; row 1
 	psrlq       mm2, 8h
-	lea         edx, [edx+2*ecx]
-	movd        [edx+ecx], mm2
+	lea         r0, [r0+2*r1]
+	movd        [r0+r1], mm2           ; row 3
 	WELSEMMS
 	ret
 
@@ -1083,34 +1158,42 @@
 ;*******************************************************************************
 WELS_EXTERN WelsIChromaPredDc_sse2
 WelsIChromaPredDc_sse2:
-	push        ebx
-	mov         eax, [esp+8]			; pPred
-	mov			ecx, [esp+12]           ; kiStride
+	push 	r3
+	push 	r4
+	%assign push_num 2			; two registers were pushed above
+	LOAD_2_PARA
+	%ifndef X86_32
+	movsx r1, r1d				; widen the 32-bit kiStride to the full register
+	%endif
+	mov r4, r0					; keep pPred for the stores at the end
+	; r0 = pPred (replaces eax), r1 = kiStride (replaces ecx)
+	; r2 = running sum of left neighbours (replaces ebx)
+	; r3 = one neighbour byte at a time   (replaces edx)
 
-	sub         eax, ecx
-	movq        mm0, [eax]
+	sub         r0, r1                 ; r0 -> row above the block
+	movq        mm0, [r0]              ; mm0 = the 8 top neighbours
 
-	movzx		ebx, byte [eax+ecx-0x01] ; l1
-	lea         eax, [eax+2*ecx]
-	movzx		edx, byte [eax-0x01]     ; l2
-	add			ebx, edx
-	movzx		edx, byte [eax+ecx-0x01] ; l3
-	add			ebx, edx
-	lea         eax, [eax+2*ecx]
-	movzx		edx, byte [eax-0x01]     ; l4
-	add			ebx, edx
-	movd        mm1, ebx                 ; mm1 = l1+l2+l3+l4
+	movzx		r2, byte [r0+r1-0x01] ; l1
+	lea         r0, [r0+2*r1]
+	movzx		r3, byte [r0-0x01]     ; l2
+	add			r2, r3
+	movzx		r3, byte [r0+r1-0x01] ; l3
+	add			r2, r3
+	lea         r0, [r0+2*r1]
+	movzx		r3, byte [r0-0x01]     ; l4
+	add			r2, r3
+	movd        mm1, r2d                 ; mm1 = l1+l2+l3+l4
 
-	movzx		ebx, byte [eax+ecx-0x01] ; l5
-	lea         eax, [eax+2*ecx]
-	movzx		edx, byte [eax-0x01]     ; l6
-	add			ebx, edx
-	movzx		edx, byte [eax+ecx-0x01] ; l7
-	add			ebx, edx
-	lea         eax, [eax+2*ecx]
-	movzx		edx, byte [eax-0x01]     ; l8
-	add			ebx, edx
-	movd        mm2, ebx                 ; mm2 = l5+l6+l7+l8
+	movzx		r2, byte [r0+r1-0x01] ; l5
+	lea         r0, [r0+2*r1]
+	movzx		r3, byte [r0-0x01]     ; l6
+	add			r2, r3
+	movzx		r3, byte [r0+r1-0x01] ; l7
+	add			r2, r3
+	lea         r0, [r0+2*r1]
+	movzx		r3, byte [r0-0x01]     ; l8
+	add			r2, r3
+	movd        mm2, r2d                 ; mm2 = l5+l6+l7+l8
 
 	movq        mm3, mm0
 	psrlq       mm0, 0x20
@@ -1150,22 +1233,24 @@
 	psllq       mm1, 0x20
 	pxor        mm1, mm2                 ; mm2 = m_down
 
-	mov         edx, [esp+8]			 ; pPred
+	; r4 still holds pPred, so it is not reloaded from the stack
 
-	movq        [edx],       mm0
-	movq        [edx+ecx],   mm0
-	movq        [edx+2*ecx], mm0
-	lea         edx, [edx+2*ecx]
-	movq        [edx+ecx],   mm0
+	movq        [r4],       mm0		; rows 0-3 receive mm0
+	movq        [r4+r1],   mm0
+	movq        [r4+2*r1], mm0
+	lea         r4, [r4+2*r1]
+	movq        [r4+r1],   mm0
 
-	movq        [edx+2*ecx], mm1
-	lea         edx, [edx+2*ecx]
-	movq        [edx+ecx],   mm1
-	movq        [edx+2*ecx], mm1
-	lea         edx, [edx+2*ecx]
-	movq        [edx+ecx],   mm1
+	movq        [r4+2*r1], mm1		; rows 4-7 receive mm1
+	lea         r4, [r4+2*r1]
+	movq        [r4+r1],   mm1
+	movq        [r4+2*r1], mm1
+	lea         r4, [r4+2*r1]
+	movq        [r4+r1],   mm1
 
-	pop         ebx
+	; restore in reverse order of the pushes at the top
+	pop r4
+	pop r3
 	WELSEMMS
 	ret
 
@@ -1178,12 +1263,19 @@
 ;*******************************************************************************
 WELS_EXTERN WelsI16x16LumaPredDc_sse2
 WelsI16x16LumaPredDc_sse2:
-	push        ebx
-	mov         eax, [esp+8]			; pPred
-	mov			ecx, [esp+12]           ; kiStride
-
-	sub         eax, ecx
-	movdqa      xmm0, [eax]             ; read one row
+	; r0 = pPred (replaces eax), r1 = kiStride (replaces ecx)
+	; r2 = running sum of left neighbours (replaces ebx), r3 = scratch (replaces edx)
+	; r4 = saved pPred for the stores (replaces the reload from [esp+8])
+	push 	r3
+	push 	r4
+	%assign push_num 2			; two registers were pushed above
+	LOAD_2_PARA
+	%ifndef X86_32
+	movsx r1, r1d				; widen the 32-bit kiStride to the full register
+	%endif
+	mov r4, r0					; keep pPred; r0 is advanced while reading neighbours
+	sub         r0, r1                 ; r0 -> row above the block
+	movdqa      xmm0, [r0]             ; read one row
 	pxor		xmm1, xmm1
 	psadbw		xmm0, xmm1
 	movdqa      xmm1, xmm0
@@ -1192,10 +1284,10 @@
 	psrldq      xmm0, 0x08
 	paddw       xmm0, xmm1
 
-	movzx		ebx, byte [eax+ecx-0x01]
-	movzx		edx, byte [eax+2*ecx-0x01]
-	add			ebx, edx
-	lea         eax, [eax+ecx]
+	movzx		r2, byte [r0+r1-0x01]		; left neighbour of row 0
+	movzx		r3, byte [r0+2*r1-0x01]	; left neighbour of row 1
+	add		r2, r3
+	lea    		r0, [r0+r1]
 	LOAD_2_LEFT_AND_ADD
 	LOAD_2_LEFT_AND_ADD
 	LOAD_2_LEFT_AND_ADD
@@ -1203,47 +1295,49 @@
 	LOAD_2_LEFT_AND_ADD
 	LOAD_2_LEFT_AND_ADD
 	LOAD_2_LEFT_AND_ADD
-	add         ebx, 0x10
-	movd        xmm1, ebx
+	add         r2, 0x10               ; rounding term for the >> 5 below
+	movd        xmm1, r2d
 	paddw       xmm0, xmm1
 	psrld       xmm0, 0x05
 	pmuludq     xmm0, [mmx_01bytes]
 	pshufd      xmm0, xmm0, 0
 
-	mov         edx, [esp+8]			; pPred
+	; r4 still holds pPred, so it is not reloaded from the stack
 
-	movdqa      [edx],       xmm0
-	movdqa      [edx+ecx],   xmm0
-	movdqa      [edx+2*ecx], xmm0
-	lea         edx,         [edx+2*ecx]
+	movdqa      [r4],       xmm0		; first of 16 stores of the DC row
+	movdqa      [r4+r1],   xmm0
+	movdqa      [r4+2*r1], xmm0
+	lea         r4,         [r4+2*r1]
 
-	movdqa      [edx+ecx],   xmm0
-	movdqa      [edx+2*ecx], xmm0
-	lea         edx,         [edx+2*ecx]
+	movdqa      [r4+r1],   xmm0
+	movdqa      [r4+2*r1], xmm0
+	lea         r4,         [r4+2*r1]
 
-	movdqa      [edx+ecx],   xmm0
-	movdqa      [edx+2*ecx], xmm0
-	lea         edx,         [edx+2*ecx]
+	movdqa      [r4+r1],   xmm0
+	movdqa      [r4+2*r1], xmm0
+	lea         r4,         [r4+2*r1]
 
-	movdqa      [edx+ecx],   xmm0
-	movdqa      [edx+2*ecx], xmm0
-	lea         edx,         [edx+2*ecx]
+	movdqa      [r4+r1],   xmm0
+	movdqa      [r4+2*r1], xmm0
+	lea         r4,         [r4+2*r1]
 
-	movdqa      [edx+ecx],   xmm0
-	movdqa      [edx+2*ecx], xmm0
-	lea         edx,         [edx+2*ecx]
+	movdqa      [r4+r1],   xmm0
+	movdqa      [r4+2*r1], xmm0
+	lea         r4,         [r4+2*r1]
 
-	movdqa      [edx+ecx],   xmm0
-	movdqa      [edx+2*ecx], xmm0
-	lea         edx,         [edx+2*ecx]
+	movdqa      [r4+r1],   xmm0
+	movdqa      [r4+2*r1], xmm0
+	lea         r4,         [r4+2*r1]
 
-	movdqa      [edx+ecx],   xmm0
-	movdqa      [edx+2*ecx], xmm0
-	lea         edx,         [edx+2*ecx]
+	movdqa      [r4+r1],   xmm0
+	movdqa      [r4+2*r1], xmm0
+	lea         r4,         [r4+2*r1]
 
-	movdqa      [edx+ecx],   xmm0
+	movdqa      [r4+r1],   xmm0		; 16th and last row
 
-	pop         ebx
+	; restore in reverse order of the pushes at the top
+	pop r4
+	pop r3
 
 	ret
 
@@ -1257,68 +1351,78 @@
 ;*******************************************************************************
 WELS_EXTERN WelsI16x16LumaPredDcTop_sse2
 WelsI16x16LumaPredDcTop_sse2:
-	push ebx
-
-	%define PUSH_SIZE 4
-
-	mov eax, [esp+PUSH_SIZE+4]	; pPred
-	mov ebx, [esp+PUSH_SIZE+8]	; kiStride
-
-	mov ecx, ebx
-	neg ecx
-	movdqa xmm0, [eax+ecx]		; pPred-kiStride, top line
+	; r0 = pPred    (replaces eax)
+	; r1 = kiStride (replaces ebx)
+	; r2 = scratch  (replaces ecx/edx): top-row pointer, then the sum, then 3*kiStride
+	; ebx is no longer used, so the old push/pop pair and PUSH_SIZE offset are gone
+	%assign push_num 0			; no registers are pushed before LOAD_2_PARA
+	LOAD_2_PARA
+	%ifndef X86_32
+	movsx r1, r1d				; widen the 32-bit kiStride to the full register
+	%endif
+	mov r2, r0
+	sub r2, r1
+	movdqa xmm0, [r2]		; pPred-kiStride, top line
 	pxor xmm7, xmm7
+	psadbw xmm0, xmm7			; two partial sums, one per 8-byte half
 	movdqa xmm1, xmm0
-	punpcklbw xmm0, xmm7
-	punpckhbw xmm1, xmm7
+	psrldq xmm1, 8				; bring the upper partial sum down
+	paddw  xmm0, xmm1			; low word = sum of all 16 top pixels
+	xor r2, r2					; NOTE(review): looks redundant, the movd below overwrites r2d
+	movd r2d, xmm0
+	;movdqa xmm1, xmm0
+	;punpcklbw xmm0, xmm7
+	;punpckhbw xmm1, xmm7
 
-	paddw xmm0, xmm1			; (ub.max(ff) << 4) will not excceed of uw, so can perform it in unit of unsigned word scope
-	pshufd xmm1, xmm0, 04eh		; 01001110, w3w2w1w0,w7w6w5w4
-	paddw xmm0, xmm1			; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4
-	pshufd xmm1, xmm0, 0b1h		; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
-	paddw xmm0, xmm1			; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
-	pshuflw xmm1, xmm0, 0b1h	; 10110001
-	paddw xmm0, xmm1			; sum in word unit (x8)
-	movd edx, xmm0
-	and edx, 0ffffh
+	;paddw xmm0, xmm1			; (ub.max(ff) << 4) will not excceed of uw, so can perform it in unit of unsigned word scope
+	;pshufd xmm1, xmm0, 04eh		; 01001110, w3w2w1w0,w7w6w5w4
+	;paddw xmm0, xmm1			; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4
+	;pshufd xmm1, xmm0, 0b1h		; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
+	;paddw xmm0, xmm1			; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
+	;pshuflw xmm1, xmm0, 0b1h	; 10110001
+	;paddw xmm0, xmm1			; sum in word unit (x8)
+	;xor r3, r3
+	;movd r3d, xmm0
+	;and edx, 0ffffh
 
-	add edx, 08h
-	sar edx, 04h
-	mov dh, dl
-	mov ecx, edx
-	shl ecx, 010h
-	or edx, ecx
-	movd xmm1, edx
-	pshufd xmm0, xmm1, 00h
-	movdqa xmm1, xmm0
+	add r2, 8
+	sar r2, 4					; dc = (sum + 8) >> 4
+	SSE2_Copy16Times xmm1, r2d	; presumably broadcasts dc to all 16 bytes (macro defined elsewhere)
+	;mov dh, dl
+	;mov r2, edx
+	;shl r2, 010h
+	;or edx, r2
+	;movd xmm1, edx
+	;pshufd xmm0, xmm1, 00h
+	;movdqa xmm1, xmm0
+	movdqa xmm0, xmm1			; both registers hold the DC row; the stores alternate between them
+	lea r2, [2*r1+r1]		; 3*kiStride
 
-	lea ecx, [2*ebx+ebx]		; 3*kiStride
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm0
+	movdqa [r0+r2], xmm1
 
-	movdqa [eax], xmm0
-	movdqa [eax+ebx], xmm1
-	movdqa [eax+2*ebx], xmm0
-	movdqa [eax+ecx], xmm1
+	lea r0, [r0+4*r1]
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm0
+	movdqa [r0+r2], xmm1
 
-	lea eax, [eax+4*ebx]
-	movdqa [eax], xmm0
-	movdqa [eax+ebx], xmm1
-	movdqa [eax+2*ebx], xmm0
-	movdqa [eax+ecx], xmm1
+	lea r0, [r0+4*r1]
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm0
+	movdqa [r0+r2], xmm1
 
-	lea eax, [eax+4*ebx]
-	movdqa [eax], xmm0
-	movdqa [eax+ebx], xmm1
-	movdqa [eax+2*ebx], xmm0
-	movdqa [eax+ecx], xmm1
+	lea r0, [r0+4*r1]
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm0
+	movdqa [r0+r2], xmm1
 
-	lea eax, [eax+4*ebx]
-	movdqa [eax], xmm0
-	movdqa [eax+ebx], xmm1
-	movdqa [eax+2*ebx], xmm0
-	movdqa [eax+ecx], xmm1
-
-	%undef PUSH_SIZE
-	pop ebx
+	;%undef PUSH_SIZE
+	;pop ebx
 	ret
 
 ALIGN 16
@@ -1327,40 +1431,44 @@
 ;*******************************************************************************
 WELS_EXTERN WelsI16x16LumaPredDcNA_sse2
 WelsI16x16LumaPredDcNA_sse2:
-	push ebx
+	; ebx is no longer used, so it is not pushed
 
-	%define PUSH_SIZE	4
+	; r0 = pPred (replaces eax), r1 = kiStride (replaces ebx), r2 = 3*kiStride (replaces ecx)
 
-	mov eax, [esp+PUSH_SIZE+4]	; pPred
-	mov ebx, [esp+PUSH_SIZE+8]	; kiStride
+	; the two stack loads via PUSH_SIZE are replaced by
+	; the LOAD_2_PARA invocation below
+	%assign push_num 0			; no registers are pushed before LOAD_2_PARA
+	LOAD_2_PARA
+	%ifndef X86_32
+	movsx r1, r1d				; widen the 32-bit kiStride to the full register
+	%endif
+	lea r2, [2*r1+r1]		; 3*kiStride
 
-	lea ecx, [2*ebx+ebx]		; 3*kiStride
-
 	movdqa xmm0, [sse2_dc_0x80]
 	movdqa xmm1, xmm0
-	movdqa [eax], xmm0
-	movdqa [eax+ebx], xmm1
-	movdqa [eax+2*ebx], xmm0
-	movdqa [eax+ecx], xmm1
-	lea eax, [eax+4*ebx]
-	movdqa [eax], xmm0
-	movdqa [eax+ebx], xmm1
-	movdqa [eax+2*ebx], xmm0
-	movdqa [eax+ecx], xmm1
-	lea eax, [eax+4*ebx]
-	movdqa [eax], xmm0
-	movdqa [eax+ebx], xmm1
-	movdqa [eax+2*ebx], xmm0
-	movdqa [eax+ecx], xmm1
-	lea eax, [eax+4*ebx]
-	movdqa [eax], xmm0
-	movdqa [eax+ebx], xmm1
-	movdqa [eax+2*ebx], xmm0
-	movdqa [eax+ecx], xmm1
+	movdqa [r0], xmm0			; rows 0-3
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm0
+	movdqa [r0+r2], xmm1
+	lea r0, [r0+4*r1]			; rows 4-7
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm0
+	movdqa [r0+r2], xmm1
+	lea r0, [r0+4*r1]			; rows 8-11
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm0
+	movdqa [r0+r2], xmm1
+	lea r0, [r0+4*r1]			; rows 12-15
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm0
+	movdqa [r0+r2], xmm1
 
-	%undef PUSH_SIZE
+	; PUSH_SIZE is no longer defined, so there is nothing to undefine
 
-	pop ebx
+	; and no register to pop
 	ret
 
 ALIGN 16
@@ -1369,58 +1477,80 @@
 ;*******************************************************************************
 WELS_EXTERN WelsIChromaPredDcLeft_mmx
 WelsIChromaPredDcLeft_mmx:
-	push ebx
-	push esi
-	%define PUSH_SIZE 8
-	mov esi, [esp+PUSH_SIZE+4]	; pPred
-	mov ecx, [esp+PUSH_SIZE+8]	; kiStride
-	mov eax, esi
+	; Register mapping relative to the previous 32-bit code:
+	;   r0 = pointer walking the left column (replaces eax)
+	;   r1 = kiStride                        (replaces ecx)
+	;   r2 = running sum / 3*kiStride        (replaces ebx)
+	;   r3 = one neighbour byte at a time    (replaces edx)
+	;   r4 = pPred for the stores            (replaces esi)
+	push r3
+	push r4
+	%assign push_num 2			; two registers were pushed above
+	LOAD_2_PARA
+	%ifndef X86_32
+	movsx r1, r1d				; widen the 32-bit kiStride to the full register
+	%endif
+	mov r4, r0					; keep pPred; r0 is advanced while reading neighbours
 	; for left
-	dec eax
-	xor ebx, ebx
-	xor edx, edx
-	mov bl, [eax]
-	mov dl, [eax+ecx]
-	add ebx, edx
-	lea eax, [eax+2*ecx]
-	mov dl, [eax]
-	add ebx, edx
-	mov dl, [eax+ecx]
-	add ebx, edx
-	add ebx, 02h
-	sar ebx, 02h
-	mov bh, bl
-	movd mm1, ebx
-	pshufw mm0, mm1, 00h	; up64
+	dec r0						; r0 -> left neighbour of row 0
+	xor r2, r2					; NOTE(review): looks redundant, the movzx below overwrites r2
+	xor r3, r3					; NOTE(review): looks redundant, the movzx below overwrites r3
+	movzx r2, byte [r0]
+	movzx r3, byte [r0+r1]
+	add r2, r3
+	lea r0, [r0+2*r1]
+	movzx r3, byte [r0]
+	add r2, r3
+	movzx r3, byte [r0+r1]
+	add r2, r3
+	add r2, 02h
+	sar r2, 02h					; dc = (sum of 4 left neighbours + 2) >> 2
+	;SSE2_Copy16Times mm0, r2d
+	mov r3, r2
+	sal r3, 8
+	or r2, r3					; low word = dc | dc << 8 (replaces mov bh, bl)
+	movd mm1, r2d
+	pshufw mm0, mm1, 00h		; up64: broadcast that word to all four words
+	;mov bh, bl
+	;movd mm1, ebx
+	;pshufw mm0, mm1, 00h	; up64
 	movq mm1, mm0
-	xor ebx, ebx
-	lea eax, [eax+2*ecx]
-	mov bl, [eax]
-	mov dl, [eax+ecx]
-	add ebx, edx
-	lea eax, [eax+2*ecx]
-	mov dl, [eax]
-	add ebx, edx
-	mov dl, [eax+ecx]
-	add ebx, edx
-	add ebx, 02h
-	sar ebx, 02h
-	mov bh, bl
-	movd mm3, ebx
-	pshufw mm2, mm3, 00h	; down64
+	xor r2, r2					; NOTE(review): looks redundant, the movzx below overwrites r2
+	lea r0, [r0+2*r1]
+	movzx r2, byte [r0]
+	movzx r3, byte [r0+r1]
+	add r2, r3
+	lea r0, [r0+2*r1]
+	movzx r3, byte [r0]
+	add r2, r3
+	movzx r3, byte [r0+r1]
+	add r2, r3
+	add r2, 02h
+	sar r2, 02h					; dc = (sum of the next 4 left neighbours + 2) >> 2
+	mov r3, r2
+	sal r3, 8
+	or r2, r3					; low word = dc | dc << 8 (replaces mov bh, bl)
+	movd mm3, r2d
+	pshufw mm2, mm3, 00h		; down64: broadcast that word to all four words
+	;mov bh, bl
+	;movd mm3, ebx
+	;pshufw mm2, mm3, 00h	; down64
+	;SSE2_Copy16Times mm2, r2d
 	movq mm3, mm2
-	lea ebx, [2*ecx+ecx]
-	movq [esi], mm0
-	movq [esi+ecx], mm1
-	movq [esi+2*ecx], mm0
-	movq [esi+ebx], mm1
-	lea esi, [esi+4*ecx]
-	movq [esi], mm2
-	movq [esi+ecx], mm3
-	movq [esi+2*ecx], mm2
-	movq [esi+ebx], mm3
-	pop esi
-	pop ebx
+	lea r2, [2*r1+r1]			; 3*kiStride
+	movq [r4], mm0				; rows 0-3 receive the upper DC
+	movq [r4+r1], mm1
+	movq [r4+2*r1], mm0
+	movq [r4+r2], mm1
+	lea r4, [r4+4*r1]
+	movq [r4], mm2				; rows 4-7 receive the lower DC
+	movq [r4+r1], mm3
+	movq [r4+2*r1], mm2
+	movq [r4+r2], mm3
+	; restore in reverse order of the pushes at the top
+	; (r4 first, then r3)
+	pop r4
+	pop r3
 	emms
 	ret
 
@@ -1430,13 +1560,20 @@
 ;*******************************************************************************
 WELS_EXTERN WelsIChromaPredDcTop_sse2
 WelsIChromaPredDcTop_sse2:
-	push ebx
-	%define PUSH_SIZE 4
-	mov eax, [esp+PUSH_SIZE+4]	; pPred
-	mov ecx, [esp+PUSH_SIZE+8]	; kiStride
-	mov ebx, ecx
-	neg ebx
-	movq xmm0, [eax+ebx]		; top: 8x1 pixels
+	; Ported from the 32-bit-only version to the r0/r1/r2 register names.
+	; Old mapping, for reference: eax = pPred, ecx = kiStride, ebx = -kiStride.
+	; ebx is no longer used, so nothing is pushed (push_num is set to 0).
+	; LOAD_2_PARA is a macro defined outside this part of the file; it is
+	; presumably what places pPred in r0 and kiStride in r1 -- TODO confirm
+	; against its definition.
+	%assign push_num 0
+	LOAD_2_PARA
+	%ifndef X86_32
+	movsx r1, r1d		; non-X86_32 builds: sign-extend the 32-bit value in r1d to all of r1
+	%endif
+	mov r2, r0
+	sub r2, r1		; r2 = r0 - r1 (replaces the old [eax+ebx] with ebx = -stride)
+	movq xmm0, [r2]		; top: 8x1 pixels
 	pxor xmm7, xmm7
 	punpcklbw xmm0, xmm7		; ext 8x2 words
 	pshufd xmm1, xmm0, 0B1h		; 10110001 B, w5 w4 w7 w6 w1 w0 w3 w2
@@ -1452,21 +1589,20 @@
 	paddw xmm0, xmm6
 	psraw xmm0, 02h
 	packuswb xmm0, xmm7
-	lea ebx, [2*ecx+ecx]
-	movq [eax], xmm0
-	movq [eax+ecx], xmm0
-	movq [eax+2*ecx], xmm0
-	movq [eax+ebx], xmm0
-	lea eax, [eax+4*ecx]
-	movq [eax], xmm0
-	movq [eax+ecx], xmm0
-	movq [eax+2*ecx], xmm0
-	movq [eax+ebx], xmm0
-	%undef PUSH_SIZE
-	pop ebx
+	lea r2, [2*r1+r1]		; r2 = 3 * r1, offset of the fourth row
+	movq [r0], xmm0			; row 0
+	movq [r0+r1], xmm0		; row 1
+	movq [r0+2*r1], xmm0		; row 2
+	movq [r0+r2], xmm0		; row 3
+	lea r0, [r0+4*r1]		; advance r0 by four rows
+	movq [r0], xmm0			; row 4
+	movq [r0+r1], xmm0		; row 5
+	movq [r0+2*r1], xmm0		; row 6
+	movq [r0+r2], xmm0		; row 7
+	; Nothing to restore: this version pushes no registers,
+	; so the old PUSH_SIZE bookkeeping and the pop are gone.
 	ret
 
-
 ALIGN 16
 ;*******************************************************************************
 ;	void_t WelsIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -1473,26 +1609,29 @@
 ;*******************************************************************************
 WELS_EXTERN WelsIChromaPredDcNA_mmx
 WelsIChromaPredDcNA_mmx:
-	push ebx
-	%define PUSH_SIZE 4
-	mov eax, [esp+PUSH_SIZE+4]	; pPred
-	mov ebx, [esp+PUSH_SIZE+8]	; kiStride
-	lea ecx, [2*ebx+ebx]
+	; Stores the 8 bytes loaded from sse2_dc_0x80 to 8 consecutive rows.
+	; Old 32-bit mapping, for reference: eax = pPred, ebx = kiStride.
+	; LOAD_2_PARA is defined elsewhere; presumably it puts pPred in r0 and
+	; kiStride in r1 -- TODO confirm against the macro definition.
+	%assign push_num 0
+	LOAD_2_PARA
+	%ifndef X86_32
+	movsx r1, r1d		; non-X86_32 builds: sign-extend the 32-bit value in r1d to all of r1
+	%endif
+	lea r2, [2*r1+r1]		; r2 = 3 * r1, offset of the fourth row
 	movq mm0, [sse2_dc_0x80]
 	movq mm1, mm0
-	movq [eax], mm0
-	movq [eax+ebx], mm1
-	movq [eax+2*ebx], mm0
-	movq [eax+ecx], mm1
-	lea eax, [eax+4*ebx]
-	movq [eax], mm0
-	movq [eax+ebx], mm1
-	movq [eax+2*ebx], mm0
-	movq [eax+ecx], mm1
-	%undef PUSH_SIZE
-	pop ebx
+	movq [r0], mm0			; row 0
+	movq [r0+r1], mm1		; row 1 (mm1 is a copy of mm0)
+	movq [r0+2*r1], mm0		; row 2
+	movq [r0+r2], mm1		; row 3
+	lea r0, [r0+4*r1]		; advance r0 by four rows
+	movq [r0], mm0			; row 4
+	movq [r0+r1], mm1		; row 5
+	movq [r0+2*r1], mm0		; row 6
+	movq [r0+r2], mm1		; row 7
+	; Nothing to restore: no registers are pushed in this version;
+	; the emms below empties the MMX state before returning.
 	emms
 	ret
-
-
 
--- a/codec/decoder/core/asm/mb_copy.asm
+++ /dev/null
@@ -1,330 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mb_copy.asm
-;*
-;*  Abstract
-;*      mb_copy and mb_copy1
-;*
-;*  History
-;*      15/09/2009 Created
-;*		12/28/2009 Modified with larger throughput
-;*		12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
-;*				   WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
-;*
-;*
-;*********************************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-;*******************************************************************************
-; Local Data (Read Only)
-;*******************************************************************************
-
-;SECTION .rodata data align=16
-
-;*******************************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;*******************************************************************************
-
-ALIGN 16
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-WELS_EXTERN PixelAvgWidthEq4_mmx
-WELS_EXTERN PixelAvgWidthEq8_mmx
-WELS_EXTERN PixelAvgWidthEq16_sse2
-
-WELS_EXTERN McCopyWidthEq4_mmx
-WELS_EXTERN McCopyWidthEq8_mmx
-WELS_EXTERN McCopyWidthEq16_sse2
-
-
-ALIGN 16
-;*******************************************************************************
-; void_t PixelAvgWidthEq4_mmx( uint8_t *pDst,  int iDstStride,
-;                           uint8_t *pSrcA, int iSrcAStride,
-;                           uint8_t *pSrcB, int iSrcBStride,
-;                           int iHeight );
-;*******************************************************************************
-PixelAvgWidthEq4_mmx:
-
-    push        esi
-    push        edi
-    push        ebp
-    push        ebx
-
-    mov         edi, [esp+20]       ; pDst
-    mov         eax, [esp+24]       ; iDstStride
-    mov         esi, [esp+28]       ; pSrcA
-    mov         ecx, [esp+32]       ; iSrcAStride
-    mov         ebp, [esp+36]       ; pSrcB
-    mov         edx, [esp+40]       ; iSrcBStride
-    mov         ebx, [esp+44]       ; iHeight
-ALIGN 4
-.height_loop:
-	movd        mm0, [ebp]
-    pavgb       mm0, [esi]
-    movd        [edi], mm0
-
-    dec         ebx
-    lea         edi, [edi+eax]
-    lea         esi, [esi+ecx]
-    lea         ebp, [ebp+edx]
-    jne         .height_loop
-
-	WELSEMMS
-    pop         ebx
-    pop         ebp
-    pop         edi
-    pop         esi
-    ret
-
-ALIGN 16
-;*******************************************************************************
-; void_t PixelAvgWidthEq8_mmx( uint8_t *pDst,  int iDstStride,
-;                           uint8_t *pSrcA, int iSrcAStride,
-;                           uint8_t *pSrcB, int iSrcBStride,
-;                           int iHeight );
-;*******************************************************************************
-PixelAvgWidthEq8_mmx:
-
-    push        esi
-    push        edi
-    push        ebp
-    push        ebx
-
-    mov         edi, [esp+20]       ; pDst
-    mov         eax, [esp+24]       ; iDstStride
-    mov         esi, [esp+28]       ; pSrcA
-    mov         ecx, [esp+32]       ; iSrcAStride
-    mov         ebp, [esp+36]       ; pSrcB
-    mov         edx, [esp+40]       ; iSrcBStride
-    mov         ebx, [esp+44]       ; iHeight
-ALIGN 4
-.height_loop:
-	movq        mm0, [esi]
-    pavgb       mm0, [ebp]
-    movq        [edi], mm0
-    movq        mm0, [esi+ecx]
-    pavgb       mm0, [ebp+edx]
-    movq		[edi+eax], mm0
-
-    lea			esi,  [esi+2*ecx]
-    lea			ebp, [ebp+2*edx]
-    lea			edi,  [edi+2*eax]
-
-    sub           ebx, 2
-    jnz         .height_loop
-
-	WELSEMMS
-    pop         ebx
-    pop         ebp
-    pop         edi
-    pop         esi
-    ret
-
-
-
-ALIGN 16
-;*******************************************************************************
-; void_t PixelAvgWidthEq16_sse2( uint8_t *pDst,  int iDstStride,
-;                          uint8_t *pSrcA, int iSrcAStride,
-;                          uint8_t *pSrcB, int iSrcBStride,
-;                          int iHeight );
-;*******************************************************************************
-PixelAvgWidthEq16_sse2:
-    push        esi
-    push        edi
-    push        ebp
-    push        ebx
-
-
-    mov         edi, [esp+20]       ; pDst
-    mov         eax, [esp+24]       ; iDstStride
-    mov         esi, [esp+28]       ; pSrcA
-    mov         ecx, [esp+32]       ; iSrcAStride
-    mov         ebp, [esp+36]       ; pSrcB
-    mov         edx, [esp+40]       ; iSrcBStride
-    mov         ebx, [esp+44]       ; iHeight
-ALIGN 4
-.height_loop:
-	movdqu      xmm0, [esi]
-	pavgb         xmm0, [ebp]
-    movdqu      [edi], xmm0
-
-	movdqu      xmm0, [esi+ecx]
-	pavgb         xmm0, [ebp+edx]
-    movdqu      [edi+eax], xmm0
-
-	movdqu      xmm0, [esi+2*ecx]
-	pavgb         xmm0, [ebp+2*edx]
-    movdqu      [edi+2*eax], xmm0
-
-    lea              esi,  [esi+2*ecx]
-    lea			   ebp, [ebp+2*edx]
-    lea			   edi,  [edi+2*eax]
-
-	movdqu      xmm0, [esi+ecx]
-	pavgb         xmm0, [ebp+edx]
-    movdqu      [edi+eax], xmm0
-
-    lea              esi,  [esi+2*ecx]
-    lea			   ebp, [ebp+2*edx]
-    lea			   edi,  [edi+2*eax]
-
-
-    sub         ebx, 4
-    jne         .height_loop
-
-	WELSEMMS
-	pop         ebx
-    pop         ebp
-    pop         edi
-    pop         esi
-
-    ret
-
-
-ALIGN 16
-;*******************************************************************************
-;  void_t McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
-;                          uint8_t *pDst, int iDstStride, int iHeight )
-;*******************************************************************************
-McCopyWidthEq4_mmx:
-    push    esi
-    push    edi
-    push    ebx
-
-
-    mov esi,  [esp+16]
-    mov eax, [esp+20]
-    mov edi,  [esp+24]
-    mov ecx,  [esp+28]
-    mov edx,  [esp+32]
-ALIGN 4
-.height_loop:
-	mov ebx, [esi]
-	mov [edi], ebx
-
-	add esi, eax
-	add edi, ecx
-	dec edx
-	jnz .height_loop
-	WELSEMMS
-	pop	   ebx
-    pop     edi
-    pop     esi
-    ret
-
-ALIGN 16
-;*******************************************************************************
-;   void_t McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
-;                           uint8_t *pDst, int iDstStride, int iHeight )
-;*******************************************************************************
-McCopyWidthEq8_mmx:
-    push  esi
-    push  edi
-	mov  esi, [esp+12]
-	mov eax, [esp+16]
-	mov edi, [esp+20]
-	mov ecx, [esp+24]
-	mov edx, [esp+28]
-
-ALIGN 4
-.height_loop:
-	movq mm0, [esi]
-	movq [edi], mm0
-	add esi, eax
-	add edi, ecx
-	dec edx
-	jnz .height_loop
-
-	WELSEMMS
-    pop     edi
-    pop     esi
-    ret
-
-
-
-
-
-
-
-
-ALIGN 16
-;*******************************************************************************
-;   void_t McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
-;*******************************************************************************
-;read unaligned memory
-%macro SSE_READ_UNA 2
-	movq	%1, [%2]
-	movhps	%1,	[%2+8]
-%endmacro
-
-;write unaligned memory
-%macro SSE_WRITE_UNA 2
-	movq	[%1],	%2
-	movhps	[%1+8], %2
-%endmacro
-McCopyWidthEq16_sse2:
-    push    esi
-    push    edi
-
-    mov     esi, [esp+12]       ; pSrc
-    mov     eax, [esp+16]       ; iSrcStride
-    mov     edi, [esp+20]       ; pDst
-    mov     edx, [esp+24]       ; iDstStride
-    mov     ecx, [esp+28]       ; iHeight
-
-ALIGN 4
-.height_loop:
-    SSE_READ_UNA	xmm0, esi
-    SSE_READ_UNA	xmm1, esi+eax
-    SSE_WRITE_UNA	edi, xmm0
-    SSE_WRITE_UNA	edi+edx, xmm1
-
-	sub		ecx,	2
-    lea     esi, [esi+eax*2]
-    lea     edi, [edi+edx*2]
-    jnz     .height_loop
-
-    pop     edi
-    pop     esi
-    ret
--- a/codec/decoder/core/asm/mc_chroma.asm
+++ /dev/null
@@ -1,317 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2004-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mc_chroma.asm
-;*
-;*  Abstract
-;*      mmx motion compensation for chroma
-;*
-;*  History
-;*      10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:
-	dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:
-	dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src,
-;							int32_t iSrcStride,
-;							uint8_t *pDst,
-;							int32_t iDstStride,
-;							uint8_t *pABCD,
-;							int32_t iHeigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:
-	push esi
-	push edi
-	push ebx
-
-	mov eax, [esp +12 + 20]
-	movd mm3, [eax]
-	WELS_Zero mm7
-	punpcklbw mm3, mm3
-	movq      mm4, mm3
-	punpcklwd mm3, mm3
-	punpckhwd mm4, mm4
-
-	movq	  mm5, mm3
-	punpcklbw mm3, mm7
-	punpckhbw mm5, mm7
-
-	movq	  mm6, mm4
-	punpcklbw mm4, mm7
-	punpckhbw mm6, mm7
-
-	mov esi, [esp +12+ 4]
-	mov eax, [esp + 12 + 8]
-	mov edi, [esp + 12 + 12]
-	mov edx, [esp + 12 + 16]
-    mov ecx, [esp + 12 + 24]
-
-	lea ebx, [esi + eax]
-	movd mm0, [esi]
-	movd mm1, [esi+1]
-	punpcklbw mm0, mm7
-	punpcklbw mm1, mm7
-.xloop:
-
-	pmullw mm0, mm3
-	pmullw mm1, mm5
-	paddw  mm0, mm1
-
-	movd  mm1, [ebx]
-	punpcklbw mm1, mm7
-	movq mm2, mm1
-	pmullw mm1, mm4
-	paddw mm0, mm1
-
-	movd mm1, [ebx+1]
-	punpcklbw mm1, mm7
-	movq mm7, mm1
-	pmullw mm1,mm6
-	paddw mm0, mm1
-	movq mm1,mm7
-
-	paddw mm0, [h264_d0x20_mmx]
-	psrlw mm0, 6
-
-	WELS_Zero mm7
-	packuswb mm0, mm7
-	movd [edi], mm0
-
-	movq mm0, mm2
-
-	lea edi, [edi +edx  ]
-	lea ebx, [ebx + eax]
-
-	dec ecx
-	jnz near .xloop
-	WELSEMMS
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc,
-;						int32_t iSrcStride,
-;						uint8_t *pDst,
-;						int32_t iDstStride,
-;						uint8_t *pABCD,
-;						int32_t iheigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:
-	push esi
-	push edi
-	push ebx
-
-	mov eax, [esp +12 + 20]
-	movd xmm3, [eax]
-	WELS_Zero xmm7
-	punpcklbw  xmm3, xmm3
-	punpcklwd  xmm3, xmm3
-
-	movdqa	   xmm4, xmm3
-	punpckldq  xmm3, xmm3
-	punpckhdq  xmm4, xmm4
-	movdqa     xmm5, xmm3
-	movdqa	   xmm6, xmm4
-
-	punpcklbw  xmm3, xmm7
-	punpckhbw  xmm5, xmm7
-	punpcklbw  xmm4, xmm7
-	punpckhbw  xmm6, xmm7
-
-	mov esi, [esp +12+ 4]
-	mov eax, [esp + 12 + 8]
-	mov edi, [esp + 12 + 12]
-	mov edx, [esp + 12 + 16]
-    mov ecx, [esp + 12 + 24]
-
-	lea ebx, [esi + eax]
-	movq xmm0, [esi]
-	movq xmm1, [esi+1]
-	punpcklbw xmm0, xmm7
-	punpcklbw xmm1, xmm7
-.xloop:
-
-	pmullw xmm0, xmm3
-	pmullw xmm1, xmm5
-	paddw  xmm0, xmm1
-
-	movq  xmm1, [ebx]
-	punpcklbw xmm1, xmm7
-	movdqa xmm2, xmm1
-	pmullw xmm1, xmm4
-	paddw xmm0, xmm1
-
-	movq xmm1, [ebx+1]
-	punpcklbw xmm1, xmm7
-	movdqa xmm7, xmm1
-	pmullw xmm1, xmm6
-	paddw xmm0, xmm1
-	movdqa xmm1,xmm7
-
-	paddw xmm0, [h264_d0x20_sse2]
-	psrlw xmm0, 6
-
-	WELS_Zero xmm7
-	packuswb xmm0, xmm7
-	movq [edi], xmm0
-
-	movdqa xmm0, xmm2
-
-	lea edi, [edi +edx  ]
-	lea ebx, [ebx + eax]
-
-	dec ecx
-	jnz near .xloop
-
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-
-
-ALIGN 16
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-;						 int32_t iSrcStride,
-;                        uint8_t *pDst,
-;                        int32_t iDstStride,
-;                        uint8_t *pABCD,
-;					     int32_t iHeigh);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:
-	push ebx
-	push esi
-	push edi
-
-	mov eax, [esp + 12 + 20]
-
-    pxor      xmm7, xmm7
-    movd   xmm5, [eax]
-    punpcklwd xmm5, xmm5
-    punpckldq xmm5, xmm5
-    movdqa    xmm6, xmm5
-    punpcklqdq xmm5, xmm5
-    punpckhqdq xmm6, xmm6
-
-	mov eax, [esp + 12 + 4]
-	mov edx, [esp + 12 + 8]
-	mov esi, [esp + 12 + 12]
-	mov edi, [esp + 12 + 16]
-    mov ecx, [esp + 12 + 24]
-
-    sub esi, edi
-    sub esi, edi
-	movdqa xmm7, [h264_d0x20_sse2]
-
-	movdqu xmm0, [eax]
-	movdqa xmm1, xmm0
-	psrldq xmm1, 1
-	punpcklbw xmm0, xmm1
-
-.hloop_chroma:
-	lea	esi, [esi+2*edi]
-
-	movdqu xmm2, [eax+edx]
-	movdqa xmm3, xmm2
-	psrldq xmm3, 1
-	punpcklbw xmm2, xmm3
-	movdqa      xmm4, xmm2
-
-    pmaddubsw  xmm0, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm0, xmm2
-    paddw      xmm0, xmm7
-	psrlw      xmm0, 6
-    packuswb   xmm0, xmm0
-    movq       [esi],xmm0
-
-    lea eax, [eax+2*edx]
-    movdqu xmm2, [eax]
-    movdqa xmm3, xmm2
-    psrldq xmm3, 1
-    punpcklbw xmm2, xmm3
-    movdqa      xmm0, xmm2
-
-    pmaddubsw  xmm4, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm4, xmm2
-    paddw      xmm4, xmm7
-	psrlw      xmm4, 6
-    packuswb   xmm4, xmm4
-    movq       [esi+edi],xmm4
-
-	sub ecx, 2
-	jnz .hloop_chroma
-	pop edi
-	pop esi
-	pop ebx
-
-	ret
-
-
--- a/codec/decoder/core/asm/mc_luma.asm
+++ /dev/null
@@ -1,615 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mc_luma.asm
-;*
-;*  Abstract
-;*      sse2 motion compensation
-;*
-;*  History
-;*      17/08/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;*******************************************************************************
-; Local Data (Read Only)
-;*******************************************************************************
-
-SECTION .rodata align=16
-
-;*******************************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;*******************************************************************************
-
-ALIGN 16
-h264_w0x10:
-	dw 16, 16, 16, 16
-
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-WELS_EXTERN McHorVer20WidthEq4_mmx
-
-
-ALIGN 16
-;*******************************************************************************
-; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc,
-;                       int iSrcStride,
-;						uint8_t *pDst,
-;						int iDstStride,
-;						int iHeight)
-;*******************************************************************************
-McHorVer20WidthEq4_mmx:
-	push esi
-	push edi
-
-	mov  esi, [esp+12]
-	mov eax, [esp+16]
-	mov edi, [esp+20]
-	mov ecx, [esp+24]
-	mov edx, [esp+28]
-	sub esi, 2
-	WELS_Zero mm7
-	movq mm6, [h264_w0x10]
-.height_loop:
-	movd mm0, [esi]
-	punpcklbw mm0, mm7
-	movd mm1, [esi+5]
-	punpcklbw mm1, mm7
-	movd mm2, [esi+1]
-	punpcklbw mm2, mm7
-	movd mm3, [esi+4]
-	punpcklbw mm3, mm7
-	movd mm4, [esi+2]
-	punpcklbw mm4, mm7
-	movd mm5, [esi+3]
-	punpcklbw mm5, mm7
-
-	paddw mm2, mm3
-	paddw mm4, mm5
-	psllw mm4, 2
-	psubw mm4, mm2
-	paddw mm0, mm1
-	paddw mm0, mm4
-	psllw mm4, 2
-	paddw mm0, mm4
-	paddw mm0, mm6
-	psraw mm0, 5
-	packuswb mm0, mm7
-	movd [edi], mm0
-
-	add esi, eax
-	add edi, ecx
-	dec edx
-	jnz .height_loop
-
-	WELSEMMS
-	pop edi
-	pop esi
-	ret
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-
-%macro SSE_LOAD_8P 3
-	movq %1, %3
-	punpcklbw %1, %2
-%endmacro
-
-%macro FILTER_HV_W8 9
-	paddw	%1, %6
-	movdqa	%8, %3
-	movdqa	%7, %2
-	paddw	%1, [h264_w0x10_1]
-	paddw	%8, %4
-	paddw	%7, %5
-	psllw	%8, 2
-	psubw	%8, %7
-	paddw	%1, %8
-	psllw	%8, 2
-	paddw	%1, %8
-	psraw   %1, 5
-	WELS_Zero %8
-	packuswb %1, %8
-	movq    %9, %1
-%endmacro
-
-;*******************************************************************************
-; Local Data (Read Only)
-;*******************************************************************************
-
-SECTION .rodata align=16
-
-;*******************************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;*******************************************************************************
-
-ALIGN 16
-h264_w0x10_1:
-	dw 16, 16, 16, 16, 16, 16, 16, 16
-ALIGN 16
-h264_mc_hc_32:
-dw 32, 32, 32, 32, 32, 32, 32, 32
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-WELS_EXTERN McHorVer22Width8HorFirst_sse2
-WELS_EXTERN McHorVer22VerLast_sse2
-WELS_EXTERN McHorVer02WidthEq8_sse2
-WELS_EXTERN McHorVer20WidthEq8_sse2
-WELS_EXTERN McHorVer20WidthEq16_sse2
-
-ALIGN 16
-;***********************************************************************
-; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc,
-;                       int16_t iSrcStride,
-;						uint8_t *pDst,
-;						int32_t iDstStride
-;						int32_t iHeight
-;                       )
-;***********************************************************************
-McHorVer22Width8HorFirst_sse2:
-	push esi
-	push edi
-	push ebx
-	mov esi, [esp+16]     ;pSrc
-	mov eax, [esp+20]	;iSrcStride
-	mov edi, [esp+24]		;pDst
-	mov edx, [esp+28]	;iDstStride
-	mov ebx, [esp+32]	;iHeight
-	pxor xmm7, xmm7
-
-	sub esi, eax				;;;;;;;;need more 5 lines.
-	sub esi, eax
-
-.yloop_width_8:
-	movq xmm0, [esi]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [esi+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [esi+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [esi+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [esi+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [esi+3]
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	movdqa [edi], xmm0
-
-	add esi, eax
-	add edi, edx
-	dec ebx
-	jnz .yloop_width_8
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-ALIGN 16
-;***********************************************************************
-;void_t McHorVer22VerLast_sse2(
-;											uint8_t *pSrc,
-;											int32_t pSrcStride,
-;											uint8_t * pDst,
-;											int32_t iDstStride,
-;											int32_t iWidth,
-;											int32_t iHeight);
-;***********************************************************************
-
-%macro FILTER_VER 9
-	paddw  %1, %6
-	movdqa %7, %2
-	movdqa %8, %3
-
-
-	paddw %7, %5
-	paddw %8, %4
-
-	psubw  %1, %7
-	psraw   %1, 2
-	paddw  %1, %8
-	psubw  %1, %7
-	psraw   %1, 2
-	paddw  %8, %1
-	paddw  %8, [h264_mc_hc_32]
-	psraw   %8, 6
-	packuswb %8, %8
-	movq %9, %8
-%endmacro
-
-McHorVer22VerLast_sse2:
-	push esi
-	push edi
-	push ebx
-	push ebp
-
-	mov esi, [esp+20]
-	mov eax, [esp+24]
-	mov edi, [esp+28]
-	mov edx, [esp+32]
-	mov ebx, [esp+36]
-	mov ecx, [esp+40]
-	shr ebx, 3
-
-.width_loop:
-	movdqa xmm0, [esi]
-	movdqa xmm1, [esi+eax]
-	lea esi, [esi+2*eax]
-	movdqa xmm2, [esi]
-	movdqa xmm3, [esi+eax]
-	lea esi, [esi+2*eax]
-	movdqa xmm4, [esi]
-	movdqa xmm5, [esi+eax]
-
-	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
-	dec ecx
-	lea esi, [esi+2*eax]
-	movdqa xmm6, [esi]
-
-	movdqa xmm0, xmm1
-	movdqa xmm1, xmm2
-	movdqa xmm2, xmm3
-	movdqa xmm3, xmm4
-	movdqa xmm4, xmm5
-	movdqa xmm5, xmm6
-
-	add edi, edx
-	sub esi, eax
-
-.start:
-	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea esi, [esi+2*eax]
-	movdqa xmm6, [esi]
-	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea edi, [edi+2*edx]
-	movdqa xmm7, [esi+eax]
-	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea esi, [esi+2*eax]
-	movdqa xmm0, [esi]
-	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea edi, [edi+2*edx]
-	movdqa xmm1, [esi+eax]
-	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea esi, [esi+2*eax]
-	movdqa xmm2, [esi]
-	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea edi, [edi+2*edx]
-	movdqa xmm3, [esi+eax]
-	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea esi, [esi+2*eax]
-	movdqa xmm4, [esi]
-	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea edi, [edi+2*edx]
-	movdqa xmm5, [esi+eax]
-	jmp near .start
-
-.x_loop_dec:
-	dec ebx
-	jz near .exit
-	mov esi, [esp+20]
-	mov edi, [esp+28]
-	mov ecx, [esp+40]
-	add esi, 16
-	add edi, 8
-	jmp .width_loop
-
-
-
-.exit:
-	pop ebp
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void_t McHorVer20WidthEq8_sse2(  uint8_t *pSrc,
-;                       int iSrcStride,
-;												uint8_t *pDst,
-;												int iDstStride,
-;												int iHeight,
-;                      );
-;*******************************************************************************
-McHorVer20WidthEq8_sse2:
-	push	esi
-	push	edi
-
-	mov esi, [esp + 12]         ;pSrc
-	mov eax, [esp + 16]         ;iSrcStride
-	mov edi, [esp + 20]         ;pDst
-	mov ecx, [esp + 28]         ;iHeight
-	mov edx, [esp + 24]			;iDstStride
-
-	lea esi, [esi-2]            ;pSrc -= 2;
-
-	pxor xmm7, xmm7
-	movdqa xmm6, [h264_w0x10_1]
-.y_loop:
-	movq xmm0, [esi]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [esi+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [esi+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [esi+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [esi+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [esi+3]
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, xmm6
-	psraw xmm0, 5
-
-	packuswb xmm0, xmm7
-	movq [edi], xmm0
-
-	lea edi, [edi+edx]
-	lea esi, [esi+eax]
-	dec ecx
-	jnz near .y_loop
-
-	pop edi
-	pop esi
-	ret
-
-ALIGN 16
-;*******************************************************************************
-; void_t McHorVer20WidthEq16_sse2(  uint8_t *pSrc,
-;                       int iSrcStride,
-;												uint8_t *pDst,
-;												int iDstStride,
-;												int iHeight,
-;                      );
-;*******************************************************************************
-McHorVer20WidthEq16_sse2:
-	push	esi
-	push	edi
-
-
-	mov esi, [esp + 12]         ;pSrc
-	mov eax, [esp + 16]         ;iSrcStride
-	mov edi, [esp + 20]         ;pDst
-	mov ecx, [esp + 28]         ;iHeight
-	mov edx, [esp + 24]			;iDstStride
-
-	lea esi, [esi-2]            ;pSrc -= 2;
-
-	pxor xmm7, xmm7
-	movdqa xmm6, [h264_w0x10_1]
-.y_loop:
-
-	movq xmm0, [esi]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [esi+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [esi+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [esi+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [esi+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [esi+3]
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, xmm6
-	psraw xmm0, 5
-	packuswb xmm0, xmm7
-	movq [edi], xmm0
-
-	movq xmm0, [esi+8]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [esi+5+8]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [esi+1+8]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [esi+4+8]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [esi+2+8]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [esi+3+8]
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, xmm6
-	psraw xmm0, 5
-	packuswb xmm0, xmm7
-	movq [edi+8], xmm0
-
-	lea edi, [edi+edx]
-	lea esi, [esi+eax]
-	dec ecx
-	jnz near .y_loop
-	pop edi
-	pop esi
-	ret
-
-
-;*******************************************************************************
-; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc,
-;                       int iSrcStride,
-;                       uint8_t *pDst,
-;                       int iDstStride,
-;                       int iHeight )
-;*******************************************************************************
-ALIGN 16
-McHorVer02WidthEq8_sse2:
-	push esi
-	push edi
-
-	mov esi, [esp + 12]           ;pSrc
-	mov edx, [esp + 16]	          ;iSrcStride
-	mov edi, [esp + 20]           ;pDst
-	mov eax, [esp + 24]           ;iDstStride
-	mov ecx, [esp + 28]           ;iHeight
-
-	sub esi, edx
-	sub esi, edx
-
-	WELS_Zero xmm7
-
-	SSE_LOAD_8P xmm0, xmm7, [esi]
-	SSE_LOAD_8P xmm1, xmm7, [esi+edx]
-	lea esi, [esi+2*edx]
-	SSE_LOAD_8P xmm2, xmm7, [esi]
-	SSE_LOAD_8P xmm3, xmm7, [esi+edx]
-	lea esi, [esi+2*edx]
-	SSE_LOAD_8P xmm4, xmm7, [esi]
-	SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-
-.start:
-	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
-	dec ecx
-	jz near .xx_exit
-
-	lea esi, [esi+2*edx]
-	SSE_LOAD_8P xmm6, xmm7, [esi]
-	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
-	dec ecx
-	jz near .xx_exit
-
-	lea edi, [edi+2*eax]
-	SSE_LOAD_8P xmm7, xmm0, [esi+edx]
-	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
-	dec ecx
-	jz near .xx_exit
-
-	lea esi, [esi+2*edx]
-	SSE_LOAD_8P xmm0, xmm1, [esi]
-	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [edi+eax]
-	dec ecx
-	jz near .xx_exit
-
-	lea edi, [edi+2*eax]
-	SSE_LOAD_8P xmm1, xmm2, [esi+edx]
-	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [edi]
-	dec ecx
-	jz near .xx_exit
-
-	lea esi, [esi+2*edx]
-	SSE_LOAD_8P xmm2, xmm3, [esi]
-	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [edi+eax]
-	dec ecx
-	jz near .xx_exit
-
-	lea edi, [edi+2*eax]
-	SSE_LOAD_8P xmm3, xmm4, [esi+edx]
-	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [edi]
-	dec ecx
-	jz near .xx_exit
-
-	lea esi, [esi+2*edx]
-	SSE_LOAD_8P xmm4, xmm5, [esi]
-	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [edi+eax]
-	dec ecx
-	jz near .xx_exit
-
-	lea edi, [edi+2*eax]
-	SSE_LOAD_8P xmm5, xmm6, [esi+edx]
-	jmp near .start
-
-.xx_exit:
-	pop edi
-	pop esi
-	ret
-
-
--- a/codec/decoder/core/asm/memzero.asm
+++ /dev/null
@@ -1,135 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  memzero.asm
-;*
-;*  Abstract
-;*
-;*
-;*  History
-;*      9/16/2009 Created
-;*
-;*
-;*************************************************************************/
-
-BITS 32
-
-%include "asm_inc.asm"
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-ALIGN 16
-;***********************************************************************
-;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
-;***********************************************************************
-WELS_EXTERN WelsPrefetchZero_mmx
-WelsPrefetchZero_mmx:
-	mov  eax,[esp+4]
-	prefetchnta [eax]
-	ret
-
-
-ALIGN 16
-;***********************************************************************
-;   void WelsSetMemZeroAligned64_sse2(void *dst, int32_t size)
-;***********************************************************************
-WELS_EXTERN WelsSetMemZeroAligned64_sse2
-WelsSetMemZeroAligned64_sse2:
-		mov		eax,	[esp + 4]          ; dst
-		mov		ecx,	[esp + 8]
-		neg		ecx
-
-		pxor	xmm0,		xmm0
-.memzeroa64_sse2_loops:
-		movdqa	[eax],		xmm0
-		movdqa	[eax+16],	xmm0
-		movdqa	[eax+32],	xmm0
-		movdqa	[eax+48],	xmm0
-		add		eax, 0x40
-
-		add ecx, 0x40
-		jnz near .memzeroa64_sse2_loops
-
-		ret
-
-ALIGN 16
-;***********************************************************************
-;   void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
-;***********************************************************************
-WELS_EXTERN WelsSetMemZeroSize64_mmx
-WelsSetMemZeroSize64_mmx:
-		mov		eax,	[esp + 4]          ; dst
-		mov		ecx,	[esp + 8]
-		neg		ecx
-
-		pxor	mm0,		mm0
-.memzero64_mmx_loops:
-		movq	[eax],		mm0
-		movq	[eax+8],	mm0
-		movq	[eax+16],	mm0
-		movq	[eax+24],	mm0
-		movq	[eax+32],	mm0
-		movq	[eax+40],	mm0
-		movq	[eax+48],	mm0
-		movq	[eax+56],	mm0
-		add		eax,		0x40
-
-		add ecx, 0x40
-		jnz near .memzero64_mmx_loops
-
-		WELSEMMS
-		ret
-
-ALIGN 16
-;***********************************************************************
-;   void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
-;***********************************************************************
-WELS_EXTERN WelsSetMemZeroSize8_mmx
-WelsSetMemZeroSize8_mmx:
-		mov		eax,	[esp + 4]		; dst
-		mov		ecx,	[esp + 8]		; size
-		neg		ecx
-		pxor	mm0,		mm0
-
-.memzero8_mmx_loops:
-		movq	[eax],		mm0
-		add		eax,		0x08
-
-		add		ecx,		0x08
-		jnz near .memzero8_mmx_loops
-
-		WELSEMMS
-		ret
-
-
--- a/codec/decoder/core/inc/mc.h
+++ b/codec/decoder/core/inc/mc.h
@@ -78,7 +78,7 @@
                                        int32_t iHeight);
 extern void_t McHorVer22Width8HorFirst_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
     int32_t iHeight);
-extern void_t McHorVer22VerLast_sse2 (uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
+extern void_t McHorVer22Width8VerLastAlign_sse2 (uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
                                       int32_t iWidth, int32_t iHeight);
 extern void_t PixelAvgWidthEq16_sse2 (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, int32_t iSrcAStride,
                                       uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
--- a/codec/decoder/core/src/mc.cpp
+++ b/codec/decoder/core/src/mc.cpp
@@ -362,7 +362,7 @@
     int32_t iHeight) {
   ENFORCE_STACK_ALIGN_2D (int16_t, iTap, 21, 8, 16)
   McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)iTap, 16, iHeight + 5);
-  McHorVer22VerLast_sse2 ((uint8_t*)iTap, 16, pDst, iDstStride, 8, iHeight);
+  McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)iTap, 16, pDst, iDstStride, 8, iHeight);
 }
 
 static inline void_t McHorVer02WidthEq16_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
--- a/codec/decoder/targets.mk
+++ b/codec/decoder/targets.mk
@@ -28,17 +28,9 @@
 DECODER_OBJS += $(DECODER_CPP_SRCS:.cpp=.o)
 ifeq ($(USE_ASM), Yes)
 DECODER_ASM_SRCS=\
-	$(DECODER_SRCDIR)/./core/asm/asm_inc.asm\
 	$(DECODER_SRCDIR)/./core/asm/block_add.asm\
-	$(DECODER_SRCDIR)/./core/asm/cpuid.asm\
 	$(DECODER_SRCDIR)/./core/asm/dct.asm\
-	$(DECODER_SRCDIR)/./core/asm/deblock.asm\
-	$(DECODER_SRCDIR)/./core/asm/expand_picture.asm\
 	$(DECODER_SRCDIR)/./core/asm/intra_pred.asm\
-	$(DECODER_SRCDIR)/./core/asm/mb_copy.asm\
-	$(DECODER_SRCDIR)/./core/asm/mc_chroma.asm\
-	$(DECODER_SRCDIR)/./core/asm/mc_luma.asm\
-	$(DECODER_SRCDIR)/./core/asm/memzero.asm\
 
 DECODER_OBJS += $(DECODER_ASM_SRCS:.asm=.o)
 endif
@@ -113,38 +105,14 @@
 $(DECODER_SRCDIR)/./plus/src/welsDecoderExt.o: $(DECODER_SRCDIR)/./plus/src/welsDecoderExt.cpp
 	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(DECODER_CFLAGS) $(DECODER_INCLUDES) -c -o $(DECODER_SRCDIR)/./plus/src/welsDecoderExt.o $(DECODER_SRCDIR)/./plus/src/welsDecoderExt.cpp
 
-$(DECODER_SRCDIR)/./core/asm/asm_inc.o: $(DECODER_SRCDIR)/./core/asm/asm_inc.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/asm_inc.o $(DECODER_SRCDIR)/./core/asm/asm_inc.asm
-
 $(DECODER_SRCDIR)/./core/asm/block_add.o: $(DECODER_SRCDIR)/./core/asm/block_add.asm
 	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/block_add.o $(DECODER_SRCDIR)/./core/asm/block_add.asm
 
-$(DECODER_SRCDIR)/./core/asm/cpuid.o: $(DECODER_SRCDIR)/./core/asm/cpuid.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/cpuid.o $(DECODER_SRCDIR)/./core/asm/cpuid.asm
-
 $(DECODER_SRCDIR)/./core/asm/dct.o: $(DECODER_SRCDIR)/./core/asm/dct.asm
 	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/dct.o $(DECODER_SRCDIR)/./core/asm/dct.asm
 
-$(DECODER_SRCDIR)/./core/asm/deblock.o: $(DECODER_SRCDIR)/./core/asm/deblock.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/deblock.o $(DECODER_SRCDIR)/./core/asm/deblock.asm
-
-$(DECODER_SRCDIR)/./core/asm/expand_picture.o: $(DECODER_SRCDIR)/./core/asm/expand_picture.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/expand_picture.o $(DECODER_SRCDIR)/./core/asm/expand_picture.asm
-
 $(DECODER_SRCDIR)/./core/asm/intra_pred.o: $(DECODER_SRCDIR)/./core/asm/intra_pred.asm
 	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/intra_pred.o $(DECODER_SRCDIR)/./core/asm/intra_pred.asm
-
-$(DECODER_SRCDIR)/./core/asm/mb_copy.o: $(DECODER_SRCDIR)/./core/asm/mb_copy.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/mb_copy.o $(DECODER_SRCDIR)/./core/asm/mb_copy.asm
-
-$(DECODER_SRCDIR)/./core/asm/mc_chroma.o: $(DECODER_SRCDIR)/./core/asm/mc_chroma.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/mc_chroma.o $(DECODER_SRCDIR)/./core/asm/mc_chroma.asm
-
-$(DECODER_SRCDIR)/./core/asm/mc_luma.o: $(DECODER_SRCDIR)/./core/asm/mc_luma.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/mc_luma.o $(DECODER_SRCDIR)/./core/asm/mc_luma.asm
-
-$(DECODER_SRCDIR)/./core/asm/memzero.o: $(DECODER_SRCDIR)/./core/asm/memzero.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/memzero.o $(DECODER_SRCDIR)/./core/asm/memzero.asm
 
 $(LIBPREFIX)decoder.$(LIBSUFFIX): $(DECODER_OBJS)
 	rm -f $(LIBPREFIX)decoder.$(LIBSUFFIX)
--- a/codec/encoder/core/asm/asm_inc.asm
+++ /dev/null
@@ -1,235 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  sse2inc.asm
-;*
-;*  Abstract
-;*      macro and constant
-;*
-;*  History
-;*      8/5/2009 Created
-;*
-;*
-;*************************************************************************/
-;***********************************************************************
-; Options, for DEBUG
-;***********************************************************************
-
-%if 1
-	%define MOVDQ movdqa
-%else
-	%define MOVDQ movdqu
-%endif
-
-%if 1
-	%define WELSEMMS	emms
-%else
-	%define WELSEMMS
-%endif
-
-BITS 32
-
-;***********************************************************************
-; Macros
-;***********************************************************************
-
-%macro WELS_EXTERN 1
-	%ifdef PREFIX
-		global _%1
-		%define %1 _%1
-	%else
-		global %1
-	%endif
-%endmacro
-
-%macro WELS_AbsW 2
-	pxor        %2, %2
-    psubw       %2, %1
-    pmaxsw      %1, %2
-%endmacro
-
-%macro MMX_XSwap  4
-    movq		%4, %2
-    punpckh%1   %4, %3
-    punpckl%1   %2, %3
-%endmacro
-
-; pOut mm1, mm4, mm5, mm3
-%macro MMX_Trans4x4W 5
-    MMX_XSwap wd, %1, %2, %5
-    MMX_XSwap wd, %3, %4, %2
-    MMX_XSwap dq, %1, %3, %4
-    MMX_XSwap dq, %5, %2, %3
-%endmacro
-
-;for TRANSPOSE
-%macro SSE2_XSawp 4
-    movdqa      %4, %2
-    punpckl%1   %2, %3
-    punpckh%1   %4, %3
-%endmacro
-
-; in: xmm1, xmm2, xmm3, xmm4  pOut:  xmm1, xmm4, xmm5, mm3
-%macro SSE2_Trans4x4D 5
-    SSE2_XSawp dq,  %1, %2, %5
-    SSE2_XSawp dq,  %3, %4, %2
-    SSE2_XSawp qdq, %1, %3, %4
-    SSE2_XSawp qdq, %5, %2, %3
-%endmacro
-
-;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4
-%macro SSE2_TransTwo4x4W 5
-    SSE2_XSawp wd,  %1, %2, %5
-    SSE2_XSawp wd,  %3, %4, %2
-    SSE2_XSawp dq,  %1, %3, %4
-    SSE2_XSawp dq,  %5, %2, %3
-    SSE2_XSawp qdq, %1, %5, %2
-    SSE2_XSawp qdq, %4, %3, %5
-%endmacro
-
-;in:  m1, m2, m3, m4, m5, m6, m7, m8
-;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-%macro SSE2_TransTwo8x8B 9
-	movdqa	%9,	%8
-	SSE2_XSawp bw,  %1, %2, %8
-	SSE2_XSawp bw,  %3, %4, %2
-	SSE2_XSawp bw,  %5, %6, %4
-	movdqa	%6, %9
-	movdqa	%9, %4
-	SSE2_XSawp bw,  %7, %6, %4
-
-	SSE2_XSawp wd,  %1, %3, %6
-	SSE2_XSawp wd,  %8, %2, %3
-	SSE2_XSawp wd,  %5, %7, %2
-	movdqa	%7, %9
-	movdqa	%9, %3
-	SSE2_XSawp wd,  %7, %4, %3
-
-	SSE2_XSawp dq,  %1, %5, %4
-	SSE2_XSawp dq,  %6, %2, %5
-	SSE2_XSawp dq,  %8, %7, %2
-	movdqa	%7, %9
-	movdqa	%9, %5
-	SSE2_XSawp dq,  %7, %3, %5
-
-	SSE2_XSawp qdq,  %1, %8, %3
-	SSE2_XSawp qdq,  %4, %2, %8
-	SSE2_XSawp qdq,  %6, %7, %2
-	movdqa	%7, %9
-	movdqa	%9, %1
-	SSE2_XSawp qdq,  %7, %5, %1
-	movdqa	%5, %9
-%endmacro
-
-;xmm0, xmm6, xmm7, [eax], [ecx]
-;xmm7 = 0, eax = pix1, ecx = pix2, xmm0 save the result
-%macro SSE2_LoadDiff8P 5
-    movq         %1, %4
-    punpcklbw    %1, %3
-    movq         %2, %5
-    punpcklbw    %2, %3
-    psubw        %1, %2
-%endmacro
-
-; m2 = m1 + m2, m1 = m1 - m2
-%macro SSE2_SumSub 3
-	movdqa  %3, %2
-    paddw   %2, %1
-    psubw   %1, %3
-%endmacro
-
-
-%macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
-	mov %3h, %3l
-	movd %1, e%3x		; i.e, 1% = eax (=b0)
-	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
-	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
-%endmacro
-
-;copy a dw into a xmm for 8 times
-%macro  SSE2_Copy8Times 2
-		movd	%1, %2
-		punpcklwd %1, %1
-		pshufd	%1,	%1,	0
-%endmacro
-
-;copy a db into a xmm for 16 times
-%macro  SSE2_Copy16Times 2
-		movd		%1, %2
-		pshuflw		%1, %1, 0
-		punpcklqdq	%1, %1
-		packuswb	%1,	%1
-%endmacro
-
-
-
-;***********************************************************************
-;preprocessor constants
-;***********************************************************************
-;dw 32,32,32,32,32,32,32,32 for xmm
-;dw 32,32,32,32 for mm
-%macro WELS_DW32 1
-	pcmpeqw %1,%1
-	psrlw %1,15
-	psllw %1,5
-%endmacro
-
-;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
-;dw 1, 1, 1, 1 for mm
-%macro WELS_DW1 1
-	pcmpeqw %1,%1
-	psrlw %1,15
-%endmacro
-
-;all 0 for xmm and mm
-%macro	WELS_Zero 1
-	pxor %1, %1
-%endmacro
-
-;dd 1, 1, 1, 1 for xmm
-;dd 1, 1 for mm
-%macro WELS_DD1 1
-	pcmpeqw %1,%1
-	psrld %1,31
-%endmacro
-
-;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
-%macro WELS_DB1 1
-	pcmpeqw %1,%1
-	psrlw %1,15
-	packuswb %1,%1
-%endmacro
-
-
-
-
-
-
--- a/codec/encoder/core/asm/coeff.asm
+++ b/codec/encoder/core/asm/coeff.asm
@@ -44,7 +44,7 @@
 
 
 
-
+%ifdef X86_32
 SECTION .rodata align=16
 
 align 16
@@ -456,4 +456,5 @@
 	pop esi
 	pop edi
 	pop ebx
-	ret
\ No newline at end of file
+	ret
+%endif
--- a/codec/encoder/core/asm/cpuid.asm
+++ /dev/null
@@ -1,169 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*	cpu_mmx.asm
-;*
-;*  Abstract
-;*		verify cpuid feature support and cpuid detection
-;*
-;*  History
-;*      04/29/2009	Created
-;*
-;*************************************************************************/
-
-bits 32
-
-;******************************************************************************************
-; Macros
-;******************************************************************************************
-
-%macro WELS_EXTERN 1
-	%ifdef PREFIX
-		global _%1
-		%define %1 _%1
-	%else
-		global %1
-	%endif
-%endmacro
-
-;******************************************************************************************
-; Code
-;******************************************************************************************
-
-SECTION .text
-
-; refer to "The IA-32 Intel(R) Architecture Software Developers Manual, Volume 2A A-M"
-; section CPUID - CPU Identification
-
-WELS_EXTERN WelsCPUIdVerify
-ALIGN 16
-;******************************************************************************************
-;   int32_t WelsCPUIdVerify()
-;******************************************************************************************
-WelsCPUIdVerify:
-    pushfd					; decrease the SP by 4 and load EFLAGS register onto stack, pushfd 32 bit and pushf for 16 bit
-	pushfd					; need push 2 EFLAGS, one for processing and the another one for storing purpose
-    pop     ecx				; get EFLAGS to bit manipulation
-    mov     eax, ecx		; store into ecx followed
-    xor     eax, 00200000h	; get ID flag (bit 21) of EFLAGS to directly indicate cpuid support or not
-	xor		eax, ecx		; get the ID flag bitwise, eax - 0: not support; otherwise: support
-    popfd					; store back EFLAGS and keep unchanged for system
-    ret
-
-WELS_EXTERN WelsCPUId
-ALIGN 16
-;****************************************************************************************************
-;   void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
-;****************************************************************************************************
-WelsCPUId:
-	push	ebx
-	push	edi
-
-	mov     eax, [esp+12]	; operating index
-    cpuid					; cpuid
-
-	; processing various information return
-	mov     edi, [esp+16]
-    mov     [edi], eax
-    mov     edi, [esp+20]
-    mov     [edi], ebx
-    mov     edi, [esp+24]
-    mov     [edi], ecx
-    mov     edi, [esp+28]
-    mov     [edi], edx
-
-	pop		edi
-    pop     ebx
-	ret
-
-WELS_EXTERN WelsCPUSupportAVX
-; need call after cpuid=1 and eax, ecx flag got then
-ALIGN 16
-;****************************************************************************************************
-;   int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx )
-;****************************************************************************************************
-WelsCPUSupportAVX:
-	mov eax, [esp+4]
-	mov ecx, [esp+8]
-
-	; refer to detection of AVX addressed in INTEL AVX manual document
-	and ecx, 018000000H
-	cmp ecx, 018000000H		; check both OSXSAVE and AVX feature flags
-	jne avx_not_supported
-	; processor supports AVX instructions and XGETBV is enabled by OS
-	mov ecx, 0				; specify 0 for XFEATURE_ENABLED_MASK register
-	XGETBV					; result in EDX:EAX
-	and eax, 06H
-	cmp eax, 06H			; check OS has enabled both XMM and YMM state support
-	jne avx_not_supported
-	mov eax, 1
-	ret
-avx_not_supported:
-	mov eax, 0
-	ret
-
-WELS_EXTERN WelsCPUSupportFMA
-; need call after cpuid=1 and eax, ecx flag got then
-ALIGN 16
-;****************************************************************************************************
-;   int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx )
-;****************************************************************************************************
-WelsCPUSupportFMA:
-	mov eax, [esp+4]
-	mov ecx, [esp+8]
-
-	; refer to detection of FMA addressed in INTEL AVX manual document
-	and ecx, 018001000H
-	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
-	jne fma_not_supported
-	; processor supports AVX,FMA instructions and XGETBV is enabled by OS
-	mov ecx, 0				; specify 0 for XFEATURE_ENABLED_MASK register
-	XGETBV					; result in EDX:EAX
-	and eax, 06H
-	cmp eax, 06H			; check OS has enabled both XMM and YMM state support
-	jne fma_not_supported
-	mov eax, 1
-	ret
-fma_not_supported:
-	mov eax, 0
-	ret
-
-WELS_EXTERN WelsEmms
-ALIGN 16
-;******************************************************************************************
-;   void WelsEmms()
-;******************************************************************************************
-WelsEmms:
-	emms	; empty mmx technology states
-	ret
-
-
-
--- a/codec/encoder/core/asm/dct.asm
+++ b/codec/encoder/core/asm/dct.asm
@@ -42,8 +42,6 @@
 
 %include "asm_inc.asm"
 
-BITS 32
-
 SECTION .rodata align=16
 
 ;***********************************************************************
@@ -131,7 +129,7 @@
     packuswb   %1, %2
     movd       %5, %1
 %endmacro
-
+SECTION .text
 ALIGN 16
 ;***********************************************************************
 ;   void __cdecl WelsDctT4_mmx( int16_t *pDct[4], uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
@@ -138,15 +136,20 @@
 ;***********************************************************************
 WELS_EXTERN WelsDctT4_mmx
 WelsDctT4_mmx:
-    push    ebx
-    mov     eax, [esp+12]   ; pix1
-    mov     ebx, [esp+16]   ; i_pix1
-    mov     ecx, [esp+20]   ; pix2
-    mov     edx, [esp+24]   ; i_pix2
-
+    ;push    ebx
+    ;mov     eax, [esp+12]   ; pix1
+    ;mov     ebx, [esp+16]   ; i_pix1
+    ;mov     ecx, [esp+20]   ; pix2
+    ;mov     edx, [esp+24]   ; i_pix2
+    %assign push_num 0
+    LOAD_5_PARA
+	%ifndef X86_32
+	movsx r2, r2d
+	movsx r4, r4d
+	%endif
     WELS_Zero    mm7
 
-    MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, eax, ebx, ecx, edx, mm0, mm7
+    MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7
 
     MMX_DCT			mm1, mm2, mm3 ,mm4, mm5, mm6
     MMX_Trans4x4W	mm3, mm1, mm4, mm5, mm2
@@ -154,14 +157,14 @@
     MMX_DCT			mm3, mm5, mm2 ,mm4, mm1, mm6
     MMX_Trans4x4W	mm2, mm3, mm4, mm1, mm5
 
-    mov     eax, [esp+ 8]   ; pDct
-    movq    [eax+ 0],   mm2
-    movq    [eax+ 8],   mm1
-    movq    [eax+16],   mm5
-    movq    [eax+24],   mm4
-
-	WELSEMMS
-    pop     ebx
+    ;mov     eax, [esp+ 8]   ; pDct
+    movq    [r0+ 0],   mm2
+    movq    [r0+ 8],   mm1
+    movq    [r0+16],   mm5
+    movq    [r0+24],   mm4
+    WELSEMMS
+    LOAD_5_PARA_POP
+    ;pop     ebx
     ret
 
 
@@ -170,24 +173,29 @@
 ;***********************************************************************
 WELS_EXTERN WelsIDctT4Rec_mmx
 WelsIDctT4Rec_mmx:
-	push   ebx
-%define	pushsize	4
-%define     p_dst       esp+pushsize+4
-%define     i_dst       esp+pushsize+8
-%define     p_pred      esp+pushsize+12
-%define     i_pred      esp+pushsize+16
-%define     pDct        esp+pushsize+20
+	;push   ebx
+;%define	pushsize	4
+;%define     p_dst       esp+pushsize+4
+;%define     i_dst       esp+pushsize+8
+;%define     p_pred      esp+pushsize+12
+;%define     i_pred      esp+pushsize+16
+;%define     pDct        esp+pushsize+20
+    %assign push_num 0
+    LOAD_5_PARA
+	%ifndef X86_32
+	movsx r1, r1d
+	movsx r3, r3d
+	%endif
+;	mov     eax, [pDct   ] 
+    movq    mm0, [r4+ 0]
+    movq    mm1, [r4+ 8]
+    movq    mm2, [r4+16]
+    movq    mm3, [r4+24]
+    ;mov     edx, [p_dst ] ; r0
+    ;mov     ecx, [i_dst ] ; r1
+    ;mov     eax, [p_pred] ; r2
+    ;mov     ebx, [i_pred] ; r3
 
-	mov     eax, [pDct   ]
-    movq    mm0, [eax+ 0]
-    movq    mm1, [eax+ 8]
-    movq    mm2, [eax+16]
-    movq    mm3, [eax+24]
-    mov     edx, [p_dst ]
-    mov     ecx, [i_dst ]
-    mov     eax, [p_pred]
-    mov     ebx, [i_pred]
-
 	MMX_Trans4x4W		mm0, mm1, mm2, mm3, mm4
 	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
     MMX_Trans4x4W		mm1, mm3, mm0, mm4, mm2
@@ -196,21 +204,22 @@
     WELS_Zero			mm7
     WELS_DW32			mm6
 
-    MMX_StoreDiff4P		mm3, mm0, mm6, mm7, [edx], [eax]
-    MMX_StoreDiff4P		mm4, mm0, mm6, mm7, [edx+ecx], [eax+ebx]
-    lea     edx, [edx+2*ecx]
-    lea     eax, [eax+2*ebx]
-    MMX_StoreDiff4P		mm1, mm0, mm6, mm7, [edx], [eax]
-    MMX_StoreDiff4P		mm2, mm0, mm6, mm7, [edx+ecx], [eax+ebx]
+    MMX_StoreDiff4P		mm3, mm0, mm6, mm7, [r0], [r2]
+    MMX_StoreDiff4P		mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
+    lea     r0, [r0+2*r1]
+    lea     r2, [r2+2*r3]
+    MMX_StoreDiff4P		mm1, mm0, mm6, mm7, [r0], [r2]
+    MMX_StoreDiff4P		mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]
 
 	WELSEMMS
-%undef	pushsize
-%undef  p_dst
-%undef  i_dst
-%undef  p_pred
-%undef  i_pred
-%undef  pDct
-    pop ebx
+    LOAD_5_PARA_POP
+;%undef	pushsize
+;%undef  p_dst
+;%undef  i_dst
+;%undef  p_pred
+;%undef  i_pred
+;%undef  pDct
+;    pop ebx
     ret
 
 
@@ -314,23 +323,27 @@
 WELS_EXTERN WelsDctFourT4_sse2
 ALIGN 16
 WelsDctFourT4_sse2:
-    push    ebx
-    push	esi
-    mov		esi, [esp+12]
-    mov     eax, [esp+16]   ; pix1
-    mov     ebx, [esp+20]   ; i_pix1
-    mov     ecx, [esp+24]   ; pix2
-    mov     edx, [esp+28]   ; i_pix2
-
+    ;push    ebx
+    ;push	esi
+    ;mov		esi, [esp+12]
+    ;mov     eax, [esp+16]   ; pix1
+    ;mov     ebx, [esp+20]   ; i_pix1
+    ;mov     ecx, [esp+24]   ; pix2
+    ;mov     edx, [esp+28]   ; i_pix2
+    %assign push_num 0
+    LOAD_5_PARA
+	%ifndef X86_32
+	movsx r2, r2d
+	movsx r4, r4d
+	%endif
     pxor    xmm7, xmm7
-
 	;Load 4x8
-	SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [eax      ], [ecx]
-    SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [eax+ebx  ], [ecx+edx]
-	lea		eax, [eax + 2 * ebx]
-	lea		ecx, [ecx + 2 * edx]
-	SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [eax], [ecx]
-    SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]
+	SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [r1], [r3]
+    SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [r1+r2], [r3+r4]
+	lea		r1, [r1 + 2 * r2]
+	lea		r3, [r3 + 2 * r4]
+	SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [r1], [r3]
+    SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
 
 	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
 	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1
@@ -337,18 +350,18 @@
 	SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
 	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
 
-	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
+	SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
 
-	lea		eax, [eax + 2 * ebx]
-	lea		ecx, [ecx + 2 * edx]
+	lea		r1, [r1 + 2 * r2]
+	lea		r3, [r3 + 2 * r4]
 
 	;Load 4x8
-	SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [eax      ], [ecx    ]
-    SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [eax+ebx  ], [ecx+edx]
-	lea		eax, [eax + 2 * ebx]
-	lea		ecx, [ecx + 2 * edx]
-    SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [eax], [ecx]
-    SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]
+	SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [r1      ], [r3    ]
+    SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [r1+r2  ], [r3+r4]
+	lea		r1, [r1 + 2 * r2]
+	lea		r3, [r3 + 2 * r4]
+    SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [r1], [r3]
+    SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
 
 	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
 	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1
@@ -355,19 +368,20 @@
     SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
 	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
 
-	lea		esi, [esi+64]
-	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
+	lea		r0, [r0+64]
+	SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
 
-    pop esi
-    pop ebx
+    ;pop esi
+    ;pop ebx
+	LOAD_5_PARA_POP
     ret
 
 
-%define		rec			esp + pushsize + 4
-%define		stride		esp + pushsize + 8
-%define		pred		esp + pushsize + 12
-%define		pred_stride	esp + pushsize + 16
-%define		rs			esp + pushsize + 20
+;%define		rec			esp + pushsize + 4
+;%define		stride		esp + pushsize + 8
+;%define		pred		esp + pushsize + 12
+;%define		pred_stride	esp + pushsize + 16
+;%define		rs			esp + pushsize + 20
 ;***********************************************************************
 ; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
 ;***********************************************************************
@@ -374,18 +388,23 @@
 WELS_EXTERN WelsIDctFourT4Rec_sse2
 ALIGN 16
 WelsIDctFourT4Rec_sse2:
-%define	pushsize	8
-    push		ebx
-    push		esi
+;%define	pushsize	8
+;    push		ebx
+;    push		esi
 
-    mov			eax,		[rec]
-    mov			ebx,		[stride]
-    mov			ecx,		[pred]
-    mov			edx,		[pred_stride]
-    mov			esi,		[rs]
-
+;    mov			eax,		[rec]
+;    mov			ebx,		[stride]
+;    mov			ecx,		[pred]
+;    mov			edx,		[pred_stride]
+;    mov			esi,		[rs]
+	%assign push_num 0
+	LOAD_5_PARA
+	%ifndef X86_32
+	movsx r1, r1d
+	movsx r3, r3d
+	%endif
 	;Load 4x8
-	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5
+	SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5
 
 	SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
   	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
@@ -395,17 +414,17 @@
 	WELS_Zero			xmm7
     WELS_DW32			xmm6
 
-	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [eax		],	[ecx]
-	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [eax + ebx	],	[ecx + edx]
-	lea		eax, [eax + 2 * ebx]
-	lea		ecx, [ecx + 2 * edx]
-	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [eax],			[ecx]
-	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [eax + ebx	],	[ecx + edx]
+	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [r0		],	[r2]
+	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [r0 + r1	],	[r2 + r3]
+	lea		r0, [r0 + 2 * r1]
+	lea		r2, [r2 + 2 * r3]
+	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [r0],			[r2]
+	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [r0 + r1	],	[r2 + r3]
 
-    add		esi, 64
-	lea		eax, [eax + 2 * ebx]
-	lea		ecx, [ecx + 2 * edx]
-   	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5
+    add		r4, 64
+	lea		r0, [r0 + 2 * r1]
+	lea		r2, [r2 + 2 * r3]
+   	SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5
 
 	SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
 	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
@@ -415,15 +434,15 @@
 	WELS_Zero			xmm7
     WELS_DW32			xmm6
 
-	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [eax		],	[ecx]
-	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [eax + ebx	],	[ecx + edx]
-	lea		eax, [eax + 2 * ebx]
-	lea		ecx, [ecx + 2 * edx]
-	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [eax],			[ecx]
-	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [eax + ebx],	[ecx + edx]
-
-    pop		esi
-    pop		ebx
+	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [r0		],	[r2]
+	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [r0 + r1	],	[r2 + r3]
+	lea		r0, [r0 + 2 * r1]
+	lea		r2, [r2 + 2 * r3]
+	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [r0],			[r2]
+	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [r0 + r1],	[r2 + r3]
+	LOAD_5_PARA_POP
+   ; pop		esi
+   ; pop		ebx
     ret
 
   %macro SSE2_StoreDiff4x8p 8
@@ -438,54 +457,60 @@
 ;***********************************************************************
 WELS_EXTERN WelsIDctRecI16x16Dc_sse2
 ALIGN 16
-%define		pushsize	8
-%define		luma_dc		esp + pushsize + 20
+;%define		pushsize	8
+;%define		luma_dc		esp + pushsize + 20
 WelsIDctRecI16x16Dc_sse2:
-    push		esi
-    push		edi
+	%assign push_num 0
+	LOAD_5_PARA	; presumably brings the five arguments into r0..r4 (macro defined outside this hunk); per the notes below: r0=rec, r1=stride, r2=pred, r3=pred_stride, r4=luma_dc
+	%ifndef X86_32
+	movsx r1, r1d	; non-32-bit builds: sign-extend the 32-bit stride to full register width before it is used in address arithmetic
+	movsx r3, r3d	; same sign extension for pred_stride
+	%endif
+   ; push		esi
+   ; push		edi
 
-	mov			ecx,		[luma_dc]
-    mov			eax,		[rec]
-    mov			edx,		[stride]
-    mov			esi,		[pred]
-    mov			edi,		[pred_stride]
+   ;mov			ecx,		[luma_dc] ; the luma_dc argument is now held in r4
+    ;mov			eax,		[rec] ; the rec argument is now held in r0
+    ;mov			edx,		[stride] ; the stride argument is now held in r1
+    ;mov			esi,		[pred]; the pred argument is now held in r2
+    ;mov			edi,		[pred_stride]; the pred_stride argument is now held in r3
 	pxor		xmm7,		xmm7
     WELS_DW32	xmm6
 
-	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [ecx]
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [r4]	; first 16 bytes at luma_dc -> xmm0..xmm3 (macro body not visible here; xmm6 was prepared by WELS_DW32 above)
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3	; operand order is rec, pred, stride, pred_stride; presumably writes pred + DC into rec (macro body not visible here)
 
-	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+	lea			r0,		[r0 + 2 * r1]	; rec += 2 * stride
+	lea			r2,		[r2 + 2 * r3]	; pred += 2 * pred_stride
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3	; second use of the xmm0/xmm1 pair
 
-	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]
-	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
+	lea			r0,		[r0 + 2 * r1]	; rec += 2 * stride
+	lea			r2,		[r2 + 2 * r3]	; pred += 2 * pred_stride
+	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3	; first use of the xmm2/xmm3 pair
 
-	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]
-	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
+	lea			r0,		[r0 + 2 * r1]	; rec += 2 * stride
+	lea			r2,		[r2 + 2 * r3]	; pred += 2 * pred_stride
+	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3	; second use of the xmm2/xmm3 pair
 
-	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [ecx + 16]
-	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]	; next 16 bytes at luma_dc, reloading xmm0..xmm3 for the remaining four store steps
+	lea			r0,		[r0 + 2 * r1]	; rec += 2 * stride
+	lea			r2,		[r2 + 2 * r3]	; pred += 2 * pred_stride
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3	; first use of the reloaded xmm0/xmm1 pair
 
-	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+	lea			r0,		[r0 + 2 * r1]	; rec += 2 * stride
+	lea			r2,		[r2 + 2 * r3]	; pred += 2 * pred_stride
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3	; second use of the reloaded xmm0/xmm1 pair
 
-	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]
-	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
+	lea			r0,		[r0 + 2 * r1]	; rec += 2 * stride
+	lea			r2,		[r2 + 2 * r3]	; pred += 2 * pred_stride
+	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3	; first use of the reloaded xmm2/xmm3 pair
 
-	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]
-	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-
-    pop		edi
-    pop		esi
+	lea			r0,		[r0 + 2 * r1]	; rec += 2 * stride
+	lea			r2,		[r2 + 2 * r3]	; pred += 2 * pred_stride
+	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3	; eighth and last store step; the pointers have advanced 14 strides in total
+	LOAD_5_PARA_POP	; counterpart of LOAD_5_PARA; presumably restores whatever that macro saved (definition not visible here)
+    ;pop		edi
+    ;pop		esi
     ret
 
 
@@ -503,17 +528,16 @@
 	movdqa	%4, %1
 	psubd	%4, %2
 %endmacro
-
  %macro		SSE2_Load4Col	5
-	movsx		edx,		WORD[%5]
- 	movd		%1,			edx
- 	movsx		edx,		WORD[%5 + 0x20]
- 	movd		%2,			edx
+	movsx		r2,		WORD[%5]	; macro purpose: gather the 16-bit words at %5+0, +0x20, +0x80, +0xa0 into %1 as four sign-extended dwords (low to high); %2..%4 are scratch and r2 is clobbered
+ 	movd		%1,			r2d	; low dword of %1 = word at %5 + 0
+ 	movsx		r2,		WORD[%5 + 0x20]	; sign-extend the word at %5 + 0x20
+ 	movd		%2,			r2d	; low dword of %2; interleaved into %1 by the punpckldq that follows
 	punpckldq	%1,			%2
-	movsx		edx,		WORD[%5 + 0x80]
- 	movd		%3,			edx
-	movsx		edx,		WORD[%5 + 0xa0]
- 	movd		%4,			edx
+	movsx		r2,		WORD[%5 + 0x80]	; sign-extend the word at %5 + 0x80
+ 	movd		%3,			r2d	; low dword of %3
+	movsx		r2,		WORD[%5 + 0xa0]	; sign-extend the word at %5 + 0xa0
+ 	movd		%4,			r2d	; low dword of %4; the two unpacks below merge %3/%4 into the upper half of %1
 	punpckldq	%3,			%4
 	punpcklqdq	%1,			%3
  %endmacro
@@ -523,14 +547,15 @@
 ;***********************************************************************
 WELS_EXTERN WelsHadamardT4Dc_sse2
 WelsHadamardT4Dc_sse2:
-		mov			eax,		[esp + 4]	; luma_dc
-		mov			ecx,		[esp + 8]	; pDct
+		;mov			eax,		[esp + 4]	; luma_dc (first argument; the new code addresses it through r0)
+		;mov			ecx,		[esp + 8]	; pDct (second argument; the new code addresses it through r1)
+		%assign push_num 0
+		LOAD_2_PARA	; presumably brings the two arguments into r0 (luma_dc) and r1 (pDct); macro defined outside this hunk
+		SSE2_Load4Col	    xmm1, xmm5, xmm6, xmm0, r1	; xmm1 = four sign-extended words gathered from pDct + 0; xmm5/xmm6/xmm0 are scratch and r2 is clobbered
+		SSE2_Load4Col	    xmm2, xmm5, xmm6, xmm0, r1 + 0x40	; xmm2 = same gather starting at pDct + 0x40
+		SSE2_Load4Col	    xmm3, xmm5, xmm6, xmm0, r1 + 0x100	; xmm3 = same gather starting at pDct + 0x100
+		SSE2_Load4Col	    xmm4, xmm5, xmm6, xmm0, r1 + 0x140	; xmm4 = same gather starting at pDct + 0x140
 
-		SSE2_Load4Col	    xmm1, xmm5, xmm6, xmm0, ecx
-		SSE2_Load4Col	    xmm2, xmm5, xmm6, xmm0, ecx + 0x40
-		SSE2_Load4Col	    xmm3, xmm5, xmm6, xmm0, ecx + 0x100
-		SSE2_Load4Col	    xmm4, xmm5, xmm6, xmm0, ecx + 0x140
-
 		SSE2_SumSubD		xmm1, xmm2, xmm7
 		SSE2_SumSubD		xmm3, xmm4, xmm7
 		SSE2_SumSubD		xmm2, xmm4, xmm7
@@ -548,9 +573,7 @@
 
 		packssdw	xmm3,	xmm4
 		packssdw	xmm2,	xmm1
-		movdqa	[eax+ 0],   xmm3
-		movdqa	[eax+16],   xmm2
+		movdqa	[r0+ 0],   xmm3	; store the first 8 packed words to luma_dc; movdqa requires a 16-byte aligned address
+		movdqa	[r0+16],   xmm2	; store the second 8 packed words at luma_dc + 16
 
 		ret
-
-
--- a/codec/encoder/core/asm/deblock.asm
+++ /dev/null
@@ -1,2113 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  deblock.asm
-;*
-;*  Abstract
-;*      edge loop
-;*
-;*  History
-;*      08/07/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-SECTION .text
-
-;********************************************************************************
-;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;                             int32_t iAlpha, int32_t iBeta)
-;********************************************************************************
-WELS_EXTERN   DeblockChromaEq4V_sse2
-
-ALIGN  16
-DeblockChromaEq4V_sse2:
-  push        ebp
-  mov         ebp,esp
-  and         esp,0FFFFFFF0h
-  sub         esp,68h
-  mov         edx,[ebp+10h]      ;  iStride
-  mov         eax,[ebp+8]        ;  pPixCb
-  mov         ecx,[ebp+0Ch]      ;  pPixCr
-  movq        xmm4,[ecx]
-  movq        xmm5,[edx+ecx]
-  push        esi
-  push        edi
-  lea         esi,[edx+edx]
-  mov         edi,eax
-  sub         edi,esi
-  movq        xmm1,[edi]
-  mov         edi,ecx
-  sub         edi,esi
-  movq        xmm2,[edi]
-  punpcklqdq  xmm1,xmm2
-  mov         esi,eax
-  sub         esi,edx
-  movq        xmm2,[esi]
-  mov         edi,ecx
-  sub         edi,edx
-  movq        xmm3,[edi]
-  punpcklqdq  xmm2,xmm3
-  movq        xmm3,[eax]
-  punpcklqdq  xmm3,xmm4
-  movq        xmm4,[edx+eax]
-  mov       edx, [ebp + 14h]
-  punpcklqdq  xmm4,xmm5
-  movd        xmm5,edx
-  mov       edx, [ebp + 18h]
-  pxor        xmm0,xmm0
-  movdqa      xmm6,xmm5
-  punpcklwd   xmm6,xmm5
-  pshufd      xmm5,xmm6,0
-  movd        xmm6,edx
-  movdqa      xmm7,xmm6
-  punpcklwd   xmm7,xmm6
-  pshufd      xmm6,xmm7,0
-  movdqa      xmm7,xmm1
-  punpckhbw   xmm1,xmm0
-  punpcklbw   xmm7,xmm0
-  movdqa      [esp+40h],xmm1
-  movdqa      [esp+60h],xmm7
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm7,xmm0
-  movdqa      [esp+10h],xmm7
-  movdqa      xmm7,xmm3
-  punpcklbw   xmm7,xmm0
-  punpckhbw   xmm3,xmm0
-  movdqa      [esp+50h],xmm7
-  movdqa      xmm7,xmm4
-  punpckhbw   xmm4,xmm0
-  punpckhbw   xmm2,xmm0
-  punpcklbw   xmm7,xmm0
-  movdqa      [esp+30h],xmm3
-  movdqa      xmm3,[esp+10h]
-  movdqa      xmm1,xmm3
-  psubw       xmm1,[esp+50h]
-  pabsw       xmm1,xmm1
-  movdqa      [esp+20h],xmm4
-  movdqa      xmm0,xmm5
-  pcmpgtw     xmm0,xmm1
-  movdqa      xmm1,[esp+60h]
-  psubw       xmm1,xmm3
-  pabsw       xmm1,xmm1
-  movdqa      xmm4,xmm6
-  pcmpgtw     xmm4,xmm1
-  pand        xmm0,xmm4
-  movdqa      xmm1,xmm7
-  psubw       xmm1,[esp+50h]
-  pabsw       xmm1,xmm1
-  movdqa      xmm4,xmm6
-  pcmpgtw     xmm4,xmm1
-  movdqa      xmm1,xmm2
-  psubw       xmm1,[esp+30h]
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm5,xmm1
-  movdqa      xmm1,[esp+40h]
-  pand        xmm0,xmm4
-  psubw       xmm1,xmm2
-  pabsw       xmm1,xmm1
-  movdqa      xmm4,xmm6
-  pcmpgtw     xmm4,xmm1
-  movdqa      xmm1,[esp+20h]
-  psubw       xmm1,[esp+30h]
-  pand        xmm5,xmm4
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm6,xmm1
-  pand        xmm5,xmm6
-  mov         edx,2
-  movsx       edx,dx
-  movd        xmm1,edx
-  movdqa      xmm4,xmm1
-  punpcklwd   xmm4,xmm1
-  pshufd      xmm1,xmm4,0
-  movdqa      xmm4,[esp+60h]
-  movdqa      xmm6,xmm4
-  paddw       xmm6,xmm4
-  paddw       xmm6,xmm3
-  paddw       xmm6,xmm7
-  movdqa      [esp+10h],xmm1
-  paddw       xmm6,[esp+10h]
-  psraw       xmm6,2
-  movdqa      xmm4,xmm0
-  pandn       xmm4,xmm3
-  movdqa      xmm3,[esp+40h]
-  movdqa      xmm1,xmm0
-  pand        xmm1,xmm6
-  por         xmm1,xmm4
-  movdqa      xmm6,xmm3
-  paddw       xmm6,xmm3
-  movdqa      xmm3,[esp+10h]
-  paddw       xmm6,xmm2
-  paddw       xmm6,[esp+20h]
-  paddw       xmm6,xmm3
-  psraw       xmm6,2
-  movdqa      xmm4,xmm5
-  pand        xmm4,xmm6
-  movdqa      xmm6,xmm5
-  pandn       xmm6,xmm2
-  por         xmm4,xmm6
-  packuswb    xmm1,xmm4
-  movdqa      xmm4,[esp+50h]
-  movdqa      xmm6,xmm7
-  paddw       xmm6,xmm7
-  paddw       xmm6,xmm4
-  paddw       xmm6,[esp+60h]
-  paddw       xmm6,xmm3
-  psraw       xmm6,2
-  movdqa      xmm2,xmm0
-  pand        xmm2,xmm6
-  pandn       xmm0,xmm4
-  por         xmm2,xmm0
-  movdqa      xmm0,[esp+20h]
-  movdqa      xmm6,xmm0
-  paddw       xmm6,xmm0
-  movdqa      xmm0,[esp+30h]
-  paddw       xmm6,xmm0
-  paddw       xmm6,[esp+40h]
-  movdqa      xmm4,xmm5
-  paddw       xmm6,xmm3
-  movq        [esi],xmm1
-  psraw       xmm6,2
-  pand        xmm4,xmm6
-  pandn       xmm5,xmm0
-  por         xmm4,xmm5
-  packuswb    xmm2,xmm4
-  movq        [eax],xmm2
-  psrldq      xmm1,8
-  movq        [edi],xmm1
-  pop         edi
-  psrldq      xmm2,8
-  movq        [ecx],xmm2
-  pop         esi
-  mov         esp,ebp
-  pop         ebp
-  ret
-
-;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN  DeblockChromaLt4V_sse2
-
-DeblockChromaLt4V_sse2:
-  push        ebp
-  mov         ebp,esp
-  and         esp,0FFFFFFF0h
-  sub         esp,0E4h
-  push        ebx
-  push        esi
-  mov         esi, [ebp+1Ch]      ;  pTC
-  movsx       ebx, byte [esi+2]
-  push        edi
-  movsx       di,byte [esi+3]
-  mov         word [esp+0Ch],bx
-  movsx       bx,byte  [esi+1]
-  movsx       esi,byte  [esi]
-  mov         word  [esp+0Eh],si
-  movzx       esi,di
-  movd        xmm1,esi
-  movzx       esi,di
-  movd        xmm2,esi
-  mov         si,word  [esp+0Ch]
-  mov         edx, [ebp + 10h]
-  mov         eax, [ebp + 08h]
-  movzx       edi,si
-  movzx       esi,si
-  mov         ecx, [ebp + 0Ch]
-  movd        xmm4,esi
-  movzx       esi,bx
-  movd        xmm5,esi
-  movd        xmm3,edi
-  movzx       esi,bx
-  movd        xmm6,esi
-  mov         si,word [esp+0Eh]
-  movzx       edi,si
-  movzx       esi,si
-  punpcklwd   xmm6,xmm2
-  pxor        xmm0,xmm0
-  movdqa      [esp+40h],xmm0
-  movd        xmm7,edi
-  movd        xmm0,esi
-  lea         esi,[edx+edx]
-  mov         edi,eax
-  sub         edi,esi
-  punpcklwd   xmm5,xmm1
-  movdqa      xmm1,[esp+40h]
-  punpcklwd   xmm0,xmm4
-  movq        xmm4,[edx+ecx]
-  punpcklwd   xmm7,xmm3
-  movq        xmm3,[eax]
-  punpcklwd   xmm0,xmm6
-  movq        xmm6,[edi]
-  punpcklwd   xmm7,xmm5
-  punpcklwd   xmm0,xmm7
-  mov         edi,ecx
-  sub         edi,esi
-  movdqa      xmm2,xmm1
-  psubw       xmm2,xmm0
-  movdqa      [esp+60h],xmm2
-  movq        xmm2, [edi]
-  punpcklqdq  xmm6,xmm2
-  mov         esi,eax
-  sub         esi,edx
-  movq        xmm7,[esi]
-  mov         edi,ecx
-  sub         edi,edx
-  movq        xmm2,[edi]
-  punpcklqdq  xmm7,xmm2
-  movq        xmm2,[ecx]
-  punpcklqdq  xmm3,xmm2
-  movq        xmm2,[edx+eax]
-  movsx       edx,word [ebp + 14h]
-  punpcklqdq  xmm2,xmm4
-  movdqa      [esp+0E0h],xmm2
-  movd        xmm2,edx
-  movsx       edx,word [ebp + 18h]
-  movdqa      xmm4,xmm2
-  punpcklwd   xmm4,xmm2
-  movd        xmm2,edx
-  movdqa      xmm5,xmm2
-  punpcklwd   xmm5,xmm2
-  pshufd      xmm2,xmm5,0
-  movdqa      [esp+50h],xmm2
-  movdqa      xmm2,xmm6
-  punpcklbw   xmm2,xmm1
-  movdqa      [esp+0D0h],xmm3
-  pshufd      xmm4,xmm4,0
-  movdqa      [esp+30h],xmm2
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+80h],xmm6
-  movdqa      xmm6,[esp+0D0h]
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+70h],xmm6
-  movdqa      xmm6, [esp+0E0h]
-  punpckhbw   xmm6,xmm1
-  movdqa     [esp+90h],xmm6
-  movdqa      xmm5, [esp+0E0h]
-  movdqa      xmm2,xmm7
-  punpckhbw   xmm7,xmm1
-  punpcklbw   xmm5,xmm1
-  movdqa       [esp+0A0h],xmm7
-  punpcklbw   xmm3,xmm1
-  mov         edx,4
-  punpcklbw   xmm2,xmm1
-  movsx       edx,dx
-  movd        xmm6,edx
-  movdqa      xmm7,xmm6
-  punpcklwd   xmm7,xmm6
-  pshufd      xmm6,xmm7,0
-  movdqa      xmm7,[esp+30h]
-  movdqa      [esp+20h],xmm6
-  psubw       xmm7,xmm5
-  movdqa      xmm6,xmm0
-  pcmpgtw     xmm6,xmm1
-  movdqa      xmm1,[esp+60h]
-  movdqa      [esp+40h],xmm6
-  movdqa      xmm6,xmm3
-  psubw       xmm6,xmm2
-  psllw       xmm6,2
-  paddw       xmm6,xmm7
-  paddw       xmm6, [esp+20h]
-  movdqa      xmm7, [esp+50h]
-  psraw       xmm6,3
-  pmaxsw      xmm1,xmm6
-  movdqa      [esp+10h],xmm0
-  movdqa      xmm6, [esp+10h]
-  pminsw      xmm6,xmm1
-  movdqa      [esp+10h],xmm6
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm3
-  pabsw       xmm1,xmm1
-  movdqa      xmm6,xmm4
-  pcmpgtw     xmm6,xmm1
-  movdqa      xmm1, [esp+30h]
-  psubw       xmm1,xmm2
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm7,xmm1
-  movdqa      xmm1,[esp+50h]
-  pand        xmm6,xmm7
-  movdqa      xmm7,[esp+50h]
-  psubw       xmm5,xmm3
-  pabsw       xmm5,xmm5
-  pcmpgtw     xmm1,xmm5
-  movdqa      xmm5,[esp+80h]
-  psubw       xmm5,[esp+90h]
-  pand        xmm6,xmm1
-  pand        xmm6,[esp+40h]
-  movdqa      xmm1,[esp+10h]
-  pand        xmm1,xmm6
-  movdqa      xmm6,[esp+70h]
-  movdqa      [esp+30h],xmm1
-  movdqa      xmm1,[esp+0A0h]
-  psubw       xmm6,xmm1
-  psllw       xmm6,2
-  paddw       xmm6,xmm5
-  paddw       xmm6,[esp+20h]
-  movdqa      xmm5,[esp+60h]
-  psraw       xmm6,3
-  pmaxsw      xmm5,xmm6
-  pminsw      xmm0,xmm5
-  movdqa      xmm5,[esp+70h]
-  movdqa      xmm6,xmm1
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm4,xmm6
-  movdqa      xmm6,[esp+80h]
-  psubw       xmm6,xmm1
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+90h]
-  pand        xmm4,xmm7
-  movdqa      xmm7,[esp+50h]
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  pand        xmm4,xmm7
-  pand        xmm4,[esp+40h]
-  pand        xmm0,xmm4
-  movdqa      xmm4,[esp+30h]
-  paddw       xmm2,xmm4
-  paddw       xmm1,xmm0
-  packuswb    xmm2,xmm1
-  movq        [esi],xmm2
-  psubw       xmm3,xmm4
-  psubw       xmm5,xmm0
-  packuswb    xmm3,xmm5
-  movq        [eax],xmm3
-  psrldq      xmm2,8
-  movq        [edi],xmm2
-  pop         edi
-  pop         esi
-  psrldq      xmm3,8
-  movq        [ecx],xmm3
-  pop         ebx
-  mov         esp,ebp
-  pop         ebp
-  ret
-
-;***************************************************************************
-;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;          int32_t iAlpha, int32_t iBeta)
-;***************************************************************************
-
-WELS_EXTERN     DeblockChromaEq4H_sse2
-
-ALIGN  16
-
-DeblockChromaEq4H_sse2:
-  push        ebp
-  mov         ebp,esp
-  and         esp,0FFFFFFF0h
-  sub         esp,0C8h
-  mov         ecx,dword [ebp+8]
-  mov         edx,dword [ebp+0Ch]
-  mov         eax,dword [ebp+10h]
-  sub         ecx,2
-  sub         edx,2
-  push        esi
-  lea         esi,[eax+eax*2]
-  mov         dword [esp+18h],ecx
-  mov         dword [esp+4],edx
-  lea         ecx,[ecx+eax*4]
-  lea         edx,[edx+eax*4]
-  lea         eax,[esp+7Ch]
-  push        edi
-  mov         dword [esp+14h],esi
-  mov         dword [esp+18h],ecx
-  mov         dword [esp+0Ch],edx
-  mov         dword [esp+10h],eax
-  mov         esi,dword [esp+1Ch]
-  mov         ecx,dword [ebp+10h]
-  mov         edx,dword [esp+14h]
-  movd        xmm0,dword [esi]
-  movd        xmm1,dword [esi+ecx]
-  movd        xmm2,dword [esi+ecx*2]
-  movd        xmm3,dword [esi+edx]
-  mov         esi,dword  [esp+8]
-  movd        xmm4,dword [esi]
-  movd        xmm5,dword [esi+ecx]
-  movd        xmm6,dword [esi+ecx*2]
-  movd        xmm7,dword [esi+edx]
-  punpckldq   xmm0,xmm4
-  punpckldq   xmm1,xmm5
-  punpckldq   xmm2,xmm6
-  punpckldq   xmm3,xmm7
-  mov         esi,dword [esp+18h]
-  mov         edi,dword [esp+0Ch]
-  movd        xmm4,dword [esi]
-  movd        xmm5,dword [edi]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm0,xmm4
-  movd        xmm4,dword [esi+ecx]
-  movd        xmm5,dword [edi+ecx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm1,xmm4
-  movd        xmm4,dword [esi+ecx*2]
-  movd        xmm5,dword [edi+ecx*2]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm2,xmm4
-  movd        xmm4,dword [esi+edx]
-  movd        xmm5,dword [edi+edx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm3,xmm4
-  movdqa      xmm6,xmm0
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         edi,dword [esp+10h]
-  movdqa      [edi],xmm0
-  movdqa      [edi+10h],xmm5
-  movdqa      [edi+20h],xmm1
-  movdqa      [edi+30h],xmm6
-  movsx       ecx,word [ebp+14h]
-  movsx       edx,word [ebp+18h]
-  movdqa      xmm6,[esp+80h]
-  movdqa      xmm4,[esp+90h]
-  movdqa      xmm5,[esp+0A0h]
-  movdqa      xmm7,[esp+0B0h]
-  pxor        xmm0,xmm0
-  movd        xmm1,ecx
-  movdqa      xmm2,xmm1
-  punpcklwd   xmm2,xmm1
-  pshufd      xmm1,xmm2,0
-  movd        xmm2,edx
-  movdqa      xmm3,xmm2
-  punpcklwd   xmm3,xmm2
-  pshufd      xmm2,xmm3,0
-  movdqa      xmm3,xmm6
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+60h],xmm6
-  movdqa      xmm6,[esp+90h]
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+30h],xmm6
-  movdqa      xmm6,[esp+0A0h]
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+40h],xmm6
-  movdqa      xmm6,[esp+0B0h]
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+70h],xmm6
-  punpcklbw   xmm7,xmm0
-  punpcklbw   xmm4,xmm0
-  punpcklbw   xmm5,xmm0
-  punpcklbw   xmm3,xmm0
-  movdqa      [esp+50h],xmm7
-  movdqa      xmm6,xmm4
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  movdqa      xmm0,xmm1
-  pcmpgtw     xmm0,xmm6
-  movdqa      xmm6,xmm3
-  psubw       xmm6,xmm4
-  pabsw       xmm6,xmm6
-  movdqa      xmm7,xmm2
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+50h]
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pand        xmm0,xmm7
-  movdqa      xmm7,xmm2
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+30h]
-  psubw       xmm6,[esp+40h]
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm1,xmm6
-  movdqa      xmm6,[esp+60h]
-  psubw       xmm6,[esp+30h]
-  pabsw       xmm6,xmm6
-  pand        xmm0,xmm7
-  movdqa      xmm7,xmm2
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+70h]
-  psubw       xmm6,[esp+40h]
-  pabsw       xmm6,xmm6
-  pand        xmm1,xmm7
-  pcmpgtw     xmm2,xmm6
-  pand        xmm1,xmm2
-  mov         eax,2
-  movsx       ecx,ax
-  movd        xmm2,ecx
-  movdqa      xmm6,xmm2
-  punpcklwd   xmm6,xmm2
-  pshufd      xmm2,xmm6,0
-  movdqa      [esp+20h],xmm2
-  movdqa      xmm2,xmm3
-  paddw       xmm2,xmm3
-  paddw       xmm2,xmm4
-  paddw       xmm2,[esp+50h]
-  paddw       xmm2,[esp+20h]
-  psraw       xmm2,2
-  movdqa      xmm6,xmm0
-  pand        xmm6,xmm2
-  movdqa      xmm2,xmm0
-  pandn       xmm2,xmm4
-  por         xmm6,xmm2
-  movdqa      xmm2,[esp+60h]
-  movdqa      xmm7,xmm2
-  paddw       xmm7,xmm2
-  paddw       xmm7,[esp+30h]
-  paddw       xmm7,[esp+70h]
-  paddw       xmm7,[esp+20h]
-  movdqa      xmm4,xmm1
-  movdqa      xmm2,xmm1
-  pandn       xmm2,[esp+30h]
-  psraw       xmm7,2
-  pand        xmm4,xmm7
-  por         xmm4,xmm2
-  movdqa      xmm2,[esp+50h]
-  packuswb    xmm6,xmm4
-  movdqa      [esp+90h],xmm6
-  movdqa      xmm6,xmm2
-  paddw       xmm6,xmm2
-  movdqa      xmm2,[esp+20h]
-  paddw       xmm6,xmm5
-  paddw       xmm6,xmm3
-  movdqa      xmm4,xmm0
-  pandn       xmm0,xmm5
-  paddw       xmm6,xmm2
-  psraw       xmm6,2
-  pand        xmm4,xmm6
-  por         xmm4,xmm0
-  movdqa      xmm0,[esp+70h]
-  movdqa      xmm5,xmm0
-  paddw       xmm5,xmm0
-  movdqa      xmm0,[esp+40h]
-  paddw       xmm5,xmm0
-  paddw       xmm5,[esp+60h]
-  movdqa      xmm3,xmm1
-  paddw       xmm5,xmm2
-  psraw       xmm5,2
-  pand        xmm3,xmm5
-  pandn       xmm1,xmm0
-  por         xmm3,xmm1
-  packuswb    xmm4,xmm3
-  movdqa      [esp+0A0h],xmm4
-  mov         esi,dword [esp+10h]
-  movdqa      xmm0,[esi]
-  movdqa      xmm1,[esi+10h]
-  movdqa      xmm2,[esi+20h]
-  movdqa      xmm3,[esi+30h]
-  movdqa      xmm6,xmm0
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         esi,dword [esp+1Ch]
-  mov         ecx,dword [ebp+10h]
-  mov         edx,dword [esp+14h]
-  mov         edi,dword [esp+8]
-  movd        dword [esi],xmm0
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         esi,dword [esp+18h]
-  movd        dword [edi],xmm0
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  movd        dword [esi],xmm0
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         edi,dword [esp+0Ch]
-  movd        dword [edi],xmm0
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  pop         edi
-  pop         esi
-  mov         esp,ebp
-  pop         ebp
-  ret
-
-;*******************************************************************************
-;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN  DeblockChromaLt4H_sse2
-
-ALIGN  16
-
-DeblockChromaLt4H_sse2:
-  push        ebp
-  mov         ebp,esp
-  and         esp,0FFFFFFF0h
-  sub         esp,108h
-  mov         ecx,dword [ebp+8]
-  mov         edx,dword [ebp+0Ch]
-  mov         eax,dword [ebp+10h]
-  sub         ecx,2
-  sub         edx,2
-  push        esi
-  lea         esi,[eax+eax*2]
-  mov         dword [esp+10h],ecx
-  mov         dword [esp+4],edx
-  lea         ecx,[ecx+eax*4]
-  lea         edx,[edx+eax*4]
-  lea         eax,[esp+6Ch]
-  push        edi
-  mov         dword [esp+0Ch],esi
-  mov         dword [esp+18h],ecx
-  mov         dword [esp+10h],edx
-  mov         dword [esp+1Ch],eax
-  mov         esi,dword [esp+14h]
-  mov         ecx,dword [ebp+10h]
-  mov         edx,dword [esp+0Ch]
-  movd        xmm0,dword [esi]
-  movd        xmm1,dword [esi+ecx]
-  movd        xmm2,dword [esi+ecx*2]
-  movd        xmm3,dword [esi+edx]
-  mov         esi,dword [esp+8]
-  movd        xmm4,dword [esi]
-  movd        xmm5,dword [esi+ecx]
-  movd        xmm6,dword [esi+ecx*2]
-  movd        xmm7,dword [esi+edx]
-  punpckldq   xmm0,xmm4
-  punpckldq   xmm1,xmm5
-  punpckldq   xmm2,xmm6
-  punpckldq   xmm3,xmm7
-  mov         esi,dword [esp+18h]
-  mov         edi,dword [esp+10h]
-  movd        xmm4,dword [esi]
-  movd        xmm5,dword [edi]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm0,xmm4
-  movd        xmm4,dword [esi+ecx]
-  movd        xmm5,dword [edi+ecx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm1,xmm4
-  movd        xmm4,dword [esi+ecx*2]
-  movd        xmm5,dword [edi+ecx*2]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm2,xmm4
-  movd        xmm4,dword [esi+edx]
-  movd        xmm5,dword [edi+edx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm3,xmm4
-  movdqa      xmm6,xmm0
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         edi,dword [esp+1Ch]
-  movdqa      [edi],xmm0
-  movdqa      [edi+10h],xmm5
-  movdqa      [edi+20h],xmm1
-  movdqa      [edi+30h],xmm6
-  mov         eax,dword [ebp+1Ch]
-  movsx       cx,byte [eax+3]
-  movsx       dx,byte [eax+2]
-  movsx       si,byte [eax+1]
-  movsx       ax,byte [eax]
-  movzx       edi,cx
-  movzx       ecx,cx
-  movd        xmm2,ecx
-  movzx       ecx,dx
-  movzx       edx,dx
-  movd        xmm3,ecx
-  movd        xmm4,edx
-  movzx       ecx,si
-  movzx       edx,si
-  movd        xmm5,ecx
-  pxor        xmm0,xmm0
-  movd        xmm6,edx
-  movzx       ecx,ax
-  movdqa      [esp+60h],xmm0
-  movzx       edx,ax
-  movsx       eax,word [ebp+14h]
-  punpcklwd   xmm6,xmm2
-  movd        xmm1,edi
-  movd        xmm7,ecx
-  movsx       ecx,word [ebp+18h]
-  movd        xmm0,edx
-  punpcklwd   xmm7,xmm3
-  punpcklwd   xmm5,xmm1
-  movdqa      xmm1,[esp+60h]
-  punpcklwd   xmm7,xmm5
-  movdqa      xmm5,[esp+0A0h]
-  punpcklwd   xmm0,xmm4
-  punpcklwd   xmm0,xmm6
-  movdqa      xmm6, [esp+70h]
-  punpcklwd   xmm0,xmm7
-  movdqa      xmm7,[esp+80h]
-  movdqa      xmm2,xmm1
-  psubw       xmm2,xmm0
-  movdqa      [esp+0D0h],xmm2
-  movd        xmm2,eax
-  movdqa      xmm3,xmm2
-  punpcklwd   xmm3,xmm2
-  pshufd      xmm4,xmm3,0
-  movd        xmm2,ecx
-  movdqa      xmm3,xmm2
-  punpcklwd   xmm3,xmm2
-  pshufd      xmm2,xmm3,0
-  movdqa      xmm3, [esp+90h]
-  movdqa      [esp+50h],xmm2
-  movdqa      xmm2,xmm6
-  punpcklbw   xmm2,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+40h],xmm2
-  movdqa      [esp+0B0h],xmm6
-  movdqa      xmm6,[esp+90h]
-  movdqa      xmm2,xmm7
-  punpckhbw   xmm7,xmm1
-  punpckhbw   xmm6,xmm1
-  punpcklbw   xmm2,xmm1
-  punpcklbw   xmm3,xmm1
-  punpcklbw   xmm5,xmm1
-  movdqa      [esp+0F0h],xmm7
-  movdqa      [esp+0C0h],xmm6
-  movdqa      xmm6, [esp+0A0h]
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+0E0h],xmm6
-  mov         edx,4
-  movsx       eax,dx
-  movd        xmm6,eax
-  movdqa      xmm7,xmm6
-  punpcklwd   xmm7,xmm6
-  pshufd      xmm6,xmm7,0
-  movdqa      [esp+30h],xmm6
-  movdqa      xmm7, [esp+40h]
-  psubw       xmm7,xmm5
-  movdqa      xmm6,xmm0
-  pcmpgtw     xmm6,xmm1
-  movdqa      [esp+60h],xmm6
-  movdqa      xmm1, [esp+0D0h]
-  movdqa      xmm6,xmm3
-  psubw       xmm6,xmm2
-  psllw       xmm6,2
-  paddw       xmm6,xmm7
-  paddw       xmm6,[esp+30h]
-  psraw       xmm6,3
-  pmaxsw      xmm1,xmm6
-  movdqa      xmm7,[esp+50h]
-  movdqa      [esp+20h],xmm0
-  movdqa      xmm6, [esp+20h]
-  pminsw      xmm6,xmm1
-  movdqa      [esp+20h],xmm6
-  movdqa      xmm6,xmm4
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm3
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm6,xmm1
-  movdqa      xmm1, [esp+40h]
-  psubw       xmm1,xmm2
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm7,xmm1
-  movdqa      xmm1, [esp+50h]
-  pand        xmm6,xmm7
-  movdqa      xmm7, [esp+50h]
-  psubw       xmm5,xmm3
-  pabsw       xmm5,xmm5
-  pcmpgtw     xmm1,xmm5
-  movdqa      xmm5, [esp+0B0h]
-  psubw       xmm5,[esp+0E0h]
-  pand        xmm6,xmm1
-  pand        xmm6, [esp+60h]
-  movdqa      xmm1, [esp+20h]
-  pand        xmm1,xmm6
-  movdqa      xmm6, [esp+0C0h]
-  movdqa      [esp+40h],xmm1
-  movdqa      xmm1, [esp+0F0h]
-  psubw       xmm6,xmm1
-  psllw       xmm6,2
-  paddw       xmm6,xmm5
-  paddw       xmm6, [esp+30h]
-  movdqa      xmm5, [esp+0D0h]
-  psraw       xmm6,3
-  pmaxsw      xmm5,xmm6
-  pminsw      xmm0,xmm5
-  movdqa      xmm5,[esp+0C0h]
-  movdqa      xmm6,xmm1
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm4,xmm6
-  movdqa      xmm6,[esp+0B0h]
-  psubw       xmm6,xmm1
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6, [esp+0E0h]
-  pand        xmm4,xmm7
-  movdqa      xmm7, [esp+50h]
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  pand        xmm4,xmm7
-  pand        xmm4,[esp+60h]
-  pand        xmm0,xmm4
-  movdqa      xmm4, [esp+40h]
-  paddw       xmm2,xmm4
-  paddw       xmm1,xmm0
-  psubw       xmm3,xmm4
-  psubw       xmm5,xmm0
-  packuswb    xmm2,xmm1
-  packuswb    xmm3,xmm5
-  movdqa      [esp+80h],xmm2
-  movdqa      [esp+90h],xmm3
-  mov         esi,dword [esp+1Ch]
-  movdqa      xmm0, [esi]
-  movdqa      xmm1, [esi+10h]
-  movdqa      xmm2, [esi+20h]
-  movdqa      xmm3, [esi+30h]
-  movdqa      xmm6,xmm0
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         esi,dword [esp+14h]
-  mov         ecx,dword [ebp+10h]
-  mov         edx,dword [esp+0Ch]
-  mov         edi,dword [esp+8]
-  movd        dword [esi],xmm0
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         esi,dword [esp+18h]
-  movd        dword [edi],xmm0
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  movd        dword [esi],xmm0
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         edi,dword [esp+10h]
-  movd        dword [edi],xmm0
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  pop         edi
-  pop         esi
-  mov         esp,ebp
-  pop         ebp
-  ret
-
-
-
-;*******************************************************************************
-;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-;                                 int32_t iBeta, int8_t * pTC)
-;*******************************************************************************
-; Filters 16 pixels per row in place: rows pPix-2*iStride .. pPix+iStride are rewritten, rows pPix-3*iStride and pPix+2*iStride are only read.
-; Row names used in the comments below: p2,p1,p0 = pPix-3,-2,-1*iStride; q0,q1,q2 = pPix+0,+1,+2*iStride. Rows are accessed with movdqa (16-byte aligned addresses required).
-WELS_EXTERN  DeblockLumaLt4V_sse2
-; NOTE(review): pabsw used below is an SSSE3 instruction although the symbol is suffixed _sse2 -- confirm callers check for SSSE3.
-ALIGN  16
-
-DeblockLumaLt4V_sse2:
-    push	ebp
-	mov	ebp, esp  ; stack args: [ebp+8]=pPix, +12=iStride, +16=iAlpha, +20=iBeta, +24=pTC
-	and	esp, -16				; fffffff0H
-	sub	esp, 420				; 000001a4H
-	mov	eax, dword [ebp+8]  ; eax = pPix
-	mov	ecx, dword [ebp+12]  ; ecx = iStride
-
-	pxor	xmm0, xmm0  ; xmm0 = 0
-	push	ebx
-	mov	edx, dword [ebp+24]  ; edx = pTC
-	movdqa	[esp+424-384], xmm0  ; keep a zero vector on the stack (same slot as [esp+432-384] once esi/edi are pushed)
-	push	esi
-
-	lea	esi, [ecx+ecx*2]  ; esi = 3*iStride
-	push	edi  ; from here locals are written as [esp+432-K]: 432 = 420-byte frame + 3 pushed registers
-	mov	edi, eax
-	sub	edi, esi  ; edi = pPix - 3*iStride
-	movdqa	xmm0, [edi]  ; p2 row
-
-	lea	esi, [ecx+ecx]  ; esi = 2*iStride
-	movdqa	[esp+432-208], xmm0  ; save p2 bytes
-	mov	edi, eax
-	sub	edi, esi  ; edi = pPix - 2*iStride (p1 row address, kept until the final store)
-	movdqa	xmm0, [edi]  ; p1 row
-	movdqa	[esp+448-208], xmm0  ; save p1 bytes
-
-	mov	ebx, eax
-	sub	ebx, ecx  ; ebx = pPix - iStride (p0 row address)
-	movdqa	xmm0, [ebx]  ; p0 row
-	movdqa	[esp+464-208], xmm0  ; save p0 bytes
-
-	movdqa	xmm0, [eax]  ; q0 row
-
-	add	ecx, eax  ; ecx = pPix + iStride (q1 row address)
-	movdqa	[esp+480-208], xmm0  ; save q0 bytes
-	movdqa	xmm0, [ecx]  ; q1 row
-	mov	dword [esp+432-404], ecx  ; remember q1 row address for the final store
-
-	movsx	ecx, word [ebp+16]  ; low 16 bits of iAlpha, sign-extended
-	movdqa	[esp+496-208], xmm0  ; save q1 bytes
-	movdqa	xmm0, [esi+eax]  ; q2 row (pPix + 2*iStride)
-
-	movsx	si, byte [edx]  ; si = pTC[0]
-	movdqa	[esp+512-208], xmm0  ; save q2 bytes
-	movd	xmm0, ecx
-	movsx	ecx, word [ebp+20]  ; low 16 bits of iBeta, sign-extended
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	pshufd	xmm0, xmm1, 0  ; alpha broadcast to 8 words
-	movdqa	[esp+432-112], xmm0  ; save alpha words
-	movd	xmm0, ecx
-	movsx	cx, byte [edx+1]  ; cx = pTC[1]
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	mov	dword [esp+432-408], ebx  ; remember p0 row address for the final store
-	movzx	ebx, cx
-	pshufd	xmm0, xmm1, 0  ; beta broadcast to 8 words
-	movd	xmm1, ebx
-	movzx	ebx, cx
-	movd	xmm2, ebx
-	movzx	ebx, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, si
-	movd	xmm5, ecx
-	movzx	ecx, si
-	movd	xmm6, ecx
-	movzx	ecx, si
-	movd	xmm7, ecx
-	movzx	ecx, si
-	movdqa	[esp+432-336], xmm0  ; save beta words
-	movd	xmm0, ecx
-
-	movsx	cx, byte [edx+3]  ; cx = pTC[3]
-	movsx	dx, byte [edx+2]  ; dx = pTC[2]
-	movd	xmm3, ebx
-	punpcklwd xmm0, xmm4  ; the interleaves below build: words 0-3 = pTC[0], words 4-7 = pTC[1]
-	movzx	esi, cx
-	punpcklwd xmm6, xmm2
-	punpcklwd xmm5, xmm1
-	punpcklwd xmm0, xmm6
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	punpcklwd xmm0, xmm7
-	movdqa	[esp+432-400], xmm0  ; tc words for the low 8 pixels
-	movd	xmm0, esi
-	movzx	esi, cx
-	movd	xmm2, esi
-	movzx	esi, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, dx
-	movd	xmm3, esi
-	movd	xmm5, ecx
-	punpcklwd xmm5, xmm0
-
-	movdqa	xmm0, [esp+432-384]  ; reload the zero vector
-	movzx	ecx, dx
-	movd	xmm6, ecx
-	movzx	ecx, dx
-	movzx	edx, dx
-	punpcklwd xmm6, xmm2
-	movd	xmm7, ecx
-	movd	xmm1, edx
-
-	movdqa	xmm2, [esp+448-208]
-	punpcklbw xmm2, xmm0  ; xmm2 = p1, low 8 bytes widened to words
-
-	mov	ecx, 4
-	movsx	edx, cx  ; edx = 4, rounding term broadcast further down
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	movdqa	xmm5, [esp+496-208]
-	movdqa	xmm3, [esp+464-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-240], xmm5  ; q1 low words
-	movdqa	xmm5, [esp+512-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-352], xmm5  ; q2 low words
-	punpcklwd xmm1, xmm4
-	movdqa	xmm4, [esp+432-208]
-	punpcklwd xmm1, xmm6
-	movdqa	xmm6, [esp+480-208]
-	punpcklwd xmm1, xmm7  ; xmm1 = tc words for the high 8 pixels: words 0-3 = pTC[2], words 4-7 = pTC[3]
-	punpcklbw xmm6, xmm0  ; xmm6 = q0 low words
-	punpcklbw xmm3, xmm0  ; xmm3 = p0 low words
-	punpcklbw xmm4, xmm0  ; xmm4 = p2 low words
-	movdqa	xmm7, xmm3
-	psubw	xmm7, xmm4
-	pabsw	xmm7, xmm7  ; |p0 - p2|
-	movdqa	[esp+432-272], xmm4  ; save p2 low words
-	movdqa	xmm4, [esp+432-336]  ; xmm4 = beta words (stays live through both halves)
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5  ; mask: beta > |p0 - p2|
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-352]
-	pabsw	xmm7, xmm7  ; |q0 - q2|
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5  ; mask: beta > |q0 - q2|
-	movdqa	xmm5, xmm3
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5  ; rounded average of p0 and q0
-	movdqa	xmm5, [esp+432-400]
-	psubw	xmm5, [esp+432-288]  ; masks are 0/-1, so subtracting a set mask adds 1
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5  ; tc + (one per satisfied side test): clamp bound for the p0/q0 delta
-	movdqa	xmm5, xmm6
-	psubw	xmm5, xmm3  ; q0 - p0
-	movdqa	[esp+432-32], xmm6  ; save q0 low words
-	psubw	xmm6, [esp+432-240]  ; q0 - q1
-	movdqa	xmm7, xmm5
-	movdqa	[esp+432-384], xmm5  ; save q0 - p0 (reuses the zero slot; xmm0 still holds zero)
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7  ; alpha > |q0 - p0|
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6  ; beta > |q0 - q1|
-
-	pand	xmm5, xmm7
-	movdqa	xmm6, xmm3
-	psubw	xmm6, xmm2
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6  ; beta > |p0 - p1|
-	movdqa	xmm6, [esp+432-400]
-	pand	xmm5, xmm7
-	movdqa	xmm7, xmm6
-	pcmpeqw	xmm6, xmm0
-	pcmpgtw	xmm7, xmm0
-	por	xmm7, xmm6  ; tc >= 0
-	pand	xmm5, xmm7
-	movdqa	[esp+432-320], xmm5  ; combined filter-enable mask for the low 8 pixels
-	movd	xmm5, edx
-	movdqa	xmm6, xmm5
-	punpcklwd xmm6, xmm5
-	pshufd	xmm5, xmm6, 0
-	movdqa	[esp+432-336], xmm5  ; constant 4 in every word (replaces beta in this slot; beta stays in xmm4)
-	movdqa	xmm5, [esp+432-224]
-	movdqa	[esp+432-368], xmm5
-	movdqa	xmm6, xmm0
-	psubw	xmm6, xmm5  ; negated clamp bound
-	movdqa	xmm5, [esp+432-384]
-	psllw	xmm5, 2
-	movdqa	xmm7, xmm2
-	psubw	xmm7, [esp+432-240]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	psraw	xmm7, 3  ; (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3
-	pmaxsw	xmm6, xmm7
-	pminsw	xmm5, xmm6  ; clamp to [-bound, +bound]
-
-	pand	xmm5, [esp+432-320]
-	movdqa	xmm6, [esp+432-400]
-	movdqa	[esp+432-64], xmm5  ; masked delta applied to p0 (+) and q0 (-), low half
-	movdqa	[esp+432-384], xmm6  ; tc, low half
-	movdqa	xmm5, xmm0
-	psubw	xmm5, xmm6
-	movdqa	[esp+432-368], xmm5  ; -tc, low half
-	movdqa	xmm6, xmm5
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm2
-	paddw	xmm7, xmm2
-	psubw	xmm5, xmm7
-	psraw	xmm5, 1  ; (p2 + avg(p0,q0) - 2*p1) >> 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-384]
-	pminsw	xmm5, xmm6  ; clamp to [-tc, +tc]
-
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-288]
-	movdqa	xmm6, [esp+432-240]
-	movdqa	[esp+432-96], xmm5  ; p1 delta, low half (only where beta > |p0 - p2|)
-	movdqa	xmm5, [esp+432-352]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm6
-	paddw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
-	psubw	xmm5, xmm7
-
-	movdqa	xmm7, [esp+496-208]
-	psraw	xmm5, 1  ; (q2 + avg(p0,q0) - 2*q1) >> 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-400]
-	pminsw	xmm5, xmm6  ; clamp to [-tc, +tc]
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-256]
-	movdqa	xmm6, [esp+448-208]
-	punpckhbw xmm7, xmm0  ; from here the high 8 bytes of each row are widened and processed the same way
-	movdqa	[esp+432-352], xmm7  ; q1 high words
-
-	movdqa	xmm7, [esp+512-208]
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-48], xmm5  ; q1 delta, low half (only where beta > |q0 - q2|)
-	movdqa	xmm5, [esp+432-208]
-	movdqa	[esp+432-368], xmm6  ; p1 high words
-	movdqa	xmm6, [esp+464-208]
-	punpckhbw xmm7, xmm0
-	punpckhbw xmm5, xmm0  ; xmm5 = p2 high words
-	movdqa	[esp+432-384], xmm7  ; q2 high words
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-400], xmm6  ; p0 high words (the low-half tc in this slot is no longer needed)
-
-	movdqa	xmm7, [esp+432-400]
-	movdqa	xmm6, [esp+480-208]
-	psubw	xmm7, xmm5
-	movdqa	[esp+432-16], xmm5  ; save p2 high words
-	pabsw	xmm7, xmm7
-	punpckhbw xmm6, xmm0  ; xmm6 = q0 high words
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5  ; mask: beta > |p0 - p2|, high half
-
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-384]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5  ; mask: beta > |q0 - q2|, high half
-
-	movdqa	xmm5, [esp+432-400]
-	movdqa	[esp+432-80], xmm6  ; save q0 high words
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5  ; rounded average of p0 and q0, high half
-
-	movdqa	xmm5, xmm1
-	psubw	xmm5, [esp+432-288]
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5  ; clamp bound for the p0/q0 delta, high half
-	movdqa	xmm5, xmm6
-	psubw	xmm5, [esp+432-400]
-	psubw	xmm6, [esp+432-352]
-	movdqa	[esp+432-272], xmm5  ; q0 - p0, high half
-	movdqa	xmm7, xmm5
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7  ; alpha > |q0 - p0|
-	movdqa	xmm7, xmm4
-	pabsw	xmm6, xmm6
-	pcmpgtw	xmm7, xmm6  ; beta > |q0 - q1|
-	movdqa	xmm6, [esp+432-368]
-
-	pand	xmm5, xmm7
-	movdqa	xmm7, [esp+432-400]
-	psubw	xmm7, xmm6
-	psubw	xmm6, [esp+432-352]  ; p1 - q1
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7  ; beta > |p0 - p1| (last use of beta)
-	pand	xmm5, xmm4
-
-	paddw	xmm2, [esp+432-96]  ; new p1, low half
-	movdqa	xmm4, xmm1
-	pcmpgtw	xmm4, xmm0
-	movdqa	xmm7, xmm1
-	pcmpeqw	xmm7, xmm0
-	por	xmm4, xmm7  ; tc >= 0, high half
-	pand	xmm5, xmm4
-	movdqa	xmm4, [esp+432-224]
-	movdqa	[esp+432-320], xmm5  ; combined filter-enable mask, high half
-	movdqa	xmm5, [esp+432-272]
-	movdqa	xmm7, xmm0
-	psubw	xmm7, xmm4
-	psubw	xmm0, xmm1  ; xmm0 = -tc, high half (zero vector is consumed here)
-	psllw	xmm5, 2
-	paddw	xmm6, xmm5
-	paddw	xmm6, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	movdqa	[esp+432-336], xmm0  ; slot now holds -tc (the constant 4 is no longer needed)
-	psraw	xmm6, 3
-	pmaxsw	xmm7, xmm6
-	pminsw	xmm4, xmm7
-	pand	xmm4, [esp+432-320]
-	movdqa	xmm6, xmm0
-	movdqa	xmm0, [esp+432-16]
-	paddw	xmm0, [esp+432-304]
-	movdqa	[esp+432-272], xmm4  ; masked p0/q0 delta, high half
-	movdqa	xmm4, [esp+432-368]
-	paddw	xmm4, xmm4
-	psubw	xmm0, xmm4
-
-	movdqa	xmm4, [esp+432-64]
-	psraw	xmm0, 1
-	pmaxsw	xmm6, xmm0
-	movdqa	xmm0, [esp+432-400]
-	movdqa	xmm7, xmm1
-	pminsw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-320]
-	pand	xmm7, xmm6
-	pand	xmm7, [esp+432-288]
-	paddw	xmm5, xmm7  ; new p1, high half
-	packuswb xmm2, xmm5  ; xmm2 = new p1 row as 16 bytes (unsigned saturation)
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm0, xmm5
-	paddw	xmm3, xmm4
-	packuswb xmm3, xmm0  ; xmm3 = new p0 row (p0 + delta)
-
-	movdqa	xmm0, [esp+432-32]
-	psubw	xmm0, xmm4
-	movdqa	xmm4, [esp+432-80]
-	psubw	xmm4, xmm5
-
-	movdqa	xmm5, [esp+432-240]
-	paddw	xmm5, [esp+432-48]  ; new q1, low half
-	packuswb xmm0, xmm4  ; new q0 row (q0 - delta)
-	movdqa	xmm4, [esp+432-384]
-	paddw	xmm4, [esp+432-304]
-	movdqa	[esp+480-208], xmm0
-	movdqa	xmm0, [esp+432-352]
-	movdqa	xmm7, xmm0
-	paddw	xmm0, xmm0
-
-	mov	ecx, dword [esp+432-408]  ; ecx = p0 row address
-
-	mov	edx, dword [esp+432-404]  ; edx = q1 row address
-	psubw	xmm4, xmm0
-	movdqa	xmm0, [esp+432-336]
-	movdqa	[edi], xmm2  ; store p1 row at pPix - 2*iStride
-	psraw	xmm4, 1
-	pmaxsw	xmm0, xmm4
-	pminsw	xmm1, xmm0
-	movdqa	xmm0, [esp+480-208]
-
-	pop	edi  ; esp moves up by 4, so the next esp-relative operand uses 428 instead of 432
-	pand	xmm1, xmm6
-	pand	xmm1, [esp+428-256]
-	movdqa	[ecx], xmm3  ; store p0 row
-	paddw	xmm7, xmm1  ; new q1, high half
-	pop	esi
-	packuswb xmm5, xmm7
-	movdqa	[eax], xmm0  ; store q0 row at pPix
-	movdqa	[edx], xmm5  ; store q1 row at pPix + iStride
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
-
-
-;*******************************************************************************
-;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-;                                 int32_t iBeta)
-;*******************************************************************************
-; Filters 16 pixels per row in place: rows pPix-3*iStride .. pPix+2*iStride are rewritten; rows pPix-4*iStride and pPix+3*iStride are only read. Row names below: p3..p0 = pPix-4..-1*iStride, q0..q3 = pPix+0..+3*iStride.
-WELS_EXTERN  DeblockLumaEq4V_sse2
-; NOTE(review): pabsw used below is an SSSE3 instruction although the symbol is suffixed _sse2 -- confirm callers check for SSSE3.
-ALIGN  16
-
-DeblockLumaEq4V_sse2:
-
-	push	ebp
-	mov	ebp, esp  ; stack args: [ebp+8]=pPix, +12=iStride, +16=iAlpha, +20=iBeta
-	and	esp, -16				; fffffff0H
-	sub	esp, 628				; 00000274H
-	mov	eax, dword [ebp+8]  ; eax = pPix
-	mov	ecx, dword [ebp+12]  ; ecx = iStride
-	push	ebx
-	push	esi
-
-	lea	edx, [ecx*4]  ; edx = 4*iStride
-	pxor	xmm0, xmm0
-	movdqa	xmm2, xmm0  ; xmm2 = 0 (used for byte->word widening)
-
-	movdqa	xmm0, [ecx+eax]  ; q1 row
-	mov	esi, eax
-	sub	esi, edx
-	movdqa	xmm3, [esi]  ; p3 row (pPix - 4*iStride)
-	movdqa	xmm5, [eax]  ; q0 row
-	push	edi  ; from here locals are written as [esp+640-K]: 640 = 628-byte frame + 3 pushed registers
-	lea	edi, [ecx+ecx]  ; 2*iStride
-	lea	ebx, [ecx+ecx*2]  ; 3*iStride
-	mov	dword [esp+640-600], edi
-	mov	esi, eax
-	sub	esi, edi  ; esi = pPix - 2*iStride (p1 row address, kept for the final store)
-	movdqa	xmm1, [esi]  ; p1 row
-	movdqa	 [esp+720-272], xmm0  ; save q1 bytes
-	mov	edi, eax
-	sub	edi, ecx  ; edi = pPix - iStride (p0 row address, kept for the final store)
-	movdqa	xmm4, [edi]  ; p0 row
-	add	ecx, eax
-	mov	dword [esp+640-596], ecx  ; remember q1 row address (pPix + iStride)
-
-	mov	ecx, dword [esp+640-600]
-	movdqa	xmm0, [ecx+eax]  ; q2 row
-	movdqa	 [esp+736-272], xmm0  ; save q2 bytes
-
-	movdqa	xmm0, [eax+ebx]  ; q3 row (pPix + 3*iStride)
-	mov	edx, eax
-	sub	edx, ebx  ; edx = pPix - 3*iStride (p2 row address)
-
-	movsx	ebx, word [ebp+16]  ; low 16 bits of iAlpha, sign-extended
-	movdqa	xmm6, [edx]  ; p2 row
-	add	ecx, eax  ; ecx = pPix + 2*iStride (q2 row address, kept for the final store)
-	movdqa	 [esp+752-272], xmm0  ; save q3 bytes
-	movd	xmm0, ebx
-
-	movsx	ebx, word [ebp+20]  ; low 16 bits of iBeta, sign-extended
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
-	movdqa	 [esp+640-320], xmm0  ; alpha broadcast to 8 words
-	movd	xmm0, ebx
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0  ; beta broadcast to 8 words
-
-	movdqa	xmm7, [esp+736-272]
-	punpcklbw xmm7, xmm2
-	movdqa	 [esp+640-416], xmm7  ; q2 low words
-	movdqa	 [esp+640-512], xmm0  ; save beta words
-	movdqa	xmm0, xmm1
-	movdqa	 [esp+672-272], xmm1  ; save p1 bytes
-	movdqa	xmm1, xmm4
-	movdqa	 [esp+704-272], xmm5  ; save q0 bytes
-	punpcklbw xmm5, xmm2  ; xmm5 = q0 low words
-	punpcklbw xmm1, xmm2  ; xmm1 = p0 low words
-
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7  ; |q0 - p0|
-	punpcklbw xmm0, xmm2
-	movdqa	 [esp+688-272], xmm4  ; save p0 bytes
-	movdqa	xmm4, [esp+720-272]
-	movdqa	 [esp+640-480], xmm0  ; p1 low words
-
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm0
-
-	movdqa	xmm0, [esp+640-512]
-	pabsw	xmm7, xmm7
-	punpcklbw xmm4, xmm2
-	pcmpgtw	xmm0, xmm7  ; beta > |p0 - p1|
-	movdqa	 [esp+640-384], xmm4  ; q1 low words
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm4
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+656-272], xmm6  ; save p2 bytes
-	punpcklbw xmm6, xmm2
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-48], xmm2  ; zero vector kept on the stack
-	movdqa	 [esp+640-368], xmm6  ; p2 low words
-	movdqa	 [esp+640-144], xmm1  ; p0 low words
-	movdqa	 [esp+640-400], xmm5  ; q0 low words
-	pcmpgtw	xmm4, xmm7  ; beta > |q0 - q1|
-	pand	xmm0, xmm4
-	movdqa	xmm4, [esp+640-320]
-	pcmpgtw	xmm4, [esp+640-560]  ; alpha > |q0 - p0|
-	pand	xmm0, xmm4  ; xmm0 = filter-enable mask for the low 8 pixels (live until the final blends)
-
-	mov	ebx, 2
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, [esp+640-320]
-	psraw	xmm4, 2
-	pshufd	xmm7, xmm7, 0  ; constant 2 in every word
-	paddw	xmm4, xmm7
-	movdqa	 [esp+640-576], xmm4  ; (alpha >> 2) + 2
-	pcmpgtw	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4  ; mask: (alpha >> 2) + 2 > |q0 - p0|
-
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+640-624], xmm7  ; save constant 2
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm6
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7  ; beta > |p0 - p2|
-
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-544], xmm4  ; p-side mask ("pmask"): selects the 3-row filter on p0..p2
-	movdqa	xmm4, [esp+640-512]
-	movdqa	xmm7, xmm5
-	psubw	xmm7, [esp+640-416]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7  ; beta > |q0 - q2|
-
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4  ; q-side mask ("qmask"): selects the 3-row filter on q0..q2
-
-	movdqa	xmm4, [esp+640-544]
-	pandn	xmm4, xmm6
-	movdqa	 [esp+640-16], xmm4  ; ~pmask & p2 (p2 kept where pmask is clear)
-	mov	ebx, 4
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, xmm3
-	punpcklbw xmm4, xmm2
-	psllw	xmm4, 1  ; 2*p3
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6  ; + 3*p2
-	paddw	xmm4, [esp+640-480]  ; + p1
-
-	movdqa	xmm6, [esp+640-560]
-	pshufd	xmm7, xmm7, 0  ; constant 4 in every word
-	paddw	xmm4, xmm1
-	movdqa	 [esp+640-592], xmm7  ; save constant 4
-	paddw	xmm4, xmm5
-	paddw	xmm4, xmm7  ; xmm4 = 2*p3 + 3*p2 + p1 + p0 + q0 + 4 (shifted and masked further down)
-	movdqa	xmm7, [esp+640-416]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-80], xmm6  ; ~qmask & q2
-	movdqa	xmm6, [esp+752-272]
-	punpcklbw xmm6, xmm2
-	psllw	xmm6, 1  ; 2*q3
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7  ; + 3*q2
-	paddw	xmm6, [esp+640-384]  ; + q1
-
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm6, xmm5
-	paddw	xmm6, xmm1
-	paddw	xmm6, [esp+640-592]
-	psraw	xmm6, 3  ; (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-112], xmm6  ; qmask & new q2
-	movdqa	xmm6, [esp+640-544]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-336], xmm6  ; ~pmask & p1
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-528], xmm6
-	movdqa	xmm6, [esp+640-368]
-	paddw	xmm6, xmm7
-	movdqa	xmm7, xmm1
-	psraw	xmm4, 3
-	pand	xmm4, [esp+640-544]  ; pmask & ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3): new p2
-	paddw	xmm7, xmm5
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-
-	paddw	xmm5, xmm1  ; xmm5 = q0 + p0
-	psraw	xmm6, 2  ; (p2 + p1 + p0 + q0 + 2) >> 2
-	pand	xmm7, xmm6
-
-	movdqa	xmm6, [esp+640-384]
-	movdqa	 [esp+640-64], xmm7  ; pmask & new p1
-	movdqa	xmm7, [esp+640-560]
-	pandn	xmm7, xmm6
-	movdqa	 [esp+640-304], xmm7  ; ~qmask & q1
-	movdqa	xmm7, [esp+640-560]
-	movdqa	 [esp+640-528], xmm7
-	movdqa	xmm7, [esp+640-416]
-	paddw	xmm7, xmm6
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2  ; (q2 + q1 + q0 + p0 + 2) >> 2
-	pand	xmm5, xmm7
-	movdqa	 [esp+640-32], xmm5  ; qmask & new q1
-
-	movdqa	xmm5, [esp+640-544]
-	movdqa	 [esp+640-528], xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	xmm7, xmm5
-	paddw	xmm7, xmm5
-	movdqa	xmm5, xmm1
-	paddw	xmm5, xmm6
-	paddw	xmm6, [esp+640-592]  ; xmm6 = q1 + 4
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2  ; (2*p1 + p0 + q1 + 2) >> 2
-	pandn	xmm5, xmm7
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm7, xmm1
-	paddw	xmm7, [esp+640-400]
-	movdqa	xmm1, [esp+640-544]
-	movdqa	 [esp+640-352], xmm5  ; ~pmask & 2-row p0 result
-	movdqa	xmm5, [esp+640-368]
-	psllw	xmm7, 1
-	paddw	xmm7, xmm6
-	paddw	xmm5, xmm7
-
-	movdqa	xmm7, [esp+640-400]
-	psraw	xmm5, 3  ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
-	pand	xmm1, xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	 [esp+640-96], xmm1  ; pmask & 3-row p0 result
-	movdqa	xmm1, [esp+640-560]
-	movdqa	 [esp+640-528], xmm1
-	movdqa	xmm1, [esp+640-384]
-	movdqa	xmm6, xmm1
-	paddw	xmm6, xmm1
-	paddw	xmm1, [esp+640-400]
-	paddw	xmm1, [esp+640-144]
-	paddw	xmm7, xmm5
-	paddw	xmm5, [esp+640-592]
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-	psraw	xmm6, 2  ; (2*q1 + q0 + p1 + 2) >> 2
-	psllw	xmm1, 1
-	paddw	xmm1, xmm5
-
-	movdqa	xmm5, [esp+656-272]
-	pandn	xmm7, xmm6
-	movdqa	xmm6, [esp+640-416]
-	paddw	xmm6, xmm1
-	movdqa	xmm1, [esp+640-560]
-	psraw	xmm6, 3  ; (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3
-	pand	xmm1, xmm6
-
-	movdqa	xmm6, [esp+704-272]
-	movdqa	 [esp+640-128], xmm1  ; qmask & 3-row q0 result
-	movdqa	xmm1, [esp+672-272]
-	punpckhbw xmm1, xmm2  ; from here the high 8 bytes of each row are widened and processed the same way
-	movdqa	 [esp+640-448], xmm1  ; p1 high words
-	movdqa	xmm1, [esp+688-272]
-	punpckhbw xmm1, xmm2
-	punpckhbw xmm6, xmm2
-	movdqa	 [esp+640-288], xmm7  ; ~qmask & 2-row q0 result (low half)
-	punpckhbw xmm5, xmm2  ; xmm5 = p2 high words
-	movdqa	 [esp+640-496], xmm1  ; p0 high words
-	movdqa	 [esp+640-432], xmm6  ; q0 high words
-
-	movdqa	xmm7, [esp+720-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-464], xmm7  ; q1 high words
-
-	movdqa	xmm7, [esp+736-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-528], xmm7  ; q2 high words
-
-	movdqa	xmm7, xmm6
-
-	psubw	xmm6, [esp+640-464]
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7  ; |q0 - p0|, high half (low-half qmask is no longer needed)
-	por	xmm4, [esp+640-16]  ; low half: masked new p2 | kept p2
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm1
-	psubw	xmm7, [esp+640-448]
-
-	movdqa	xmm1, [esp+640-512]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm1, xmm7  ; beta > |p0 - p1|
-	movdqa	xmm7, [esp+640-512]
-	pcmpgtw	xmm7, xmm6  ; beta > |q0 - q1|
-	movdqa	xmm6, [esp+640-320]
-	pand	xmm1, xmm7
-	movdqa	xmm7, [esp+640-560]
-	pcmpgtw	xmm6, xmm7  ; alpha > |q0 - p0|
-	pand	xmm1, xmm6  ; xmm1 = filter-enable mask for the high 8 pixels (live until the final blends)
-
-	movdqa	xmm6, [esp+640-576]
-	pcmpgtw	xmm6, xmm7  ; (alpha >> 2) + 2 > |q0 - p0|
-
-	movdqa	xmm7, [esp+640-496]
-	punpckhbw xmm3, xmm2  ; xmm3 = p3 high words
-	movdqa	 [esp+640-560], xmm6
-	movdqa	xmm6, [esp+640-512]
-	psubw	xmm7, xmm5
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7  ; beta > |p0 - p2|
-
-	pand	xmm6, [esp+640-560]
-	movdqa	xmm7, [esp+640-432]
-	psubw	xmm7, [esp+640-528]
-
-	psllw	xmm3, 1
-	movdqa	 [esp+640-544], xmm6  ; pmask, high half
-	movdqa	xmm6, [esp+640-512]
-
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, [esp+640-448]
-	paddw	xmm3, [esp+640-496]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7  ; beta > |q0 - q2|
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-560], xmm6  ; qmask, high half
-
-	movdqa	xmm6, xmm0
-	pand	xmm6, xmm4
-	movdqa	xmm4, xmm0
-	pandn	xmm4, [esp+640-368]
-	por	xmm6, xmm4  ; low half p2: filtered value where the enable mask is set, original p2 elsewhere
-	movdqa	xmm4, [esp+640-432]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm3, 3  ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3, high half
-	pand	xmm3, xmm2
-	pandn	xmm2, xmm5
-	por	xmm3, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm3
-	movdqa	xmm3, [esp+640-64]
-	por	xmm3, [esp+640-336]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm5
-	por	xmm7, xmm2
-
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-480]
-	por	xmm2, xmm3
-	packuswb xmm6, xmm7  ; new p2 row as 16 bytes (unsigned saturation)
-	movdqa	 [esp+640-336], xmm2  ; low half p1 after blending
-	movdqa	 [esp+656-272], xmm6  ; new p2 row
-	movdqa	xmm6, [esp+640-544]
-	movdqa	xmm2, xmm5
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm3, xmm1
-	movdqa	xmm7, [esp+640-496]
-	paddw	xmm7, xmm4
-	paddw	xmm2, xmm7
-	paddw	xmm2, [esp+640-624]
-	movdqa	xmm7, [esp+640-544]
-	psraw	xmm2, 2  ; (p2 + p1 + p0 + q0 + 2) >> 2, high half
-	pand	xmm6, xmm2
-	movdqa	xmm2, [esp+640-448]
-	pandn	xmm7, xmm2
-	por	xmm6, xmm7
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm1
-	pandn	xmm6, xmm2
-	paddw	xmm2, [esp+640-496]
-	paddw	xmm2, xmm4
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-336]
-	packuswb xmm6, xmm3
-	psllw	xmm2, 1
-	movdqa	 [esp+672-272], xmm6  ; new p1 row
-	movdqa	xmm6, [esp+640-96]
-	por	xmm6, [esp+640-352]
-
-	movdqa	xmm3, xmm0
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm0
-	pandn	xmm6, [esp+640-144]
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-352], xmm3  ; low half p0 after blending
-	movdqa	xmm3, [esp+640-464]
-	paddw	xmm3, [esp+640-592]
-	paddw	xmm2, xmm3
-	movdqa	xmm3, [esp+640-448]
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-496]
-	psraw	xmm5, 3  ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3, high half
-	pand	xmm6, xmm5
-	movdqa	xmm5, [esp+640-464]
-	paddw	xmm2, xmm5
-	paddw	xmm5, [esp+640-432]
-	movdqa	xmm4, xmm3
-	paddw	xmm4, xmm3
-	paddw	xmm4, xmm2
-	paddw	xmm4, [esp+640-624]
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm4, 2  ; (2*p1 + p0 + q1 + 2) >> 2, high half
-	pandn	xmm2, xmm4
-	por	xmm6, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-496]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm6
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-352]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+688-272], xmm2  ; new p0 row
-	movdqa	xmm2, [esp+640-128]
-	por	xmm2, [esp+640-288]
-
-	movdqa	xmm4, xmm0
-	pand	xmm4, xmm2
-	paddw	xmm5, xmm6
-	movdqa	xmm2, xmm0
-	pandn	xmm2, [esp+640-400]
-	por	xmm4, xmm2
-	movdqa	xmm2, [esp+640-528]
-	psllw	xmm5, 1
-	paddw	xmm5, xmm3
-	movdqa	xmm3, [esp+640-560]
-	paddw	xmm2, xmm5
-	psraw	xmm2, 3  ; (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3, high half
-	movdqa	 [esp+640-288], xmm4  ; low half q0 after blending
-	movdqa	xmm4, [esp+640-560]
-	pand	xmm4, xmm2
-	movdqa	xmm2, [esp+640-464]
-	movdqa	xmm5, xmm2
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-432]
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm7, xmm1
-	paddw	xmm5, xmm2
-	paddw	xmm5, [esp+640-624]
-	movdqa	xmm6, [esp+640-560]
-	psraw	xmm5, 2  ; (2*q1 + q0 + p1 + 2) >> 2, high half
-	pandn	xmm3, xmm5
-	por	xmm4, xmm3
-	movdqa	xmm3, [esp+640-32]
-	por	xmm3, [esp+640-304]
-	pand	xmm7, xmm4
-	movdqa	xmm4, [esp+640-432]
-	movdqa	xmm5, [esp+640-464]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm4
-	paddw	xmm4, [esp+640-496]
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-288]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+704-272], xmm2  ; new q0 row
-
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-384]
-	por	xmm2, xmm3
-	movdqa	 [esp+640-304], xmm2  ; low half q1 after blending
-	movdqa	xmm2, [esp+640-528]
-	movdqa	xmm3, xmm2
-	paddw	xmm3, [esp+640-464]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-624]
-	psraw	xmm3, 2  ; (q2 + q1 + q0 + p0 + 2) >> 2, high half
-	pand	xmm6, xmm3
-	movdqa	xmm3, [esp+640-560]
-	movdqa	xmm4, xmm3
-	pandn	xmm4, xmm5
-	por	xmm6, xmm4
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-304]
-	movdqa	xmm4, xmm1
-	pandn	xmm4, xmm5
-	por	xmm7, xmm4
-
-	movdqa	xmm4, xmm0
-	pandn	xmm0, [esp+640-416]
-	packuswb xmm6, xmm7  ; xmm6 = new q1 row
-	movdqa	xmm7, [esp+640-112]
-	por	xmm7, [esp+640-80]
-	pand	xmm4, xmm7
-	por	xmm4, xmm0  ; low half q2 after blending
-	movdqa	xmm0, [esp+752-272]
-	punpckhbw xmm0, [esp+640-48]  ; q3 high words (widened against the stored zero vector)
-	psllw	xmm0, 1
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm5
-	paddw	xmm0, [esp+640-432]
-	paddw	xmm0, [esp+640-496]
-	paddw	xmm0, [esp+640-592]
-	psraw	xmm0, 3  ; (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3, high half
-	pand	xmm0, xmm3
-	movdqa	xmm7, xmm1
-	pandn	xmm3, xmm2
-	por	xmm0, xmm3
-	pand	xmm7, xmm0
-
-	movdqa	xmm0, [esp+656-272]
-	movdqa	 [edx], xmm0  ; store p2 row at pPix - 3*iStride
-
-	movdqa	xmm0, [esp+672-272]
-
-	mov	edx, dword [esp+640-596]  ; edx = q1 row address (pPix + iStride)
-	movdqa	 [esi], xmm0  ; store p1 row at pPix - 2*iStride
-	movdqa	xmm0, [esp+688-272]
-	movdqa	 [edi], xmm0  ; store p0 row at pPix - iStride
-	movdqa	xmm0, [esp+704-272]
-
-	pop	edi  ; no esp-relative operands are used after this point
-	pandn	xmm1, xmm2
-	movdqa	 [eax], xmm0  ; store q0 row at pPix
-	por	xmm7, xmm1
-	pop	esi
-	packuswb xmm4, xmm7  ; xmm4 = new q2 row
-	movdqa	 [edx], xmm6  ; store q1 row
-	movdqa	 [ecx], xmm4  ; store q2 row at pPix + 2*iStride
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
-
-
-;********************************************************************************
-;
-;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
-;
-;********************************************************************************
-; Reads 8 bytes from each of 16 consecutive rows starting at pPixY (row pitch iStride) and writes eight 16-byte vectors to pDst.
-WELS_EXTERN  DeblockLumaTransposeH2V_sse2
-; pDst is written with movdqa, so it must be 16-byte aligned; the source rows are read with movq (no alignment requirement).
-ALIGN  16
-
-DeblockLumaTransposeH2V_sse2:
-    push    ebp
-    push    ebx
-    mov     ebp,   esp  ; two registers were pushed first, so args are at [ebp+0Ch]=pPixY, [ebp+10h]=iStride, [ebp+14h]=pDst
-    and     esp,0FFFFFFF0h
-    sub     esp,   10h  ; one 16-byte aligned scratch slot at [esp]
-
-    mov     eax,   [ebp + 0Ch]  ; eax = pPixY
-    mov     ecx,   [ebp + 10h]  ; ecx = iStride
-    lea     edx,   [eax + ecx * 8]  ; edx = address of row 8
-    lea     ebx,   [ecx*3]  ; ebx = 3*iStride
-
-    movq    xmm0,  [eax]
-    movq    xmm7,  [edx]
-    punpcklqdq   xmm0,  xmm7  ; xmm0 = row 0 (low qword) | row 8 (high qword)
-    movq    xmm1,  [eax + ecx]
-    movq    xmm7,  [edx + ecx]
-    punpcklqdq   xmm1,  xmm7  ; row 1 | row 9
-    movq    xmm2,  [eax + ecx*2]
-    movq    xmm7,  [edx + ecx*2]
-    punpcklqdq   xmm2,  xmm7  ; row 2 | row 10
-    movq    xmm3,  [eax + ebx]
-    movq    xmm7,  [edx + ebx]
-    punpcklqdq   xmm3,  xmm7  ; row 3 | row 11
-
-    lea     eax,   [eax + ecx * 4]  ; advance both row pointers by 4 rows
-    lea     edx,   [edx + ecx * 4]
-    movq    xmm4,  [eax]
-    movq    xmm7,  [edx]
-    punpcklqdq   xmm4,  xmm7  ; row 4 | row 12
-    movq    xmm5,  [eax + ecx]
-    movq    xmm7,  [edx + ecx]
-    punpcklqdq   xmm5,  xmm7  ; row 5 | row 13
-    movq    xmm6,  [eax + ecx*2]
-    movq    xmm7,  [edx + ecx*2]
-    punpcklqdq   xmm6,  xmm7  ; row 6 | row 14
-
-    movdqa  [esp],   xmm0  ; spill xmm0 so it can serve as the temporary for the last pair
-    movq    xmm7,  [eax + ebx]
-    movq    xmm0,  [edx + ebx]
-    punpcklqdq   xmm7,  xmm0  ; row 7 | row 15
-    movdqa  xmm0,   [esp]  ; restore row 0 | row 8
-
-    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]  ; macro defined outside this chunk; presumably transposes the two 8x8 byte blocks using [esp] as scratch -- TODO confirm
-    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
-    mov    eax,   [ebp + 14h]  ; eax = pDst
-    movdqa  [eax],    xmm4  ; registers are stored in the order given by the pOut note above
-    movdqa  [eax + 10h],  xmm2
-    movdqa  [eax + 20h],  xmm3
-    movdqa  [eax + 30h],  xmm7
-    movdqa  [eax + 40h],  xmm5
-    movdqa  [eax + 50h],  xmm1
-    movdqa  [eax + 60h],  xmm6
-    movdqa  [eax + 70h],  xmm0
-
-    mov     esp,   ebp  ; drop the aligned scratch area
-    pop     ebx
-    pop     ebp
-    ret
-
-
-
-;*******************************************************************************************
-;
-;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
-;
-;*******************************************************************************************
-; Reads eight 16-byte vectors from pSrc (movdqa: 16-byte aligned) and writes 8 bytes to each of 16 consecutive rows starting at pPixY (row pitch iStride).
-WELS_EXTERN   DeblockLumaTransposeV2H_sse2
-
-ALIGN  16
-
-DeblockLumaTransposeV2H_sse2:
-    push     ebp
-    mov      ebp,   esp  ; args: [ebp+08h]=pPixY, [ebp+0Ch]=iStride, [ebp+10h]=pSrc
-
-    and     esp,  0FFFFFFF0h
-    sub     esp,   10h  ; one 16-byte aligned scratch slot at [esp]
-
-    mov      eax,   [ebp + 10h]  ; eax = pSrc
-    mov      ecx,   [ebp + 0Ch]  ; ecx = iStride
-    mov      edx,   [ebp + 08h]  ; edx = pPixY
-
-    movdqa   xmm0,  [eax]
-    movdqa   xmm1,  [eax + 10h]
-    movdqa   xmm2,  [eax + 20h]
-    movdqa   xmm3,	[eax + 30h]
-    movdqa   xmm4,	[eax + 40h]
-    movdqa   xmm5,	[eax + 50h]
-    movdqa   xmm6,	[eax + 60h]
-    movdqa   xmm7,	[eax + 70h]
-
-    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]  ; macro defined outside this chunk; presumably transposes the two 8x8 byte blocks using [esp] as scratch -- TODO confirm
-    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
-    lea      eax,   [ecx * 3]  ; eax = 3*iStride
-
-    movq     [edx],  xmm4  ; rows 0-3: low qwords, registers in the order given by the pOut note above
-    movq     [edx + ecx],  xmm2
-    movq     [edx + ecx*2],  xmm3
-    movq     [edx + eax],  xmm7
-
-    lea      edx,   [edx + ecx*4]
-    movq     [edx],  xmm5  ; rows 4-7
-    movq     [edx + ecx],  xmm1
-    movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0
-
-    psrldq    xmm4,   8  ; move each high qword down so movq can store it
-    psrldq    xmm2,   8
-    psrldq    xmm3,   8
-    psrldq    xmm7,   8
-    psrldq    xmm5,   8
-    psrldq    xmm1,   8
-    psrldq    xmm6,   8
-    psrldq    xmm0,   8
-
-    lea       edx,  [edx + ecx*4]
-    movq     [edx],  xmm4  ; rows 8-11
-    movq     [edx + ecx],  xmm2
-    movq     [edx + ecx*2],  xmm3
-    movq     [edx + eax],  xmm7
-
-    lea      edx,   [edx + ecx*4]
-    movq     [edx],  xmm5  ; rows 12-15
-    movq     [edx + ecx],  xmm1
-    movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0
-
-
-    mov      esp,   ebp  ; drop the aligned scratch area
-    pop      ebp
-    ret
\ No newline at end of file
--- a/codec/encoder/core/asm/expand_picture.asm
+++ /dev/null
@@ -1,653 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  expand_picture.asm
-;*
-;*  Abstract
-;*      mmxext/sse for expand_frame
-;*
-;*  History
-;*      09/25/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-;SECTION .rodata pData align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-;%define PADDING_SIZE_ASM 	32 	; PADDING_LENGTH
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-
-
-SECTION .text
-
-WELS_EXTERN ExpandPictureLuma_sse2
-WELS_EXTERN ExpandPictureChromaAlign_sse2	; for chroma alignment
-WELS_EXTERN ExpandPictureChromaUnalign_sse2	; for chroma unalignment
-
-;;;;;;;expanding result;;;;;;;
-
-;aaaa|attttttttttttttttb|bbbb
-;aaaa|attttttttttttttttb|bbbb
-;aaaa|attttttttttttttttb|bbbb
-;aaaa|attttttttttttttttb|bbbb
-;----------------------------
-;aaaa|attttttttttttttttb|bbbb
-;llll|l                r|rrrr
-;llll|l                r|rrrr
-;llll|l                r|rrrr
-;llll|l                r|rrrr
-;llll|l                r|rrrr
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;----------------------------
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-
-%macro mov_line_8x4_mmx		3	; dst, stride, mm?
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+2*%2]
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+2*%2]
-%endmacro
-
-%macro mov_line_end8x4_mmx		3	; dst, stride, mm?
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+2*%2]
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+%2]
-%endmacro
-
-%macro mov_line_16x4_sse2	4	; dst, stride, xmm?, u/a
-	movdq%4 [%1], %3 		; top(bottom)_0
-	movdq%4 [%1+%2], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdq%4 [%1], %3 		; top(bottom)_2
-	movdq%4 [%1+%2], %3		; top(bottom)_3
-	lea %1, [%1+2*%2]
-%endmacro
-
-%macro mov_line_end16x4_sse2	4	; dst, stride, xmm?, u/a
-	movdq%4 [%1], %3 		; top(bottom)_0
-	movdq%4 [%1+%2], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdq%4 [%1], %3 		; top(bottom)_2
-	movdq%4 [%1+%2], %3		; top(bottom)_3
-	lea %1, [%1+%2]
-%endmacro
-
-%macro mov_line_32x4_sse2	3	; dst, stride, xmm?
-	movdqa [%1], %3 		; top(bottom)_0
-	movdqa [%1+16], %3 		; top(bottom)_0
-	movdqa [%1+%2], %3		; top(bottom)_1
-	movdqa [%1+%2+16], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdqa [%1], %3 		; top(bottom)_2
-	movdqa [%1+16], %3 		; top(bottom)_2
-	movdqa [%1+%2], %3		; top(bottom)_3
-	movdqa [%1+%2+16], %3		; top(bottom)_3
-	lea %1, [%1+2*%2]
-%endmacro
-
-%macro mov_line_end32x4_sse2	3	; dst, stride, xmm?
-	movdqa [%1], %3 		; top(bottom)_0
-	movdqa [%1+16], %3 		; top(bottom)_0
-	movdqa [%1+%2], %3		; top(bottom)_1
-	movdqa [%1+%2+16], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdqa [%1], %3 		; top(bottom)_2
-	movdqa [%1+16], %3 		; top(bottom)_2
-	movdqa [%1+%2], %3		; top(bottom)_3
-	movdqa [%1+%2+16], %3		; top(bottom)_3
-	lea %1, [%1+%2]
-%endmacro
-
-%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]
-	; ebx [width/16(8)]
-	; esi [pSrc+0], edi [pSrc-1], ecx [-stride], 32(16)		; top
-	; eax [pSrc+(h-1)*stride], ebp [pSrc+(h+31)*stride], 32(16)	; bottom
-
-%if %1 == 32		; for luma
-	sar ebx, 04h 	; width / 16(8) pixels
-.top_bottom_loops:
-	; top
-	movdqa xmm0, [esi]		; first line of picture pData
-	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_end16x4_sse2 edi, ecx, xmm0, a
-
-	; bottom
-	movdqa xmm1, [eax] 		; last line of picture pData
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-
-	lea esi, [esi+16]		; top pSrc
-	lea edi, [edi+16]		; top dst
-	lea eax, [eax+16]		; bottom pSrc
-	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?
-
-	dec ebx
-	jnz near .top_bottom_loops
-%elif %1 == 16	; for chroma ??
-	mov edx, ebx
-	sar ebx, 04h 	; (width / 16) pixels
-.top_bottom_loops:
-	; top
-	movdqa xmm0, [esi]		; first line of picture pData
-	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_end16x4_sse2 edi, ecx, xmm0, a
-
-	; bottom
-	movdqa xmm1, [eax] 		; last line of picture pData
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-
-	lea esi, [esi+16]		; top pSrc
-	lea edi, [edi+16]		; top dst
-	lea eax, [eax+16]		; bottom pSrc
-	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?
-
-	dec ebx
-	jnz near .top_bottom_loops
-
-	; for remaining 8 bytes
-	and edx, 0fh		; any 8 bytes left?
-	test edx, edx
-	jz near .to_be_continued	; no left to exit here
-
-	; top
-	movq mm0, [esi]		; remained 8 byte
-	mov_line_8x4_mmx edi, ecx, mm0	; dst, stride, mm?
-	mov_line_8x4_mmx edi, ecx, mm0	; dst, stride, mm?
-	mov_line_8x4_mmx edi, ecx, mm0	; dst, stride, mm?
-	mov_line_end8x4_mmx edi, ecx, mm0	; dst, stride, mm?
-	; bottom
-	movq mm1, [eax]
-	mov_line_8x4_mmx ebp, ecx, mm1	; dst, stride, mm?
-	mov_line_8x4_mmx ebp, ecx, mm1	; dst, stride, mm?
-	mov_line_8x4_mmx ebp, ecx, mm1	; dst, stride, mm?
-	mov_line_end8x4_mmx ebp, ecx, mm1	; dst, stride, mm?
-	WELSEMMS
-
-.to_be_continued:
-%endif
-%endmacro
-
-%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
-	; ecx [height]
-	; esi [pSrc+0], 	   edi [pSrc-32], edx [stride], 32(16)	; left
-	; ebx [pSrc+(w-1)], ebp [pSrc+w], 32(16)			; right
-;	xor eax, eax 	; for pixel pData (uint8_t)		; make sure eax=0 at least high 24 bits of eax = 0
-
-%if %1 == 32		; for luma
-.left_right_loops:
-	; left
-	mov al, byte [esi]		; pixel pData for left border
-	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [edi], xmm0
-	movdqa [edi+16], xmm0
-
-	; right
-	mov al, byte [ebx]
-	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [ebp], xmm1
-	movdqa [ebp+16], xmm1
-
-	lea esi, [esi+edx]		; left pSrc
-	lea edi, [edi+edx]		; left dst
-	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst
-
-	dec ecx
-	jnz near .left_right_loops
-%elif %1 == 16	; for chroma ??
-.left_right_loops:
-	; left
-	mov al, byte [esi]		; pixel pData for left border
-	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [edi], xmm0
-
-	; right
-	mov al, byte [ebx]
-	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdq%2 [ebp], xmm1								; might not be aligned 16 bytes in case chroma planes
-
-	lea esi, [esi+edx]		; left pSrc
-	lea edi, [edi+edx]		; left dst
-	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst
-
-	dec ecx
-	jnz near .left_right_loops
-%endif
-%endmacro
-
-%macro exp_cross_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
-	; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
-	; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride
-%if %1 == 32		; luma
-	; TL
-	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
-	mov_line_end32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
-
-	; TR
-	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
-	mov_line_end32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
-
-	; BL
-	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
-	mov_line_end32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
-
-	; BR
-	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
-	mov_line_end32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
-%elif %1 == 16	; chroma
-	; TL
-	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
-	mov_line_end16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
-
-	; TR
-	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
-	mov_line_end16x4_sse2 ebp, ecx, xmm4, %2	; dst, stride, xmm?
-
-	; BL
-	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
-	mov_line_end16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
-
-	; BR
-	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
-	mov_line_end16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
-%endif
-%endmacro
-
-ALIGN 16
-;***********************************************************************----------------
-; void ExpandPictureLuma_sse2(	uint8_t *pDst,
-;									const int32_t iStride,
-;									const int32_t iWidth,
-;									const int32_t iHeight	);
-;***********************************************************************----------------
-ExpandPictureLuma_sse2:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	; for both top and bottom border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	mov esi, [esp+24]						; p_dst
-	mov edx, [esp+28]						; stride
-	mov ebx, [esp+32]						; width
-	mov eax, [esp+36]						; height
-	; also prepare for cross border pData top-left: xmm3
-;	xor ecx, ecx
-	mov cl, byte [esi]
-	butterfly_1to16_sse xmm3, xmm4, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; load top border
-	mov ecx, edx							; stride
-	neg ecx 								; -stride
-	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border
-	dec eax									; h-1
-	imul eax, edx 							; (h-1)*stride
-	lea eax, [esi+eax]						; last line of picture pData
-	sal edx, 05h							; 32*stride
-	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 32 * stride
-	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
-	dec ebx									; width-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]
-;	xor edx, edx
-	mov dl, byte [eax]						; bottom-left
-	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	mov dl, byte [ebx]						; bottom-right
-	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding
-	mov ebx, [esp+32]						; width
-	exp_top_bottom_sse2	32
-
-	; for both left and right border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	mov esi, [esp+24]						; p_dst: left border pSrc
-	mov edx, [esp+28]						; stride
-	mov ebx, [esp+32]						; width
-	mov ecx, [esp+36]						; height
-	; load left border
-	mov eax, -32 							; luma=-32, chroma=-16
-	lea edi, [esi+eax]						; left border dst
-	dec ebx
-	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst
-	; prepare for cross border pData: top-right with xmm4
-;	xor eax, eax
-	mov al, byte [ebx]						; top-right
-	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for left & right border expanding
-	exp_left_right_sse2	32, a
-
-	; for cross border [top-left, top-right, bottom-left, bottom-right]
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	mov esi, [esp+24]						; p_dst
-	mov ecx, [esp+28]						; stride
-	mov ebx, [esp+32]						; width
-	mov edx, [esp+36]						; height
-	; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
-	mov eax, -32							; luma=-32, chroma=-16
-	neg ecx										; -stride
-	lea edi, [esi+eax]
-	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]
-	lea ebp, [ebp+ecx]				; last line of top-right border
-	add edx, 32								; height+32(16), luma=32, chroma=16
-	mov ecx, [esp+28]					; stride
-	imul edx, ecx							; (height+32(16)) * stride
-	lea eax, [edi+edx]						; last line of bottom-left border
-	lea ebx, [ebp+edx]						; last line of bottom-right border
-	neg ecx										; -stride
-	; for left & right border expanding
-	exp_cross_sse2		32, a
-
-;	sfence									; commit cache write back memory
-
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-
-	ret
-
-ALIGN 16
-;***********************************************************************----------------
-; void ExpandPictureChromaAlign_sse2(	uint8_t *pDst,
-;										const int32_t iStride,
-;										const int32_t iWidth,
-;										const int32_t iHeight	);
-;***********************************************************************----------------
-ExpandPictureChromaAlign_sse2:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	; for both top and bottom border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	mov esi, [esp+24]						; p_dst
-	mov edx, [esp+28]						; stride
-	mov ebx, [esp+32]						; width
-	mov eax, [esp+36]						; height
-	; also prepare for cross border pData top-left: xmm3
-;	xor ecx, ecx
-	mov cl, byte [esi]
-	butterfly_1to16_sse xmm3, xmm4, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; load top border
-	mov ecx, edx							; stride
-	neg ecx 								; -stride
-	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border
-	dec eax									; h-1
-	imul eax, edx 							; (h-1)*stride
-	lea eax, [esi+eax]						; last line of picture pData
-	sal edx, 04h							; 16*stride
-	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 16 * stride
-	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
-	dec ebx									; width-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]
-;	xor edx, edx
-	mov dl, byte [eax]						; bottom-left
-	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	mov dl, byte [ebx]						; bottom-right
-	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding
-	mov ebx, [esp+32]						; width
-	exp_top_bottom_sse2	16
-
-	; for both left and right border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	mov esi, [esp+24]						; p_dst: left border pSrc
-	mov edx, [esp+28]						; stride
-	mov ebx, [esp+32]						; width
-	mov ecx, [esp+36]						; height
-	; load left border
-	mov eax, -16 							; luma=-32, chroma=-16
-	lea edi, [esi+eax]						; left border dst
-	dec ebx
-	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst
-	; prepare for cross border pData: top-right with xmm4
-;	xor eax, eax
-	mov al, byte [ebx]						; top-right
-	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for left & right border expanding
-	exp_left_right_sse2	16, a
-
-	; for cross border [top-left, top-right, bottom-left, bottom-right]
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	mov esi, [esp+24]						; p_dst
-	mov ecx, [esp+28]						; stride
-	mov ebx, [esp+32]						; width
-	mov edx, [esp+36]						; height
-	; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
-	mov eax, -16							; chroma=-16
-	neg ecx										; -stride
-	lea edi, [esi+eax]
-	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]
-	lea ebp, [ebp+ecx]				; last line of top-right border
-	mov ecx, [esp+28]						; stride
-	add edx, 16							; height+16, luma=32, chroma=16
-	imul edx, ecx							; (height+16) * stride
-	lea eax, [edi+edx]						; last line of bottom-left border
-	lea ebx, [ebp+edx]						; last line of bottom-right border
-	neg ecx										; -stride
-	; for left & right border expanding
-	exp_cross_sse2		16, a
-
-;	sfence									; commit cache write back memory
-
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-
-	ret
-
-ALIGN 16
-;***********************************************************************----------------
-; void ExpandPictureChromaUnalign_sse2(	uint8_t *pDst,
-;										const int32_t iStride,
-;										const int32_t iWidth,
-;										const int32_t iHeight	);
-;***********************************************************************----------------
-ExpandPictureChromaUnalign_sse2:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	; for both top and bottom border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	mov esi, [esp+24]						; p_dst
-	mov edx, [esp+28]						; stride
-	mov ebx, [esp+32]						; width
-	mov eax, [esp+36]						; height
-	; also prepare for cross border pData top-left: xmm3
-;	xor ecx, ecx
-	mov cl, byte [esi]
-	butterfly_1to16_sse xmm3, xmm4, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; load top border
-	mov ecx, edx							; stride
-	neg ecx 								; -stride
-	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border
-	dec eax									; h-1
-	imul eax, edx 							; (h-1)*stride
-	lea eax, [esi+eax]						; last line of picture pData
-	sal edx, 04h							; 16*stride
-	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 16 * stride
-	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
-	dec ebx									; width-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]
-;	xor edx, edx
-	mov dl, byte [eax]						; bottom-left
-	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	mov dl, byte [ebx]						; bottom-right
-	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding
-	mov ebx, [esp+32]						; width
-	exp_top_bottom_sse2	16
-
-	; for both left and right border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	mov esi, [esp+24]						; p_dst: left border pSrc
-	mov edx, [esp+28]						; stride
-	mov ebx, [esp+32]						; width
-	mov ecx, [esp+36]						; height
-	; load left border
-	mov eax, -16 							; luma=-32, chroma=-16
-	lea edi, [esi+eax]						; left border dst
-	dec ebx
-	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst
-	; prepare for cross border pData: top-right with xmm4
-;	xor eax, eax
-	mov al, byte [ebx]						; top-right
-	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for left & right border expanding
-	exp_left_right_sse2	16, u
-
-	; for cross border [top-left, top-right, bottom-left, bottom-right]
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	mov esi, [esp+24]						; p_dst
-	mov ecx, [esp+28]						; stride
-	mov ebx, [esp+32]						; width
-	mov edx, [esp+36]						; height
-	; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
-	neg ecx									; -stride
-	mov eax, -16							; chroma=-16
-	lea edi, [esi+eax]
-	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]
-	lea ebp, [ebp+ecx]				; last line of top-right border
-	mov ecx, [esp+28]						; stride
-	add edx, 16							; height+16, luma=32, chroma=16
-	imul edx, ecx							; (height+16) * stride
-	lea eax, [edi+edx]						; last line of bottom-left border
-	lea ebx, [ebp+edx]						; last line of bottom-right border
-	neg ecx									; -stride
-	; for left & right border expanding
-	exp_cross_sse2		16, u
-
-;	sfence									; commit cache write back memory
-
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-
-	ret
-
--- a/codec/encoder/core/asm/intra_pred.asm
+++ b/codec/encoder/core/asm/intra_pred.asm
@@ -41,7 +41,6 @@
 ;*************************************************************************/
 %include "asm_inc.asm"
 
-BITS 32
 ;***********************************************************************
 ; Local Data (Read Only)
 ;***********************************************************************
@@ -177,11 +176,11 @@
 %endmacro
 
 %macro LOAD_2_LEFT_AND_ADD 0
-        lea         eax, [eax+2*ecx]
-        movzx		edx, byte [eax-0x01]
-        add			ebx, edx
-        movzx		edx, byte [eax+ecx-0x01]
-        add			ebx, edx
+        lea         r1, [r1+2*r2]	; r1 += 2*r2 (presumably two rows down, r2 = stride -- confirm at call sites)
+        movzx		r4, byte [r1-0x01]	; byte immediately before [r1]; r4 is scratch and is overwritten
+        add			r3, r4	; r3 is the running sum
+        movzx		r4, byte [r1+r2-0x01]	; same column, one r2 step further on
+        add			r3, r4	; r3 has now gained two bytes; r1 stays advanced for the next expansion
 %endmacro
 
 ;***********************************************************************
@@ -201,34 +200,36 @@
 ;	pred must align to 16
 ;***********************************************************************
 WelsI4x4LumaPredH_sse2:
-	mov			eax,	[esp+8]			;pRef
-	mov			ecx,	[esp+12]		;stride
-
-	movzx		edx,	byte [eax-1]
-	movd		xmm0,	edx
+	push r3	; r3 is overwritten as scratch below; saved here and restored before ret
+	%assign push_num 1
+	LOAD_3_PARA	; presumably leaves pred/pRef/stride in r0/r1/r2 (macro defined outside this chunk -- confirm)
+	%ifndef X86_32
+	movsx r2, r2d	; non-X86_32 builds only: sign-extend the 32-bit stride to the full register
+	%endif
+	movzx		r3,	byte [r1-1]	; row 0: byte just left of [r1]
+	movd		xmm0,	r3d	; the pmuludq by [mmx_01bytes] that follows presumably replicates it across the dword -- confirm
 	pmuludq		xmm0,	[mmx_01bytes]
 
-	movzx		edx,	byte [eax+ecx-1]
-	movd		xmm1,	edx
+	movzx		r3,	byte [r1+r2-1]	; row 1: same column, one stride further
+	movd		xmm1,	r3d
 	pmuludq		xmm1,	[mmx_01bytes]
 
 	unpcklps	xmm0,	xmm1
 
-	lea			eax,	[eax+ecx*2]
-	movzx		edx,	byte [eax-1]
-	movd		xmm2,	edx
+	lea			r1,	[r1+r2*2]	; advance r1 by two strides (rows 2 and 3)
+	movzx		r3,	byte [r1-1]	; row 2
+	movd		xmm2,	r3d
 	pmuludq		xmm2,	[mmx_01bytes]
 
-	movzx		edx,	byte [eax+ecx-1]
-	movd		xmm3,	edx
+	movzx		r3,	byte [r1+r2-1]	; row 3
+	movd		xmm3,	r3d
 	pmuludq		xmm3,	[mmx_01bytes]
 
 	unpcklps	xmm2,	xmm3
 	unpcklpd	xmm0,	xmm2
 
-	mov			edx,	[esp+4]			;pred
-	movdqa		[edx],	xmm0
-
+	movdqa		[r0],	xmm0	; aligned 16-byte store of the four packed dwords; r0 must be 16-byte aligned
+	pop r3	; undo the push at entry
 	ret
 
 ;***********************************************************************
@@ -235,20 +236,27 @@
 ; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
 WelsI16x16LumaPredPlane_sse2:
-%define pushsize	4
-		push	esi
-		mov		esi,	[esp + pushsize + 8]
-		mov		ecx,	[esp + pushsize + 12]
-		sub		esi,	1
-		sub		esi,	ecx
+		;%define pushsize	4
+		;push	esi
+		;mov		esi,	[esp + pushsize + 8]
+		;mov		ecx,	[esp + pushsize + 12]
+		push r3
+		push r4
+		%assign push_num 2
+		LOAD_3_PARA
+		%ifndef X86_32
+		movsx r2, r2d
+		%endif
+		sub		r1,	1
+		sub		r1,	r2
 
 		;for H
 		pxor	xmm7,	xmm7
-		movq	xmm0,	[esi]
+		movq	xmm0,	[r1]
 		movdqa	xmm5,	[sse2_plane_dec]
 		punpcklbw xmm0,	xmm7
 		pmullw	xmm0,	xmm5
-		movq	xmm1,	[esi + 9]
+		movq	xmm1,	[r1 + 9]
 		movdqa	xmm6,	[sse2_plane_inc]
 		punpcklbw xmm1,	xmm7
 		pmullw	xmm1,	xmm6
@@ -255,25 +263,25 @@
 		psubw	xmm1,	xmm0
 
 		SUMW_HORIZON	xmm1,xmm0,xmm2
-		movd    eax,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
-		movsx	eax,	ax
-		imul	eax,	5
-		add		eax,	32
-		sar		eax,	6			; b = (5 * H + 32) >> 6;
-		SSE2_Copy8Times	xmm1, eax	; xmm1 = b,b,b,b,b,b,b,b
+		movd    r3d,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
+		movsx	r3,	r3w
+		imul	r3,	5
+		add		r3,	32
+		sar		r3,	6			; b = (5 * H + 32) >> 6;
+		SSE2_Copy8Times	xmm1, r3d	; xmm1 = b,b,b,b,b,b,b,b
 
-		movzx	edx,	BYTE [esi+16]
-		sub	esi, 3
-		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, esi, ecx
+		movzx	r4,	BYTE [r1+16]
+		sub	r1, 3
+		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, r1, r2
 
-		add		esi,	3
-		movzx	eax,	BYTE [esi+8*ecx]
-		add		edx,	eax
-		shl		edx,	4			;	a = (left[15*stride] + top[15]) << 4;
+		add		r1,	3
+		movzx	r3,	BYTE [r1+8*r2]
+		add		r4,	r3
+		shl		r4,	4			;	a = (left[15*stride] + top[15]) << 4;
 
-		sub	esi, 3
-		add		esi,	ecx
-		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, esi, ecx
+		sub	r1, 3
+		add		r1,	r2
+		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, r1, r2
 		pxor	xmm4,	xmm4
 		punpckhbw xmm0,	xmm4
 		pmullw	xmm0,	xmm5
@@ -282,21 +290,20 @@
 		psubw	xmm7,	xmm0
 
 		SUMW_HORIZON   xmm7,xmm0,xmm2
-		movd    eax,   xmm7			; V
-		movsx	eax,	ax
+		movd    r3d,   xmm7			; V
+		movsx	r3,	r3w
+		imul	r3,	5
+		add		r3,	32
+		sar		r3,	6				; c = (5 * V + 32) >> 6;
+		SSE2_Copy8Times	xmm4, r3d		; xmm4 = c,c,c,c,c,c,c,c
 
-		imul	eax,	5
-		add		eax,	32
-		sar		eax,	6				; c = (5 * V + 32) >> 6;
-		SSE2_Copy8Times	xmm4, eax		; xmm4 = c,c,c,c,c,c,c,c
+		;mov		esi,	[esp + pushsize + 4]
+		add		r4,	16
+		imul	r3,	-7
+		add		r3,	r4				; s = a + 16 + (-7)*c
+		SSE2_Copy8Times	xmm0, r3d		; xmm0 = s,s,s,s,s,s,s,s
 
-		mov		esi,	[esp + pushsize + 4]
-		add		edx,	16
-		imul	eax,	-7
-		add		edx,	eax				; s = a + 16 + (-7)*c
-		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s
-
-		xor		eax,	eax
+		xor		r3,	r3
 		movdqa	xmm5,	[sse2_plane_inc_minus]
 
 get_i16x16_luma_pred_plane_sse2_1:
@@ -309,51 +316,56 @@
 		paddw	xmm3,	xmm0
 		psraw	xmm3,	5
 		packuswb xmm2,	xmm3
-		movdqa	[esi],	xmm2
+		movdqa	[r0],	xmm2
 		paddw	xmm0,	xmm4
-		add		esi,	16
-		inc		eax
-		cmp		eax,	16
+		add		r0,	16
+		inc		r3
+		cmp		r3,	16
 		jnz get_i16x16_luma_pred_plane_sse2_1
-
-		pop		esi
+		pop r4
+		pop r3
 		ret
 
-
-
 ;***********************************************************************
 ; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
 
-%macro SSE2_PRED_H_16X16_TWO_LINE 1
-    lea     eax,	[eax+ecx*2]
-
-    COPY_16_TIMES	eax,	xmm0
-    movdqa			[edx+%1],	xmm0
-   COPY_16_TIMESS eax,	xmm0,	ecx
-   ; COPY_16_TIMES	eax + ecx,	xmm0
-    movdqa  [edx+%1+0x10],	xmm0
+%macro SSE2_PRED_H_16X16_ONE_LINE 0
+	add r0, 16	; step the output pointer to the next 16-byte row
+	add r1, r2	; step the source pointer by one r2 (stride)
+	movzx r3, byte [r1]	; one source byte; the caller below has already done dec r1
+	SSE2_Copy16Times xmm0, r3d	; presumably fills all 16 bytes of xmm0 with that value (macro defined outside this chunk -- confirm)
+	movdqa [r0], xmm0	; aligned 16-byte store of the row
 %endmacro
 
 WELS_EXTERN WelsI16x16LumaPredH_sse2
 WelsI16x16LumaPredH_sse2:
-    mov     edx, [esp+4]    ; pred
-    mov     eax, [esp+8]	; pRef
-    mov     ecx, [esp+12]   ; stride
-
-    COPY_16_TIMES eax,	xmm0
-    movdqa  [edx],		xmm0
-    COPY_16_TIMESS eax,	xmm0,	ecx
-    movdqa  [edx+0x10],	xmm0
-
-	SSE2_PRED_H_16X16_TWO_LINE   0x20
-	SSE2_PRED_H_16X16_TWO_LINE   0x40
-	SSE2_PRED_H_16X16_TWO_LINE   0x60
-	SSE2_PRED_H_16X16_TWO_LINE   0x80
-	SSE2_PRED_H_16X16_TWO_LINE   0xa0
-	SSE2_PRED_H_16X16_TWO_LINE   0xc0
-	SSE2_PRED_H_16X16_TWO_LINE   0xe0
-
+	push r3	; r3 is the scratch register for the loaded byte; restored before ret
+	%assign push_num 1
+	LOAD_3_PARA	; presumably leaves pred/pRef/stride in r0/r1/r2 (macro defined outside this chunk -- confirm)
+	%ifndef X86_32
+	movsx r2, r2d	; non-X86_32 builds only: sign-extend the 32-bit stride to the full register
+	%endif
+	dec r1	; r1 -= 1, so [r1] is the byte just left of the current row (assuming r1 = pRef)
+	movzx r3, byte [r1]	; row 0 is handled inline
+	SSE2_Copy16Times xmm0, r3d
+	movdqa [r0], xmm0	; aligned store of row 0; r0 must be 16-byte aligned
+	SSE2_PRED_H_16X16_ONE_LINE	; 15 expansions follow, one per remaining row (1..15)
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	pop r3	; undo the push at entry
     ret
 
 ;***********************************************************************
@@ -361,30 +373,34 @@
 ;***********************************************************************
 WELS_EXTERN WelsI16x16LumaPredV_sse2
 WelsI16x16LumaPredV_sse2:
-    mov     edx, [esp+4]    ; pred
-    mov     eax, [esp+8]	; pRef
-    mov     ecx, [esp+12]   ; stride
+    ; Copies the 16 bytes at [pRef - stride] into 16 consecutive 16-byte rows starting at pred.
+    ; r0 = pred, r1 = pRef, r2 = stride, presumably set up by LOAD_3_PARA (defined outside this chunk -- confirm).
+    ; The 32-bit-only version read them from [esp+4], [esp+8], [esp+12]; nothing is pushed here, hence push_num 0.
+    %assign push_num 0
+    LOAD_3_PARA
+	%ifndef X86_32
+	movsx r2, r2d	; non-X86_32 builds only: sign-extend the 32-bit stride to the full register
+	%endif
+    sub     r1, r2	; step back one stride
+    movdqa  xmm0, [r1]	; aligned 16-byte load; the address must be 16-byte aligned
 
-    sub     eax, ecx
-    movdqa  xmm0, [eax]
+    movdqa  [r0], xmm0	; 16 aligned stores follow, at offsets 0..240 in steps of 16
+    movdqa  [r0+10h], xmm0
+    movdqa  [r0+20h], xmm0
+    movdqa  [r0+30h], xmm0
+    movdqa  [r0+40h], xmm0
+    movdqa  [r0+50h], xmm0
+    movdqa  [r0+60h], xmm0
+    movdqa  [r0+70h], xmm0
+    movdqa  [r0+80h], xmm0
+    movdqa  [r0+90h], xmm0
+    movdqa  [r0+160], xmm0	; offsets switch from hex to decimal here: 160 = 0A0h, continuing the same sequence
+    movdqa  [r0+176], xmm0
+    movdqa  [r0+192], xmm0
+    movdqa  [r0+208], xmm0
+    movdqa  [r0+224], xmm0
+    movdqa  [r0+240], xmm0	; 240 = 0F0h, the sixteenth and last row
 
-    movdqa  [edx], xmm0
-    movdqa  [edx+10h], xmm0
-    movdqa  [edx+20h], xmm0
-    movdqa  [edx+30h], xmm0
-    movdqa  [edx+40h], xmm0
-    movdqa  [edx+50h], xmm0
-    movdqa  [edx+60h], xmm0
-    movdqa  [edx+70h], xmm0
-    movdqa  [edx+80h], xmm0
-    movdqa  [edx+90h], xmm0
-    movdqa  [edx+160], xmm0
-	movdqa  [edx+176], xmm0
-    movdqa  [edx+192], xmm0
-    movdqa  [edx+208], xmm0
-    movdqa  [edx+224], xmm0
-    movdqa  [edx+240], xmm0
-
     ret
 
 ;***********************************************************************
@@ -392,19 +408,26 @@
 ;***********************************************************************
 WELS_EXTERN WelsIChromaPredPlane_sse2
 WelsIChromaPredPlane_sse2:
-%define pushsize	4
-		push	esi
-		mov		esi,	[esp + pushsize + 8]	;pRef
-		mov		ecx,	[esp + pushsize + 12]	;stride
-		sub		esi,	1
-		sub		esi,	ecx
+		; Arguments are fetched by LOAD_3_PARA instead of fixed [esp+N] offsets:
+		;   r0 = pred, r1 = pRef, r2 = stride
+		; r3 and r4 are used as scratch registers and are saved/restored here;
+		; push_num is set to the number of pushes that precede LOAD_3_PARA.
+		push r3				; save r3 (scratch)
+		push r4				; save r4 (scratch)
+		%assign push_num 2
+		LOAD_3_PARA
+		%ifndef X86_32
+		movsx r2, r2d		; sign-extend the 32-bit stride to the full register width
+		%endif
+		sub		r1,	1
+		sub		r1,	r2		; r1 = pRef - 1 - stride
 
 		pxor	mm7,	mm7
-		movq	mm0,	[esi]
+		movq	mm0,	[r1]		; 8 bytes starting at pRef - 1 - stride
 		movq	mm5,	[sse2_plane_dec_c]
 		punpcklbw mm0,	mm7
 		pmullw	mm0,	mm5
-		movq	mm1,	[esi + 5]
+		movq	mm1,	[r1 + 5]	; 8 bytes starting at pRef + 4 - stride
 		movq	mm6,	[sse2_plane_inc_c]
 		punpcklbw mm1,	mm7
 		pmullw	mm1,	mm6
@@ -413,25 +436,25 @@
 		movq2dq xmm1,   mm1
 		pxor    xmm2,   xmm2
 		SUMW_HORIZON	xmm1,xmm0,xmm2
-		movd    eax,	xmm1
-		movsx	eax,	ax
-		imul	eax,	17
-		add		eax,	16
-		sar		eax,	5			; b = (17 * H + 16) >> 5;
-		SSE2_Copy8Times	xmm1, eax	; mm1 = b,b,b,b,b,b,b,b
+		movd    r3d,	xmm1		; r3d = low dword of xmm1
+		movsx	r3,	r3w			; H = sign-extended low 16 bits
+		imul	r3,	17
+		add		r3,	16
+		sar		r3,	5			; b = (17 * H + 16) >> 5;
+		SSE2_Copy8Times	xmm1, r3d	; xmm1 = b,b,b,b,b,b,b,b
 
-		movzx	edx,	BYTE [esi+8]
-		sub	esi, 3
-		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, esi, ecx
+		movzx	r3,	BYTE [r1+8]	; r3 = top[7] (byte at pRef + 7 - stride)
+		sub	r1, 3
+		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, r1, r2
 
-		add		esi,	3
-		movzx	eax,	BYTE [esi+4*ecx]
-		add		edx,	eax
-		shl		edx,	4			; a = (left[7*stride] + top[7]) << 4;
+		add		r1,	3
+		movzx	r4,	BYTE [r1+4*r2]	; r4 = left[7*stride] (per the formula below)
+		add		r4,	r3			; r4 = left[7*stride] + top[7]
+		shl		r4,	4			; a = (left[7*stride] + top[7]) << 4;
 
-		sub	esi, 3
-		add		esi,	ecx
-		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, esi, ecx
+		sub	r1, 3
+		add		r1,	r2
+		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, r1, r2
 		pxor	mm4,	mm4
 		punpckhbw mm0,	mm4
 		pmullw	mm0,	mm5
@@ -442,21 +465,20 @@
 		movq2dq xmm7,   mm7
 		pxor    xmm2,   xmm2
 		SUMW_HORIZON	xmm7,xmm0,xmm2
-		movd    eax,    xmm7			; V
-		movsx	eax,	ax
+		movd    r3d,    xmm7			; V
+		movsx	r3,	r3w				; sign-extend the low 16 bits of V
+		imul	r3,	17
+		add		r3,	16
+		sar		r3,	5				; c = (17 * V + 16) >> 5;
+		SSE2_Copy8Times	xmm4, r3d	; xmm4 = c,c,c,c,c,c,c,c
 
-		imul	eax,	17
-		add		eax,	16
-		sar		eax,	5				; c = (17 * V + 16) >> 5;
-		SSE2_Copy8Times	xmm4, eax		; mm4 = c,c,c,c,c,c,c,c
+		; pred is already in r0 (loaded by LOAD_3_PARA), so no reload from the stack is needed
+		add		r4,	16			; r4 = a + 16
+		imul	r3,	-3			; r3 = (-3)*c
+		add		r3,	r4		; s = a + 16 + (-3)*c
+		SSE2_Copy8Times	xmm0, r3d	; xmm0 = s,s,s,s,s,s,s,s
 
-		mov		esi,	[esp + pushsize + 4]
-		add		edx,	16
-		imul	eax,	-3
-		add		edx,	eax				; s = a + 16 + (-3)*c
-		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s
-
-		xor		eax,	eax
+		xor		r3,	r3			; r3 = loop counter, counts up to 8
 		movdqa	xmm5,	[sse2_plane_mul_b_c]
 
 get_i_chroma_pred_plane_sse2_1:
@@ -465,14 +487,14 @@
 		paddw	xmm2,	xmm0
 		psraw	xmm2,	5
 		packuswb xmm2,	xmm2
-		movq	[esi],	xmm2
+		movq	[r0],	xmm2		; store 8 bytes to pred
 		paddw	xmm0,	xmm4
-		add		esi,	8
-		inc		eax
-		cmp		eax,	8
+		add		r0,	8			; advance pred by 8 bytes
+		inc		r3
+		cmp		r3,	8			; loop until 8 iterations are done
 		jnz get_i_chroma_pred_plane_sse2_1
-
-		pop		esi
+		pop r4					; restore r4
+		pop r3					; restore r3
 		WELSEMMS
 		ret
 
@@ -490,27 +512,31 @@
 ;
 ;***********************************************************************
 WelsI4x4LumaPredDDR_mmx:
-	mov			edx,[esp+4]			;pred
-	mov         eax,[esp+8]			;pRef
-	mov			ecx,[esp+12]		;stride
-
-	movq        mm1,[eax+ecx-8]		;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
-	movq        mm2,[eax-8]			;get value of 6 mm2[8] = 6
-	sub			eax, ecx			;mov eax to above line of current block(postion of 1)
-	punpckhbw   mm2,[eax-8]			;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
-	movd        mm3,[eax]			;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
+	; Arguments are fetched by LOAD_3_PARA instead of fixed [esp+N] offsets:
+	;   r0 = pred, r1 = pRef, r2 = stride
+	; Nothing is pushed before LOAD_3_PARA, hence push_num is 0.
+	%assign push_num 0
+	LOAD_3_PARA
+	%ifndef X86_32
+	movsx r2, r2d					;sign-extend the 32-bit stride to the full register width
+	%endif
+	movq        mm1,[r1+r2-8]		;get value of 11 (the -8 offset is intended to improve movq performance), mm1[8] = 11
+	movq        mm2,[r1-8]			;get value of 6, mm2[8] = 6
+	sub		r1, r2			;move r1 to the line above the current block (position of 1)
+	punpckhbw   mm2,[r1-8]			;mm2[8] (highest byte of mm2) = [0] (value of 0), mm2[7] = [6]
+	movd        mm3,[r1]			;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
 	punpckhwd   mm1,mm2				;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
 	psllq       mm3,18h				;mm3[5]=[1]
 	psrlq       mm1,28h				;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
 	por         mm3,mm1				;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
 	movq        mm1,mm3				;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
-	lea			eax,[eax+ecx*2-8h]		;set eax point to 12
-	movq        mm4,[eax+ecx]		;get value of 16, mm4[8]=[16]
+	lea  	    r1,[r1+r2*2-8h]		;set r1 to point to 12
+	movq        mm4,[r1+r2]		;get value of 16, mm4[8]=[16]
 	psllq       mm3,8				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
 	psrlq       mm4,38h				;mm4[1]=[16]
 	por         mm3,mm4				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
 	movq        mm2,mm3				;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
-	movq        mm4,[eax+ecx*2]		;mm4[8]=[21]
+	movq        mm4,[r1+r2*2]		;mm4[8]=[21]
 	psllq       mm3,8				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
 	psrlq       mm4,38h				;mm4[1]=[21]
 	por         mm3,mm4				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
@@ -521,13 +547,13 @@
 	psubusb     mm3,mm1				;decrease 1 from odd bytes
 	pavgb       mm2,mm3				;mm2=(([11]+[21]+1)/2+1+[16])/2
 
-	movd        [edx+12],mm2
+	movd        [r0+12],mm2			;store low 4 bytes of mm2 at pred+12
 	psrlq       mm2,8
-	movd        [edx+8],mm2
+	movd        [r0+8],mm2			;after shifting right by one byte, store at pred+8
 	psrlq       mm2,8
-	movd        [edx+4],mm2
+	movd        [r0+4],mm2			;after another one-byte shift, store at pred+4
 	psrlq       mm2,8
-	movd        [edx],mm2
+	movd        [r0],mm2			;after another one-byte shift, store at pred+0
 	WELSEMMS
 	ret
 
@@ -545,39 +571,39 @@
 ;
 ;***********************************************************************
 WelsI4x4LumaPredDc_sse2:
-	mov         eax,[esp+8]			;pRef
-	mov			ecx,[esp+12]		;stride
-	push		ebx
-
-	movzx		edx,	byte [eax-1h]
-
-	sub			eax,	ecx
-	movd		xmm0,	[eax]
+	push r3						; save r3 (accumulates the sum)
+	push r4						; save r4 (temporary for each byte loaded)
+	%assign push_num 2
+	LOAD_3_PARA					; r0 = pred, r1 = pRef, r2 = stride
+	%ifndef X86_32
+	movsx r2, r2d				; sign-extend the 32-bit stride to the full register width
+	%endif
+	movzx		r4,	byte [r1-1h]	; r4 = pRef[-1]
+	sub			r1,	r2			; r1 = pRef - stride
+	movd		xmm0,	[r1]		; 4 bytes at pRef - stride
 	pxor		xmm1,	xmm1
 	psadbw		xmm0,	xmm1
+	xor r3, r3					; clear the whole of r3 before the 32-bit movd below
+	movd		r3d,	xmm0		; r3 = sum of the 4 bytes loaded above (psadbw against zero)
+	add			r3,	r4			; + pRef[-1]
+	movzx		r4,	byte [r1+r2*2-1h]	; r4 = pRef[stride - 1]
+	add			r3,	r4
 
-	movd		ebx,	xmm0
-	add			ebx,	edx
+	lea			r1,	[r1+r2*2-1]	; r1 = pRef + stride - 1
+	movzx		r4,	byte [r1+r2]	; r4 = pRef[2*stride - 1]
+	add			r3,	r4
 
-	movzx		edx,	byte [eax+ecx*2-1h]
-	add			ebx,	edx
+	movzx		r4,	byte [r1+r2*2]	; r4 = pRef[3*stride - 1]
+	add			r3,	r4			; r3 = sum of all 8 loaded bytes
+	add			r3,	4			; rounding term
+	sar			r3,	3			; r3 = (sum + 4) >> 3
+	imul		r3,	0x01010101	; replicate the low byte into all 4 bytes of the dword
 
-	lea			eax,	[eax+ecx*2-1]
-	movzx		edx,	byte [eax+ecx]
-	add			ebx,	edx
-
-	movzx		edx,	byte [eax+ecx*2]
-	add			ebx,	edx
-	add			ebx,	4
-	sar			ebx,	3
-	imul		ebx,	0x01010101
-
-	mov			edx,	[esp+8]			;pred
-	movd		xmm0,	ebx
+	movd		xmm0,	r3d
 	pshufd		xmm0,	xmm0,	0
-	movdqa		[edx],	xmm0
-
-	pop ebx
+	movdqa		[r0],	xmm0		; write 16 bytes to pred
+	pop r4						; restore r4
+	pop r3						; restore r3
 	ret
 
 ALIGN 16
@@ -596,7 +622,7 @@
 %endmacro
 
 %macro MMX_PRED_H_8X8_ONE_LINEE 4
-	movq		%1,		[%3+ecx-8]
+	movq		%1,		[%3+r2-8]
 	psrlq		%1,		38h
 
 	;pmuludq		%1,		[mmx_01bytes]		;extend to 4 bytes
@@ -607,34 +633,38 @@
 
 WELS_EXTERN WelsIChromaPredH_mmx
 WelsIChromaPredH_mmx:
-	mov			edx,	[esp+4]			;pred
-	mov         eax,	[esp+8]			;pRef
-	mov			ecx,	[esp+12]		;stride
-
-	movq		mm0,	[eax-8]
+	; Arguments are fetched by LOAD_3_PARA instead of fixed [esp+N] offsets:
+	;   r0 = pred, r1 = pRef, r2 = stride
+	; Nothing is pushed before LOAD_3_PARA, hence push_num is 0.
+	%assign push_num 0
+	LOAD_3_PARA
+	%ifndef X86_32
+	movsx r2, r2d				; sign-extend the 32-bit stride to the full register width
+	%endif
+	movq		mm0,	[r1-8]		; 8 bytes ending at pRef[-1]
 	psrlq		mm0,	38h
 
 	;pmuludq		mm0,	[mmx_01bytes]		;extend to 4 bytes
 	pmullw		mm0,		[mmx_01bytes]
 	pshufw		mm0,	mm0,	0
-	movq		[edx],	mm0
+	movq		[r0],	mm0			; store 8 bytes at pred+0
 
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+8
+	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+8
 
-	lea			eax,[eax+ecx*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax,edx+16
+	lea			r1,[r1+r2*2]		; advance r1 by two strides
+	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r1,r0+16
 
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+24
+	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+24
 
-	lea			eax,[eax+ecx*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax,edx+32
+	lea			r1,[r1+r2*2]		; advance r1 by two strides
+	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r1,r0+32
 
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+40
+	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+40
 
-	lea			eax,[eax+ecx*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax,edx+48
+	lea			r1,[r1+r2*2]		; advance r1 by two strides
+	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r1,r0+48
 
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+56
+	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+56
 	WELSEMMS
 	ret
 
@@ -645,14 +675,15 @@
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredV_sse2
 WelsI4x4LumaPredV_sse2:
-	mov			edx,	[esp+4]			;pred
-	mov         eax,	[esp+8]			;pRef
-	mov			ecx,	[esp+12]		;stride
-
-	sub			eax,	ecx
-	movd		xmm0,	[eax]
+	%assign push_num 0
+	LOAD_3_PARA					; r0 = pred, r1 = pRef, r2 = stride
+	%ifndef X86_32
+	movsx r2, r2d				; sign-extend the 32-bit stride to the full register width
+	%endif
+	sub			r1,	r2			; r1 = pRef - stride
+	movd		xmm0,	[r1]		; load 4 bytes from pRef - stride
 	pshufd		xmm0,	xmm0,	0
-	movdqa		[edx],	xmm0
+	movdqa		[r0],	xmm0		; write the replicated dword (16 bytes) to pred
 	ret
 
 ALIGN 16
@@ -662,22 +693,21 @@
 ;***********************************************************************
 WELS_EXTERN WelsIChromaPredV_sse2
 WelsIChromaPredV_sse2:
-	mov			edx,		[esp+4]			;pred
-	mov         eax,		[esp+8]			;pRef
-	mov			ecx,		[esp+12]		;stride
-
-	sub			eax,		ecx
-	movq		xmm0,		[eax]
+	%assign push_num 0
+	LOAD_3_PARA						; r0 = pred, r1 = pRef, r2 = stride
+	%ifndef X86_32
+	movsx r2, r2d					; sign-extend the 32-bit stride to the full register width
+	%endif
+	sub		r1,		r2			; r1 = pRef - stride
+	movq		xmm0,		[r1]		; load 8 bytes from pRef - stride
 	movdqa		xmm1,		xmm0
 	punpcklqdq	xmm0,		xmm1
-
-	movdqa		[edx],		xmm0
-	movdqa		[edx+16],	xmm0
-	movdqa		[edx+32],	xmm0
-	movdqa		[edx+48],	xmm0
+	movdqa		[r0],		xmm0		; the same 16 bytes are written four times (64 bytes of pred)
+	movdqa		[r0+16],	xmm0
+	movdqa		[r0+32],	xmm0
+	movdqa		[r0+48],	xmm0
 	ret
 
-
 	ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -710,18 +740,20 @@
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredHD_mmx
 WelsI4x4LumaPredHD_mmx:
-	mov			edx, [esp+4]			; pred
-	mov         eax, [esp+8]			; pRef
-	mov			ecx, [esp+12]           ; stride
-	sub         eax, ecx
-	movd        mm0, [eax-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
+	%assign push_num 0
+	LOAD_3_PARA                         ; r0 = pred, r1 = pRef, r2 = stride
+	%ifndef X86_32
+	movsx r2, r2d                       ; sign-extend the 32-bit stride to the full register width
+	%endif
+	sub         r1, r2                  ; r1 = pRef - stride
+	movd        mm0, [r1-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
 	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
 
-	movd        mm1, [eax+2*ecx-4]
-	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1
-	lea         eax, [eax+2*ecx]
-	movd        mm2, [eax+2*ecx-4]
-	punpcklbw   mm2, [eax+ecx-4]        ; mm2[7] = l2, mm2[6] = l3
+	movd        mm1, [r1+2*r2-4]        ; 4 bytes ending at pRef[stride - 1]
+	punpcklbw   mm1, [r1+r2-4]        ; mm1[7] = l0, mm1[6] = l1
+	lea         r1, [r1+2*r2]           ; r1 = pRef + stride
+	movd        mm2, [r1+2*r2-4]        ; 4 bytes ending at pRef[3*stride - 1]
+	punpcklbw   mm2, [r1+r2-4]        ; mm2[7] = l2, mm2[6] = l3
 	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
 	psrlq       mm2, 20h
 	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
@@ -751,17 +783,15 @@
 	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
 	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
 
-	movd        [edx], mm2
-	movd        [edx+12], mm3
+	movd        [r0], mm2               ; low 4 bytes of mm2 -> pred+0
+	movd        [r0+12], mm3            ; low 4 bytes of mm3 -> pred+12
 	psrlq       mm3, 10h
-	movd        [edx+8], mm3
+	movd        [r0+8], mm3             ; mm3 shifted right by 2 bytes -> pred+8
 	psrlq       mm3, 10h
-	movd        [edx+4], mm3
+	movd        [r0+4], mm3             ; mm3 shifted right by 2 more bytes -> pred+4
 	WELSEMMS
 	ret
 
-
-
 ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -791,15 +821,16 @@
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredHU_mmx
 WelsI4x4LumaPredHU_mmx:
-	mov			edx, [esp+4]			; pred
-	mov         eax, [esp+8]			; pRef
-	mov			ecx, [esp+12]           ; stride
-
-	movd        mm0, [eax-4]            ; mm0[3] = l0
-	punpcklbw   mm0, [eax+ecx-4]        ; mm0[7] = l1, mm0[6] = l0
-	lea         eax, [eax+2*ecx]
-	movd        mm2, [eax-4]            ; mm2[3] = l2
-	movd        mm4, [eax+ecx-4]        ; mm4[3] = l3
+	%assign push_num 0
+	LOAD_3_PARA                         ; r0 = pred, r1 = pRef, r2 = stride
+	%ifndef X86_32
+	movsx r2, r2d                       ; sign-extend the 32-bit stride to the full register width
+	%endif
+	movd        mm0, [r1-4]            ; mm0[3] = l0
+	punpcklbw   mm0, [r1+r2-4]        ; mm0[7] = l1, mm0[6] = l0
+	lea         r1, [r1+2*r2]           ; r1 = pRef + 2*stride
+	movd        mm2, [r1-4]            ; mm2[3] = l2
+	movd        mm4, [r1+r2-4]        ; mm4[3] = l3
 	punpcklbw   mm2, mm4
 	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
 
@@ -832,13 +863,13 @@
 	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
 
 	psrlq       mm4, 20h
-	movd        [edx+12], mm4
+	movd        [r0+12], mm4            ; low 4 bytes of mm4 -> pred+12
 
-	movd        [edx], mm1
+	movd        [r0], mm1               ; low 4 bytes of mm1 -> pred+0
 	psrlq       mm1, 10h
-	movd        [edx+4], mm1
+	movd        [r0+4], mm1             ; mm1 shifted right by 2 bytes -> pred+4
 	psrlq       mm1, 10h
-	movd        [edx+8], mm1
+	movd        [r0+8], mm1             ; mm1 shifted right by 2 more bytes -> pred+8
 	WELSEMMS
 	ret
 
@@ -875,17 +906,19 @@
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredVR_mmx
 WelsI4x4LumaPredVR_mmx:
-	mov			edx, [esp+4]			; pred
-	mov         eax, [esp+8]			; pRef
-	mov			ecx, [esp+12]           ; stride
-	sub         eax, ecx
-	movq        mm0, [eax-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
+	%assign push_num 0
+	LOAD_3_PARA                         ; r0 = pred, r1 = pRef, r2 = stride
+	%ifndef X86_32
+	movsx r2, r2d                       ; sign-extend the 32-bit stride to the full register width
+	%endif
+	sub         r1, r2                  ; r1 = pRef - stride
+	movq        mm0, [r1-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
 	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
 
-	movd        mm1, [eax+2*ecx-4]
-	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1
-	lea         eax, [eax+2*ecx]
-	movq        mm2, [eax+ecx-8]        ; mm2[7] = l2
+	movd        mm1, [r1+2*r2-4]        ; 4 bytes ending at pRef[stride - 1]
+	punpcklbw   mm1, [r1+r2-4]        ; mm1[7] = l0, mm1[6] = l1
+	lea         r1, [r1+2*r2]           ; r1 = pRef + stride
+	movq        mm2, [r1+r2-8]        ; mm2[7] = l2
 	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
 	psrlq       mm2, 28h
 	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
@@ -909,10 +942,10 @@
 	movq        mm2, mm3
 
 	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
-	movd        [edx], mm1
+	movd        [r0], mm1               ; low 4 bytes of mm1 -> pred+0
 
 	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
-	movd        [edx+4], mm2
+	movd        [r0+4], mm2             ; low 4 bytes of mm2 -> pred+4
 
 	movq        mm4, mm3
 	psllq       mm4, 20h
@@ -924,11 +957,11 @@
 
 	psllq       mm1, 8h
 	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
-	movd        [edx+8], mm4
+	movd        [r0+8], mm4             ; low 4 bytes of mm4 -> pred+8
 
 	psllq       mm2, 8h
 	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
-	movd        [edx+12], mm5
+	movd        [r0+12], mm5            ; low 4 bytes of mm5 -> pred+12
 	WELSEMMS
 	ret
 
@@ -961,11 +994,13 @@
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredDDL_mmx
 WelsI4x4LumaPredDDL_mmx:
-	mov			edx, [esp+4]			; pred
-	mov         eax, [esp+8]			; pRef
-	mov			ecx, [esp+12]           ; stride
-	sub         eax, ecx
-	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+	%assign push_num 0
+	LOAD_3_PARA                         ; r0 = pred, r1 = pRef, r2 = stride
+	%ifndef X86_32
+	movsx r2, r2d                       ; sign-extend the 32-bit stride to the full register width
+	%endif
+	sub         r1, r2                  ; r1 = pRef - stride
+	movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
 	movq        mm1, mm0
 	movq        mm2, mm0
 
@@ -986,13 +1021,13 @@
 	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
 
 	psrlq       mm0, 8h
-	movd        [edx], mm0
+	movd        [r0], mm0               ; low 4 bytes of mm0 -> pred+0
 	psrlq       mm0, 8h
-	movd        [edx+4], mm0
+	movd        [r0+4], mm0             ; shifted right by one more byte -> pred+4
 	psrlq       mm0, 8h
-	movd        [edx+8], mm0
+	movd        [r0+8], mm0             ; shifted right by one more byte -> pred+8
 	psrlq       mm0, 8h
-	movd        [edx+12], mm0
+	movd        [r0+12], mm0            ; shifted right by one more byte -> pred+12
 	WELSEMMS
 	ret
 
@@ -1029,12 +1064,13 @@
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredVL_mmx
 WelsI4x4LumaPredVL_mmx:
-	mov			edx, [esp+4]			; pred
-	mov         eax, [esp+8]			; pRef
-	mov			ecx, [esp+12]           ; stride
-
-	sub         eax, ecx
-	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+	%assign push_num 0
+	LOAD_3_PARA                         ; r0 = pred, r1 = pRef, r2 = stride
+	%ifndef X86_32
+	movsx r2, r2d                       ; sign-extend the 32-bit stride to the full register width
+	%endif
+	sub         r1, r2                  ; r1 = pRef - stride
+	movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
 	movq        mm1, mm0
 	movq        mm2, mm0
 
@@ -1052,13 +1088,13 @@
 
 	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
 
-	movd        [edx], mm3
+	movd        [r0], mm3               ; low 4 bytes of mm3 -> pred+0
 	psrlq       mm3, 8h
-	movd        [edx+8], mm3
+	movd        [r0+8], mm3             ; mm3 shifted right by one byte -> pred+8
 
-	movd        [edx+4], mm2
+	movd        [r0+4], mm2             ; low 4 bytes of mm2 -> pred+4
 	psrlq       mm2, 8h
-	movd        [edx+12], mm2
+	movd        [r0+12], mm2            ; mm2 shifted right by one byte -> pred+12
 	WELSEMMS
 	ret
 
@@ -1069,41 +1105,38 @@
 ;***********************************************************************
 WELS_EXTERN WelsIChromaPredDc_sse2
 WelsIChromaPredDc_sse2:
-	push        ebx
-	mov         eax, [esp+12]			; pRef
-	mov			ecx, [esp+16]           ; stride
+	push r3                             ; save r3 (accumulates the left-column sums)
+	push r4                             ; save r4 (temporary for each byte loaded)
+	%assign push_num 2
+	LOAD_3_PARA                         ; r0 = pred, r1 = pRef, r2 = stride
+	%ifndef X86_32
+	movsx r2, r2d                       ; sign-extend the 32-bit stride to the full register width
+	%endif
+	sub         r1, r2                  ; r1 = pRef - stride
+	movq        mm0, [r1]               ; load 8 bytes from pRef - stride
 
-	sub         eax, ecx
-	movq        mm0, [eax]
+	movzx		r3, byte [r1+r2-0x01] ; l1
+	lea         	r1, [r1+2*r2]         ; advance r1 by two strides
+	movzx		r4, byte [r1-0x01]     ; l2
+	add		r3, r4
+	movzx		r4, byte [r1+r2-0x01] ; l3
+	add		r3, r4
+	lea         	r1, [r1+2*r2]         ; advance r1 by two strides
+	movzx		r4, byte [r1-0x01]     ; l4
+	add		r3, r4
+	movd        	mm1, r3d                 ; mm1 = l1+l2+l3+l4
 
-	;xor         ebx, ebx
-	;movzx		edx, byte [eax+ecx-0x01] ; l1
-	movzx		ebx, byte [eax+ecx-0x01] ; l1
-	;mov			ebx, edx
-	lea         eax, [eax+2*ecx]
-	movzx		edx, byte [eax-0x01]     ; l2
-	add			ebx, edx
-	movzx		edx, byte [eax+ecx-0x01] ; l3
-	add			ebx, edx
-	lea         eax, [eax+2*ecx]
-	movzx		edx, byte [eax-0x01]     ; l4
-	add			ebx, edx
-	movd        mm1, ebx                 ; mm1 = l1+l2+l3+l4
+	movzx		r3, byte [r1+r2-0x01] ; l5
+	lea         	r1, [r1+2*r2]         ; advance r1 by two strides
+	movzx		r4, byte [r1-0x01]     ; l6
+	add		r3, r4
+	movzx		r4, byte [r1+r2-0x01] ; l7
+	add		r3, r4
+	lea         	r1, [r1+2*r2]         ; advance r1 by two strides
+	movzx		r4, byte [r1-0x01]     ; l8
+	add		r3, r4
+	movd        	mm2, r3d                 ; mm2 = l5+l6+l7+l8
 
-	;xor         ebx, ebx
-	;movzx		edx, byte [eax+ecx-0x01] ; l5
-	movzx		ebx, byte [eax+ecx-0x01] ; l5
-	;mov		ebx, edx
-	lea         eax, [eax+2*ecx]
-	movzx		edx, byte [eax-0x01]     ; l6
-	add			ebx, edx
-	movzx		edx, byte [eax+ecx-0x01] ; l7
-	add			ebx, edx
-	lea         eax, [eax+2*ecx]
-	movzx		edx, byte [eax-0x01]     ; l8
-	add			ebx, edx
-	movd        mm2, ebx                 ; mm2 = l5+l6+l7+l8
-
 	movq        mm3, mm0
 	psrlq       mm0, 0x20
 	psllq       mm3, 0x20
@@ -1142,19 +1175,18 @@
 	psllq       mm1, 0x20
 	pxor        mm1, mm2                 ; mm2 = m_down
 
-	mov         edx, [esp+8]			 ; pRef
+	movq        [r0], mm0                ; first 32 bytes of pred <- mm0 (4 x 8 bytes)
+	movq        [r0+0x08], mm0
+	movq        [r0+0x10], mm0
+	movq        [r0+0x18], mm0
 
-	movq        [edx], mm0
-	movq        [edx+0x08], mm0
-	movq        [edx+0x10], mm0
-	movq        [edx+0x18], mm0
+	movq        [r0+0x20], mm1           ; next 32 bytes of pred <- mm1 (4 x 8 bytes)
+	movq        [r0+0x28], mm1
+	movq        [r0+0x30], mm1
+	movq        [r0+0x38], mm1
 
-	movq        [edx+0x20], mm1
-	movq        [edx+0x28], mm1
-	movq        [edx+0x30], mm1
-	movq        [edx+0x38], mm1
-
-	pop         ebx
+	pop r4                               ; restore r4
+	pop r3                               ; restore r3
 	WELSEMMS
 	ret
 
@@ -1167,12 +1199,15 @@
 ;***********************************************************************
 WELS_EXTERN WelsI16x16LumaPredDc_sse2
 WelsI16x16LumaPredDc_sse2:
-	push        ebx
-	mov         eax, [esp+12]			; pRef
-	mov			ecx, [esp+16]           ; stride
-
-	sub         eax, ecx
-	movdqa      xmm0, [eax]             ; read one row
+	push r3                             ; save r3 (accumulates the left-column sum)
+	push r4                             ; save r4 (temporary for each byte loaded)
+	%assign push_num 2
+	LOAD_3_PARA                         ; r0 = pred, r1 = pRef, r2 = stride
+	%ifndef X86_32
+	movsx r2, r2d                       ; sign-extend the 32-bit stride to the full register width
+	%endif
+	sub         r1, r2                  ; r1 = pRef - stride
+	movdqa      xmm0, [r1]             ; read one row
 	pxor		xmm1, xmm1
 	psadbw		xmm0, xmm1
 	movdqa      xmm1, xmm0
@@ -1181,13 +1216,10 @@
 	psrldq      xmm0, 0x08
 	paddw       xmm0, xmm1
 
-	;xor         ebx, ebx
-	;movzx		edx, byte [eax+ecx-0x01]
-	movzx		ebx, byte [eax+ecx-0x01]
-	;mov			ebx, edx
-	movzx		edx, byte [eax+2*ecx-0x01]
-	add			ebx, edx
-	lea         eax, [eax+ecx]
+	movzx		r3, byte [r1+r2-0x01]     ; r3 = byte at r1 + stride - 1
+	movzx		r4, byte [r1+2*r2-0x01]   ; r4 = byte at r1 + 2*stride - 1
+	add		r3, r4                    ; r3 = sum of the two bytes
+	lea         r1, [r1+r2]             ; advance r1 by one stride
 	LOAD_2_LEFT_AND_ADD
 	LOAD_2_LEFT_AND_ADD
 	LOAD_2_LEFT_AND_ADD
@@ -1195,33 +1227,32 @@
 	LOAD_2_LEFT_AND_ADD
 	LOAD_2_LEFT_AND_ADD
 	LOAD_2_LEFT_AND_ADD
-	add         ebx, 0x10
-	movd        xmm1, ebx
+	add         r3, 0x10                ; add 16 (rounding term for the >> 5 below)
+	movd        xmm1, r3d               ; move the accumulated sum into xmm1
 	paddw       xmm0, xmm1
 	psrld       xmm0, 0x05
 	pmuludq     xmm0, [mmx_01bytes]
 	pshufd      xmm0, xmm0, 0
 
-	mov         edx, [esp+8]			; pred
-	movdqa      [edx], xmm0
-	movdqa      [edx+0x10], xmm0
-	movdqa      [edx+0x20], xmm0
-	movdqa      [edx+0x30], xmm0
-	movdqa      [edx+0x40], xmm0
-	movdqa      [edx+0x50], xmm0
-	movdqa      [edx+0x60], xmm0
-	movdqa      [edx+0x70], xmm0
-	movdqa      [edx+0x80], xmm0
-	movdqa      [edx+0x90], xmm0
-	movdqa      [edx+0xa0], xmm0
-	movdqa      [edx+0xb0], xmm0
-	movdqa      [edx+0xc0], xmm0
-	movdqa      [edx+0xd0], xmm0
-	movdqa      [edx+0xe0], xmm0
-	movdqa      [edx+0xf0], xmm0
+	movdqa      [r0], xmm0              ; the same 16 bytes are written 16 times (256 bytes of pred)
+	movdqa      [r0+0x10], xmm0
+	movdqa      [r0+0x20], xmm0
+	movdqa      [r0+0x30], xmm0
+	movdqa      [r0+0x40], xmm0
+	movdqa      [r0+0x50], xmm0
+	movdqa      [r0+0x60], xmm0
+	movdqa      [r0+0x70], xmm0
+	movdqa      [r0+0x80], xmm0
+	movdqa      [r0+0x90], xmm0
+	movdqa      [r0+0xa0], xmm0
+	movdqa      [r0+0xb0], xmm0
+	movdqa      [r0+0xc0], xmm0
+	movdqa      [r0+0xd0], xmm0
+	movdqa      [r0+0xe0], xmm0
+	movdqa      [r0+0xf0], xmm0
 
-	pop         ebx
-
+	pop r4                              ; restore r4
+	pop r3                              ; restore r3
 	ret
 
 ;***********************************************************************
@@ -1230,6 +1261,7 @@
 ;                             uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
 ;
 ;***********************************************************************
+%ifdef X86_32
 WELS_EXTERN WelsSmpleSatdThree4x4_sse2
 align 16
 WelsSmpleSatdThree4x4_sse2:
@@ -1469,5 +1501,5 @@
     pop       esi
     pop       ebx
     ret
-
+%endif
 
--- a/codec/encoder/core/asm/intra_pred_util.asm
+++ /dev/null
@@ -1,156 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  intra_pred_util.asm
-;*
-;*  Abstract
-;*      mmxext/sse for WelsFillingPred8to16, WelsFillingPred8x2to16 and
-;*		WelsFillingPred1to16 etc.
-;*
-;*  History
-;*      09/29/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-;SECTION .rodata pData align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-
-SECTION .text
-
-WELS_EXTERN WelsFillingPred8to16_mmx
-WELS_EXTERN WelsFillingPred8x2to16_mmx
-WELS_EXTERN WelsFillingPred1to16_mmx
-WELS_EXTERN WelsFillingPred8x2to16_sse2
-WELS_EXTERN WelsFillingPred1to16_sse2
-
-
-ALIGN 16
-;***********************************************************************----------------
-; void WelsFillingPred8to16_mmx( uint8_t *pred, uint8_t *v );
-;***********************************************************************----------------
-WelsFillingPred8to16_mmx:
-	mov eax, [esp+4]	; pred
-	mov ecx, [esp+8]	; v
-
-	movq mm0, [ecx]
-	movq [eax  ], mm0
-	movq [eax+8], mm0
-
-	WELSEMMS
-	ret
-
-ALIGN 16
-;***********************************************************************----------------
-; void WelsFillingPred8x2to16_mmx( uint8_t *pred, uint8_t *v );
-;***********************************************************************----------------
-WelsFillingPred8x2to16_mmx:
-	mov eax, [esp+4]	; pred
-	mov ecx, [esp+8]	; v
-
-	movq mm0, [ecx  ]
-	movq mm1, [ecx+8]
-	movq [eax  ], mm0
-	movq [eax+8], mm1
-
-	WELSEMMS
-
-	ret
-
-%macro butterfly_1to8_mmx	3	; mm? for dst, mm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
-	mov %3h, %3l
-	movd %2, e%3x		; i.e, 1% = eax (=b0)
-	pshufw %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0
-%endmacro
-
-ALIGN 16
-;***********************************************************************----------------
-; void WelsFillingPred1to16_mmx( uint8_t *pred, const uint8_t v );
-;***********************************************************************----------------
-WelsFillingPred1to16_mmx:
-	mov eax, [esp+4]		; pred
-
-	mov cl, byte [esp+8]	; v
-	butterfly_1to8_mmx	mm0, mm1, c	; mm? for dst, mm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
-
-	movq [eax  ], mm0
-	movq [eax+8], mm0
-
-	WELSEMMS
-
-	ret
-
-ALIGN 16
-;***********************************************************************----------------
-; void WelsFillingPred8x2to16_sse2( uint8_t *pred, uint8_t *v );
-;***********************************************************************----------------
-WelsFillingPred8x2to16_sse2:
-	mov eax, [esp+4]	; pred
-	mov ecx, [esp+8]	; v
-
-	movdqa xmm0, [ecx]
-	movdqa [eax], xmm0
-
-	ret
-
-ALIGN 16
-;***********************************************************************----------------
-; void WelsFillingPred1to16_sse2( uint8_t *pred, const uint8_t v );
-;***********************************************************************----------------
-WelsFillingPred1to16_sse2:
-	mov eax, [esp+4]		; pred
-
-	mov cl, byte [esp+8]	; v
-	butterfly_1to16_sse	xmm0, xmm1, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
-
-	movdqa [eax], xmm0
-
-	ret
--- a/codec/encoder/core/asm/mb_copy.asm
+++ /dev/null
@@ -1,687 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mb_copy.asm
-;*
-;*  Abstract
-;*      mb_copy
-;*
-;*
-;*********************************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN WelsCopy16x16_sse2
-WELS_EXTERN WelsCopy16x16NotAligned_sse2
-WELS_EXTERN WelsCopy8x8_mmx
-WELS_EXTERN WelsCopy16x8NotAligned_sse2	;
-WELS_EXTERN WelsCopy8x16_mmx		;
-WELS_EXTERN UpdateMbMv_sse2		;
-
-;***********************************************************************
-; void WelsCopy16x16_sse2(	uint8_t* Dst,
-;							int32_t  iStrideD,
-;							uint8_t* Src,
-;							int32_t  iStrideS )
-;***********************************************************************
-ALIGN 16
-WelsCopy16x16_sse2:	; copies 16 rows of 16 bytes from Src to Dst using aligned 128-bit loads and stores
-	push esi
-	push edi
-	push ebx	; three registers saved, so the first argument now sits at [esp+16]
-
-	mov edi, [esp+16]	; Dst
-	mov eax, [esp+20]	; iStrideD
-	mov esi, [esp+24]	; Src
-	mov ecx, [esp+28]	; iStrideS
-
-	lea ebx, [eax+2*eax]	; x3 (3 * iStrideD)
-	lea edx, [ecx+2*ecx]	; x3 (3 * iStrideS)
-
-	movdqa xmm0, [esi]	; load source rows 0..7 into xmm0..xmm7 (movdqa needs 16-byte aligned rows)
-	movdqa xmm1, [esi+ecx]
-	movdqa xmm2, [esi+2*ecx]
-	movdqa xmm3, [esi+edx]
-	lea esi, [esi+4*ecx]	; advance Src by four rows
-	movdqa xmm4, [esi]
-	movdqa xmm5, [esi+ecx]
-	movdqa xmm6, [esi+2*ecx]
-	movdqa xmm7, [esi+edx]
-	lea esi, [esi+4*ecx]	; Src now points at row 8
-
-	movdqa [edi], xmm0	; store destination rows 0..7
-	movdqa [edi+eax], xmm1
-	movdqa [edi+2*eax], xmm2
-	movdqa [edi+ebx], xmm3
-	lea edi, [edi+4*eax]	; advance Dst by four rows
-	movdqa [edi], xmm4
-	movdqa [edi+eax], xmm5
-	movdqa [edi+2*eax], xmm6
-	movdqa [edi+ebx], xmm7
-	lea edi, [edi+4*eax]	; Dst now points at row 8
-
-	movdqa xmm0, [esi]	; load source rows 8..15
-	movdqa xmm1, [esi+ecx]
-	movdqa xmm2, [esi+2*ecx]
-	movdqa xmm3, [esi+edx]
-	lea esi, [esi+4*ecx]
-	movdqa xmm4, [esi]
-	movdqa xmm5, [esi+ecx]
-	movdqa xmm6, [esi+2*ecx]
-	movdqa xmm7, [esi+edx]
-
-	movdqa [edi], xmm0	; store destination rows 8..15
-	movdqa [edi+eax], xmm1
-	movdqa [edi+2*eax], xmm2
-	movdqa [edi+ebx], xmm3
-	lea edi, [edi+4*eax]
-	movdqa [edi], xmm4
-	movdqa [edi+eax], xmm5
-	movdqa [edi+2*eax], xmm6
-	movdqa [edi+ebx], xmm7
-
-	pop ebx	; restore the saved registers in reverse push order
-	pop edi
-	pop esi
-	ret
-
-;***********************************************************************
-; void WelsCopy16x16NotAligned_sse2(	uint8_t* Dst,
-;							int32_t  iStrideD,
-;							uint8_t* Src,
-;							int32_t  iStrideS )
-;***********************************************************************
-ALIGN 16
-; Dst is assumed to be 16-byte aligned (aligned stores); Src may be unaligned (unaligned loads), 12/29/2011
-WelsCopy16x16NotAligned_sse2:	; copies 16 rows of 16 bytes; unaligned loads from Src, aligned stores to Dst
-	push esi
-	push edi
-	push ebx	; three registers saved, so the first argument now sits at [esp+16]
-
-	mov edi, [esp+16]	; Dst
-	mov eax, [esp+20]	; iStrideD
-	mov esi, [esp+24]	; Src
-	mov ecx, [esp+28]	; iStrideS
-
-	lea ebx, [eax+2*eax]	; x3 (3 * iStrideD)
-	lea edx, [ecx+2*ecx]	; x3 (3 * iStrideS)
-
-	movdqu xmm0, [esi]	; load source rows 0..7 (movdqu accepts any source alignment)
-	movdqu xmm1, [esi+ecx]
-	movdqu xmm2, [esi+2*ecx]
-	movdqu xmm3, [esi+edx]
-	lea esi, [esi+4*ecx]	; advance Src by four rows
-	movdqu xmm4, [esi]
-	movdqu xmm5, [esi+ecx]
-	movdqu xmm6, [esi+2*ecx]
-	movdqu xmm7, [esi+edx]
-	lea esi, [esi+4*ecx]	; Src now points at row 8
-
-	movdqa [edi], xmm0	; store destination rows 0..7 (movdqa needs 16-byte aligned destination rows)
-	movdqa [edi+eax], xmm1
-	movdqa [edi+2*eax], xmm2
-	movdqa [edi+ebx], xmm3
-	lea edi, [edi+4*eax]	; advance Dst by four rows
-	movdqa [edi], xmm4
-	movdqa [edi+eax], xmm5
-	movdqa [edi+2*eax], xmm6
-	movdqa [edi+ebx], xmm7
-	lea edi, [edi+4*eax]	; Dst now points at row 8
-
-	movdqu xmm0, [esi]	; load source rows 8..15
-	movdqu xmm1, [esi+ecx]
-	movdqu xmm2, [esi+2*ecx]
-	movdqu xmm3, [esi+edx]
-	lea esi, [esi+4*ecx]
-	movdqu xmm4, [esi]
-	movdqu xmm5, [esi+ecx]
-	movdqu xmm6, [esi+2*ecx]
-	movdqu xmm7, [esi+edx]
-
-	movdqa [edi], xmm0	; store destination rows 8..15
-	movdqa [edi+eax], xmm1
-	movdqa [edi+2*eax], xmm2
-	movdqa [edi+ebx], xmm3
-	lea edi, [edi+4*eax]
-	movdqa [edi], xmm4
-	movdqa [edi+eax], xmm5
-	movdqa [edi+2*eax], xmm6
-	movdqa [edi+ebx], xmm7
-
-	pop ebx	; restore the saved registers in reverse push order
-	pop edi
-	pop esi
-	ret
-
-; , 12/29/2011
-;***********************************************************************
-; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
-;							int32_t  iStrideD,
-;							uint8_t* Src,
-;							int32_t  iStrideS )
-;***********************************************************************
-ALIGN 16
-WelsCopy16x8NotAligned_sse2:	; copies 8 rows of 16 bytes; unaligned loads from Src, aligned stores to Dst
-	push esi
-	push edi
-	push ebx	; three registers saved, so the first argument now sits at [esp+16]
-
-	mov edi, [esp+16]	; Dst
-	mov eax, [esp+20]	; iStrideD
-	mov esi, [esp+24]	; Src
-	mov ecx, [esp+28]	; iStrideS
-
-	lea ebx, [eax+2*eax]	; x3 (3 * iStrideD)
-	lea edx, [ecx+2*ecx]	; x3 (3 * iStrideS)
-
-	movdqu xmm0, [esi]	; load source rows 0..7 (movdqu accepts any source alignment)
-	movdqu xmm1, [esi+ecx]
-	movdqu xmm2, [esi+2*ecx]
-	movdqu xmm3, [esi+edx]
-	lea esi, [esi+4*ecx]	; advance Src by four rows
-	movdqu xmm4, [esi]
-	movdqu xmm5, [esi+ecx]
-	movdqu xmm6, [esi+2*ecx]
-	movdqu xmm7, [esi+edx]
-
-	movdqa [edi], xmm0	; store destination rows 0..7 (movdqa needs 16-byte aligned destination rows)
-	movdqa [edi+eax], xmm1
-	movdqa [edi+2*eax], xmm2
-	movdqa [edi+ebx], xmm3
-	lea edi, [edi+4*eax]	; advance Dst by four rows
-	movdqa [edi], xmm4
-	movdqa [edi+eax], xmm5
-	movdqa [edi+2*eax], xmm6
-	movdqa [edi+ebx], xmm7
-
-	pop ebx	; restore the saved registers in reverse push order
-	pop edi
-	pop esi
-	ret
-
-
-;***********************************************************************
-; void WelsCopy8x16_mmx(uint8_t* Dst,
-;                       int32_t  iStrideD,
-;                       uint8_t* Src,
-;                       int32_t  iStrideS )
-;***********************************************************************
-ALIGN 16
-WelsCopy8x16_mmx:	; copies 16 rows of 8 bytes from Src to Dst through the eight MMX registers
-	push ebx	; one register saved, so the first argument now sits at [esp+8]
-
-	mov eax, [esp + 8 ]           ;Dst
-	mov ecx, [esp + 12]           ;iStrideD
-	mov ebx, [esp + 16]           ;Src
-	mov edx, [esp + 20]           ;iStrideS
-
-	movq mm0, [ebx]	; load source rows 0..7 into mm0..mm7, stepping Src two rows at a time
-	movq mm1, [ebx+edx]
-	lea ebx, [ebx+2*edx]
-	movq mm2, [ebx]
-	movq mm3, [ebx+edx]
-	lea ebx, [ebx+2*edx]
-	movq mm4, [ebx]
-	movq mm5, [ebx+edx]
-	lea ebx, [ebx+2*edx]
-	movq mm6, [ebx]
-	movq mm7, [ebx+edx]
-	lea ebx, [ebx+2*edx]	; Src now points at row 8
-
-	movq [eax], mm0	; store destination rows 0..7
-	movq [eax+ecx], mm1
-	lea eax, [eax+2*ecx]
-	movq [eax], mm2
-	movq [eax+ecx], mm3
-	lea eax, [eax+2*ecx]
-	movq [eax], mm4
-	movq [eax+ecx], mm5
-	lea eax, [eax+2*ecx]
-	movq [eax], mm6
-	movq [eax+ecx], mm7
-	lea eax, [eax+2*ecx]	; Dst now points at row 8
-
-	movq mm0, [ebx]	; load source rows 8..15
-	movq mm1, [ebx+edx]
-	lea ebx, [ebx+2*edx]
-	movq mm2, [ebx]
-	movq mm3, [ebx+edx]
-	lea ebx, [ebx+2*edx]
-	movq mm4, [ebx]
-	movq mm5, [ebx+edx]
-	lea ebx, [ebx+2*edx]
-	movq mm6, [ebx]
-	movq mm7, [ebx+edx]
-
-	movq [eax], mm0	; store destination rows 8..15
-	movq [eax+ecx], mm1
-	lea eax, [eax+2*ecx]
-	movq [eax], mm2
-	movq [eax+ecx], mm3
-	lea eax, [eax+2*ecx]
-	movq [eax], mm4
-	movq [eax+ecx], mm5
-	lea eax, [eax+2*ecx]
-	movq [eax], mm6
-	movq [eax+ecx], mm7
-
-	WELSEMMS	; macro from asm_inc.asm (not shown); presumably emits emms - confirm
-	pop ebx
-	ret
-
-;***********************************************************************
-; void WelsCopy8x8_mmx(  uint8_t* Dst,
-;                        int32_t  iStrideD,
-;                        uint8_t* Src,
-;                        int32_t  iStrideS )
-;***********************************************************************
-ALIGN 16
-WelsCopy8x8_mmx:	; copies 8 rows of 8 bytes from Src to Dst, prefetching source rows ahead of the loads
-	push ebx
-	push esi	; two registers saved, so the first argument now sits at [esp+12]
-	mov eax, [esp + 12]           ;Dst
-	mov ecx, [esp + 16]           ;iStrideD
-	mov esi, [esp + 20]           ;Src
-	mov ebx, [esp + 24]           ;iStrideS
-	lea edx, [ebx+2*ebx]	; edx = 3 * iStrideS
-
-	; prefetch source rows 2 and 3 (non-temporal hint) before loading rows 0 and 1
-	prefetchnta [esi+2*ebx]
-	prefetchnta [esi+edx]
-	movq mm0, [esi]	; source row 0
-	movq mm1, [esi+ebx]	; source row 1
-	lea esi, [esi+2*ebx]
-	; prefetch source rows 4 and 5 before loading rows 2 and 3
-	prefetchnta [esi+2*ebx]
-	prefetchnta [esi+edx]
-	movq mm2, [esi]	; source row 2
-	movq mm3, [esi+ebx]	; source row 3
-	lea esi, [esi+2*ebx]
-	; prefetch source rows 6 and 7 before loading rows 4 and 5
-	prefetchnta [esi+2*ebx]
-	prefetchnta [esi+edx]
-	movq mm4, [esi]	; source row 4
-	movq mm5, [esi+ebx]	; source row 5
-	lea esi, [esi+2*ebx]
-	movq mm6, [esi]	; source row 6
-	movq mm7, [esi+ebx]	; source row 7
-
-	movq [eax], mm0	; store destination rows 0..7, stepping Dst two rows at a time
-	movq [eax+ecx], mm1
-	lea eax, [eax+2*ecx]
-	movq [eax], mm2
-	movq [eax+ecx], mm3
-	lea eax, [eax+2*ecx]
-	movq [eax], mm4
-	movq [eax+ecx], mm5
-	lea eax, [eax+2*ecx]
-	movq [eax], mm6
-	movq [eax+ecx], mm7
-
-	WELSEMMS	; macro from asm_inc.asm (not shown); presumably emits emms - confirm
-	pop esi
-	pop ebx
-	ret
-
-; (dunhuang@cisco), 12/21/2011
-;***********************************************************************
-; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
-;***********************************************************************
-ALIGN 16
-UpdateMbMv_sse2:	; fills 64 bytes at pMvBuffer with 16 copies of the 32-bit value read from sMv
-	mov eax, [esp+4]	; mv_buffer
-	movd xmm0, [esp+8]	; _mv (the by-value argument; 32 bits are read from its stack slot)
-	pshufd xmm1, xmm0, $0	; broadcast dword 0 to all four dwords ($0 is NASM hex notation for 0)
-	movdqa [eax     ], xmm1	; four aligned 16-byte stores: pMvBuffer must be 16-byte aligned
-	movdqa [eax+0x10], xmm1
-	movdqa [eax+0x20], xmm1
-	movdqa [eax+0x30], xmm1
-	ret
-
-
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-;SECTION .rodata pData align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-;read unaligned memory
-%macro SSE2_READ_UNA 2	; %1 = destination xmm register, %2 = address expression (written without brackets)
-	movq	%1, [%2]	; low 8 bytes
-	movhps	%1,	[%2+8]	; high 8 bytes; neither load requires 16-byte alignment
-%endmacro
-
-;write unaligned memory
-%macro SSE2_WRITE_UNA 2	; %1 = address expression (written without brackets), %2 = source xmm register
-	movq	[%1],	%2	; low 8 bytes
-	movhps	[%1+8], %2	; high 8 bytes; neither store requires 16-byte alignment
-%endmacro
-
-ALIGN 16
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN PixelAvgWidthEq8_mmx
-WELS_EXTERN PixelAvgWidthEq16_sse2
-
-WELS_EXTERN McCopyWidthEq4_mmx
-WELS_EXTERN McCopyWidthEq8_mmx
-WELS_EXTERN McCopyWidthEq16_sse2
-
-
-ALIGN 16
-;***********************************************************************
-; void PixelAvgWidthEq8_mmx( uint8_t *dst,  int32_t iDstStride,
-;                           uint8_t *pSrc1, int32_t iSrc1Stride,
-;                           uint8_t *pSrc2, int32_t iSrc2Stride,
-;                           int32_t iHeight );
-;***********************************************************************
-PixelAvgWidthEq8_mmx:	; dst = rounded byte average of pSrc1 and pSrc2, 8 pixels per row, four rows per loop pass
-    push        ebp
-    push        ebx
-    push        esi
-    push        edi	; four registers saved, so the first argument now sits at [esp+20]
-
-    mov         edi, [esp+20]	; dst
-    mov         esi, [esp+28]	; pSrc1
-    mov         edx, [esp+36]	; pSrc2
-    mov         ebp, [esp+24]	; iDstStride
-    mov         eax, [esp+32]	; iSrc1Stride
-    mov         ebx, [esp+40]	; iSrc2Stride
-    mov         ecx, [esp+44]	; iHeight
-	sar			ecx, 2	; pass count = iHeight / 4; any remainder rows are not processed
-.height_loop:	; body runs before the count is tested, so iHeight / 4 must be at least 1
-	movq        mm0, [esi]
-    pavgb       mm0, [edx]	; per-byte (a + b + 1) / 2
-    movq        [edi], mm0	; row 0 of this pass
-	movq		mm1, [esi+eax]
-	pavgb		mm1, [edx+ebx]
-	movq		[edi+ebp], mm1	; row 1 of this pass
-	lea         edi, [edi+2*ebp]	; advance all three pointers by two rows
-	lea         esi, [esi+2*eax]
-	lea         edx, [edx+2*ebx]
-
-	movq        mm2, [esi]
-	pavgb       mm2, [edx]
-    movq        [edi], mm2	; row 2 of this pass
-	movq		mm3, [esi+eax]
-	pavgb		mm3, [edx+ebx]
-	movq		[edi+ebp], mm3	; row 3 of this pass
-	lea         edi, [edi+2*ebp]	; advance all three pointers by two rows
-	lea         esi, [esi+2*eax]
-	lea         edx, [edx+2*ebx]
-
-	dec         ecx
-    jne         .height_loop
-
-	WELSEMMS	; macro from asm_inc.asm (not shown); presumably emits emms - confirm
-    pop         edi
-    pop         esi
-    pop         ebx
-    pop         ebp
-    ret
-
-
-ALIGN 16
-;***********************************************************************
-; void PixelAvgWidthEq16_sse2( uint8_t *dst,  int32_t iDstStride,
-;                          uint8_t *pSrc1, int32_t iSrc1Stride,
-;                          uint8_t *pSrc2, int32_t iSrc2Stride,
-;                          int32_t iHeight );
-;***********************************************************************
-PixelAvgWidthEq16_sse2:	; dst = rounded byte average of pSrc1 and pSrc2, 16 pixels per row, four rows per loop pass
-	push        ebp
-    push        ebx
-    push        esi
-    push        edi	; four registers saved, so the first argument now sits at [esp+20]
-
-    mov         edi, [esp+20]	; dst
-    mov         esi, [esp+28]	; pSrc1
-    mov         edx, [esp+36]	; pSrc2
-    mov         ebp, [esp+24]	; iDstStride
-    mov         eax, [esp+32]	; iSrc1Stride
-    mov         ebx, [esp+40]	; iSrc2Stride
-    mov         ecx, [esp+44]	; iHeight
-	sar			ecx, 2	; pass count = iHeight / 4; any remainder rows are not processed
-.height_loop:	; body runs before the count is tested, so iHeight / 4 must be at least 1
-	movdqu      xmm0, [esi]	; all loads and stores are unaligned (movdqu), so no alignment is required
-	movdqu      xmm1, [edx]
-	movdqu      xmm2, [esi+eax]
-	movdqu      xmm3, [edx+ebx]
-	pavgb       xmm0, xmm1	; per-byte (a + b + 1) / 2
-	pavgb       xmm2, xmm3
-	movdqu      [edi], xmm0	; row 0 of this pass
-	movdqu      [edi+ebp], xmm2	; row 1 of this pass
-	lea			edi, [edi+2*ebp]	; advance all three pointers by two rows
-	lea			esi, [esi+2*eax]
-	lea			edx, [edx+2*ebx]
-
-	movdqu      xmm4, [esi]
-	movdqu      xmm5, [edx]
-	movdqu      xmm6, [esi+eax]
-	movdqu      xmm7, [edx+ebx]
-	pavgb       xmm4, xmm5
-	pavgb       xmm6, xmm7
-	movdqu      [edi], xmm4	; row 2 of this pass
-	movdqu      [edi+ebp], xmm6	; row 3 of this pass
-	lea         edi, [edi+2*ebp]	; advance all three pointers by two rows
-	lea         esi, [esi+2*eax]
-    lea         edx, [edx+2*ebx]
-
-	dec         ecx
-	jne         .height_loop
-
-    pop         edi
-    pop         esi
-    pop         ebx
-    pop         ebp
-    ret
-
-
-ALIGN 64
-avg_w16_align_0_ssse3:	; internal helper reached via "call edx" in PixelAvgWidthEq16_ssse3; register arguments, row counter at [esp+4]
-    movdqa  xmm1, [ebx]	; src1 row, aligned load (ebx was rounded down to a 16-byte boundary by the caller)
-    movdqu  xmm2, [ecx]	; src2 row, unaligned load
-    pavgb   xmm1, xmm2	; per-byte (a + b + 1) / 2
-    movdqa  [edi], xmm1	; aligned store: dst rows must be 16-byte aligned
-    add    ebx, eax	; src1 += src1 stride
-    add    ecx, ebp	; src2 += src2 stride
-    add    edi, esi	; dst += dst stride
-    dec    dword [esp+4]	; row counter pushed by the caller, just above the return address
-    jg     avg_w16_align_0_ssse3	; loop while the counter is still positive
-    ret
-
-    ALIGN 64
-avg_w16_align_1_ssse3:	; same as avg_w16_align_0_ssse3, but src1 data starts one byte past the aligned address in ebx
-    movdqa  xmm1, [ebx+16]	; next aligned 16 bytes of src1
-    movdqu  xmm2, [ecx]	; src2 row, unaligned load
-    palignr xmm1, [ebx], 1	; xmm1 = bytes 1..16 of the 32 bytes at ebx, i.e. the row at ebx+1
-    pavgb   xmm1, xmm2	; per-byte (a + b + 1) / 2
-    movdqa  [edi], xmm1	; aligned store: dst rows must be 16-byte aligned
-    add    ebx, eax	; src1 += src1 stride
-    add    ecx, ebp	; src2 += src2 stride
-    add    edi, esi	; dst += dst stride
-    dec    dword [esp+4]	; row counter pushed by the caller, just above the return address
-    jg     avg_w16_align_1_ssse3	; loop while the counter is still positive
-    ret
-
-
-ALIGN 16
-;***********************************************************************
-; void PixelAvgWidthEq16_ssse3(uint8_t *pDst,  int32_t iDstStride,
-;                          uint8_t *pSrc1, int32_t iSrc1Stride,
-;                          uint8_t *pSrc2, int32_t iSrc2Stride,
-;                          int32_t iHeight );
-;***********************************************************************
-WELS_EXTERN PixelAvgWidthEq16_ssse3
-PixelAvgWidthEq16_ssse3:	; 16-wide rounded average of pSrc1 and pSrc2; dispatches to one of the avg_w16_align_N_ssse3 row loops
-    push        ebp
-    push        ebx
-    push        esi
-    push        edi	; four registers saved, so the first argument now sits at [esp+20]
-
-    mov         edi, [esp+20]       ; dst
-    mov         ebx, [esp+28]       ; src1
-    mov         ecx, [esp+36]       ; src2
-    mov         esi, [esp+24]       ; i_dst_stride
-
-     %define avg_w16_offset (avg_w16_align_1_ssse3-avg_w16_align_0_ssse3)
-    mov edx, ebx
-    and edx, 0x01	; NOTE(review): only bit 0 of the src1 address picks the variant, so only offsets 0 and 1 from a 16-byte boundary are handled - confirm callers guarantee this
-    lea eax, [avg_w16_align_0_ssse3]	; absolute address of variant 0 (not position independent)
-    lea ebp, [avg_w16_offset]	; byte distance between the two variants
-    imul ebp, edx	; 0 or avg_w16_offset
-    lea edx, [ebp+eax]	; edx = address of the selected row loop
-
-    mov eax, [esp+32]	; iSrc1Stride
-    mov ebp, [esp+44]	; iHeight
-    push ebp	; row counter for the helper; it sits at [esp+4] once "call" has pushed the return address
-    mov ebp, [esp+44]	; after the push this slot is iSrc2Stride
-    and ebx, 0xfffffff0	; round src1 down to a 16-byte boundary for the helpers' aligned loads
-    call edx
-	pop		   ebp	; discard the row counter slot (ebp is restored for real below)
-    pop         edi
-    pop         esi
-    pop         ebx
-    pop         ebp
-    ret
-
-
-ALIGN 16
-;*******************************************************************************
-;  void McCopyWidthEq4_mmx( uint8_t *pSrc, int32_t iSrcStride,
-;                          uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
-;*******************************************************************************
-McCopyWidthEq4_mmx:	; copies iHeight rows of 4 bytes from pSrc to pDst
-    push    esi
-    push    edi
-    push    ebx	; three registers saved, so the first argument now sits at [esp+16]
-
-
-    mov esi,  [esp+16]	; pSrc
-    mov eax, [esp+20]	; iSrcStride
-    mov edi,  [esp+24]	; pDst
-    mov ecx,  [esp+28]	; iDstStride
-    mov edx,  [esp+32]	; iHeight
-ALIGN 4
-.height_loop:	; body runs before the count is tested, so iHeight must be at least 1
-	mov ebx, [esi]	; the 4-byte row travels through a general-purpose register, not an MMX register
-	mov [edi], ebx
-
-	add esi, eax
-	add edi, ecx
-	dec edx
-	jnz .height_loop
-	WELSEMMS	; macro from asm_inc.asm (not shown); NOTE(review): no MMX register is touched in this routine, so this looks unnecessary - confirm
-	pop	   ebx
-    pop     edi
-    pop     esi
-    ret
-
-ALIGN 16
-;*******************************************************************************
-;   void McCopyWidthEq8_mmx( uint8_t *pSrc, int32_t iSrcStride,
-;                           uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
-;*******************************************************************************
-McCopyWidthEq8_mmx:	; copies iHeight rows of 8 bytes from pSrc to pDst through mm0
-    push  esi
-    push  edi	; two registers saved, so the first argument now sits at [esp+12]
-	mov  esi, [esp+12]	; pSrc
-	mov eax, [esp+16]	; iSrcStride
-	mov edi, [esp+20]	; pDst
-	mov ecx, [esp+24]	; iDstStride
-	mov edx, [esp+28]	; iHeight
-
-ALIGN 4
-.height_loop:	; body runs before the count is tested, so iHeight must be at least 1
-	movq mm0, [esi]
-	movq [edi], mm0
-	add esi, eax
-	add edi, ecx
-	dec edx
-	jnz .height_loop
-
-	WELSEMMS	; macro from asm_inc.asm (not shown); presumably emits emms - confirm
-    pop     edi
-    pop     esi
-    ret
-
-ALIGN 16
-;***********************************************************************
-;   void McCopyWidthEq16_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
-;***********************************************************************
-McCopyWidthEq16_sse2:	; copies iHeight rows of 16 bytes from pSrc to pDst, two rows per loop pass, no alignment required
-    push    esi
-    push    edi	; two registers saved, so the first argument now sits at [esp+12]
-
-    mov     esi, [esp+12]	; pSrc
-    mov     eax, [esp+16]	; iSrcStride
-    mov     edi, [esp+20]	; pDst
-    mov     edx, [esp+24]	; iDstStride
-    mov     ecx, [esp+28]	; iHeight; the loop only ends when it reaches exactly 0, so it must be even and non-zero
-
-ALIGN 4
-.height_loop:
-    SSE2_READ_UNA	xmm0, esi	; source row 0 of this pass
-    SSE2_READ_UNA	xmm1, esi+eax	; source row 1 of this pass
-    SSE2_WRITE_UNA	edi, xmm0
-    SSE2_WRITE_UNA	edi+edx, xmm1
-
-	sub		ecx,	2	; sets ZF for the jnz below; the two lea instructions in between leave the flags untouched
-    lea     esi, [esi+eax*2]
-    lea     edi, [edi+edx*2]
-    jnz     .height_loop
-
-    pop     edi
-    pop     esi
-    ret
--- a/codec/encoder/core/asm/mc_chroma.asm
+++ /dev/null
@@ -1,317 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2004-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mc_chroma.asm
-;*
-;*  Abstract
-;*      mmx motion compensation for chroma
-;*
-;*  History
-;*      10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:	; eight words of 32 (0x20): rounding term added before the shift right by 6 in the SSE2/SSSE3 chroma routines
-	dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:	; four words of 32 (0x20): the same rounding term for the MMX chroma routine
-	dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src,
-;							int32_t iSrcStride,
-;							uint8_t *pDst,
-;							int32_t iDstStride,
-;							uint8_t *pABCD,
-;							int32_t iHeight );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:	; 4-wide weighted 2x2 interpolation: dst = (A*p[y][x] + B*p[y][x+1] + C*p[y+1][x] + D*p[y+1][x+1] + 32) shifted right by 6
-	push esi
-	push edi
-	push ebx	; three registers saved; argument offsets below are written as 12 + original offset
-
-	mov eax, [esp +12 + 20]	; pABCD: four weight bytes A, B, C, D
-	movd mm3, [eax]	; mm3 = A B C D (bytes)
-	WELS_Zero mm7	; macro from asm_inc.asm (not shown); presumably clears mm7 - confirm
-	punpcklbw mm3, mm3	; A A B B C C D D
-	movq      mm4, mm3
-	punpcklwd mm3, mm3	; A A A A B B B B
-	punpckhwd mm4, mm4	; C C C C D D D D
-
-	movq	  mm5, mm3
-	punpcklbw mm3, mm7	; mm3 = A as four words
-	punpckhbw mm5, mm7	; mm5 = B as four words
-
-	movq	  mm6, mm4
-	punpcklbw mm4, mm7	; mm4 = C as four words
-	punpckhbw mm6, mm7	; mm6 = D as four words
-
-	mov esi, [esp +12+ 4]	; src
-	mov eax, [esp + 12 + 8]	; iSrcStride
-	mov edi, [esp + 12 + 12]	; pDst
-	mov edx, [esp + 12 + 16]	; iDstStride
-    mov ecx, [esp + 12 + 24]	; row count; the loop body runs before it is tested, so it must be at least 1
-
-	lea ebx, [esi + eax]	; ebx = the row below the current one
-	movd mm0, [esi]	; current row, pixels x..x+3
-	movd mm1, [esi+1]	; current row, pixels x+1..x+4
-	punpcklbw mm0, mm7	; widen both to words
-	punpcklbw mm1, mm7
-.xloop:
-
-	pmullw mm0, mm3	; A * p[y][x]
-	pmullw mm1, mm5	; B * p[y][x+1]
-	paddw  mm0, mm1
-
-	movd  mm1, [ebx]	; next row, pixels x..x+3
-	punpcklbw mm1, mm7
-	movq mm2, mm1	; keep the widened next row: it becomes the current row of the next pass
-	pmullw mm1, mm4	; C * p[y+1][x]
-	paddw mm0, mm1
-
-	movd mm1, [ebx+1]	; next row, pixels x+1..x+4
-	punpcklbw mm1, mm7
-	movq mm7, mm1	; borrow mm7 to keep the widened x+1 row across the multiply
-	pmullw mm1,mm6	; D * p[y+1][x+1]
-	paddw mm0, mm1
-	movq mm1,mm7	; mm1 = the x+1 row for the next pass
-
-	paddw mm0, [h264_d0x20_mmx]	; + 32 for rounding
-	psrlw mm0, 6	; divide by 64
-
-	WELS_Zero mm7	; restore mm7 as the zero register after it was borrowed above
-	packuswb mm0, mm7	; saturate the words to unsigned bytes
-	movd [edi], mm0	; store four output pixels
-
-	movq mm0, mm2	; next row becomes the current row
-
-	lea edi, [edi +edx  ]	; advance pDst by one row
-	lea ebx, [ebx + eax]	; advance the next-row pointer by one row
-
-	dec ecx
-	jnz near .xloop
-	WELSEMMS	; macro from asm_inc.asm (not shown); presumably emits emms - confirm
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc,
-;						int32_t iSrcStride,
-;						uint8_t *pDst,
-;						int32_t iDstStride,
-;						uint8_t *pABCD,
-;						int32_t iHeight );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:	; 8-wide weighted 2x2 interpolation: dst = (A*p[y][x] + B*p[y][x+1] + C*p[y+1][x] + D*p[y+1][x+1] + 32) shifted right by 6
-	push esi
-	push edi
-	push ebx	; three registers saved; argument offsets below are written as 12 + original offset
-
-	mov eax, [esp +12 + 20]	; pABCD: four weight bytes A, B, C, D
-	movd xmm3, [eax]	; xmm3 = A B C D (low four bytes)
-	WELS_Zero xmm7	; macro from asm_inc.asm (not shown); presumably clears xmm7 - confirm
-	punpcklbw  xmm3, xmm3	; A A B B C C D D
-	punpcklwd  xmm3, xmm3	; A x4, B x4, C x4, D x4
-
-	movdqa	   xmm4, xmm3
-	punpckldq  xmm3, xmm3	; A x8, B x8
-	punpckhdq  xmm4, xmm4	; C x8, D x8
-	movdqa     xmm5, xmm3
-	movdqa	   xmm6, xmm4
-
-	punpcklbw  xmm3, xmm7	; xmm3 = A as eight words
-	punpckhbw  xmm5, xmm7	; xmm5 = B as eight words
-	punpcklbw  xmm4, xmm7	; xmm4 = C as eight words
-	punpckhbw  xmm6, xmm7	; xmm6 = D as eight words
-
-	mov esi, [esp +12+ 4]	; pSrc
-	mov eax, [esp + 12 + 8]	; iSrcStride
-	mov edi, [esp + 12 + 12]	; pDst
-	mov edx, [esp + 12 + 16]	; iDstStride
-    mov ecx, [esp + 12 + 24]	; row count; the loop body runs before it is tested, so it must be at least 1
-
-	lea ebx, [esi + eax]	; ebx = the row below the current one
-	movq xmm0, [esi]	; current row, pixels x..x+7
-	movq xmm1, [esi+1]	; current row, pixels x+1..x+8
-	punpcklbw xmm0, xmm7	; widen both to words
-	punpcklbw xmm1, xmm7
-.xloop:
-
-	pmullw xmm0, xmm3	; A * p[y][x]
-	pmullw xmm1, xmm5	; B * p[y][x+1]
-	paddw  xmm0, xmm1
-
-	movq  xmm1, [ebx]	; next row, pixels x..x+7
-	punpcklbw xmm1, xmm7
-	movdqa xmm2, xmm1	; keep the widened next row: it becomes the current row of the next pass
-	pmullw xmm1, xmm4	; C * p[y+1][x]
-	paddw xmm0, xmm1
-
-	movq xmm1, [ebx+1]	; next row, pixels x+1..x+8
-	punpcklbw xmm1, xmm7
-	movdqa xmm7, xmm1	; borrow xmm7 to keep the widened x+1 row across the multiply
-	pmullw xmm1, xmm6	; D * p[y+1][x+1]
-	paddw xmm0, xmm1
-	movdqa xmm1,xmm7	; xmm1 = the x+1 row for the next pass
-
-	paddw xmm0, [h264_d0x20_sse2]	; + 32 for rounding
-	psrlw xmm0, 6	; divide by 64
-
-	WELS_Zero xmm7	; restore xmm7 as the zero register after it was borrowed above
-	packuswb xmm0, xmm7	; saturate the words to unsigned bytes
-	movq [edi], xmm0	; store eight output pixels
-
-	movdqa xmm0, xmm2	; next row becomes the current row
-
-	lea edi, [edi +edx  ]	; advance pDst by one row
-	lea ebx, [ebx + eax]	; advance the next-row pointer by one row
-
-	dec ecx
-	jnz near .xloop
-
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-
-
-ALIGN 16
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-;						 int32_t iSrcStride,
-;                        uint8_t *pDst,
-;                        int32_t iDstStride,
-;                        uint8_t *pABCD,
-;					     int32_t iHeight);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:	; 8-wide weighted 2x2 interpolation using pmaddubsw on interleaved (x, x+1) pixel pairs; two output rows per pass
-	push ebx
-	push esi
-	push edi	; three registers saved; argument offsets below are written as 12 + original offset
-
-	mov eax, [esp + 12 + 20]	; pABCD: four weight bytes A, B, C, D
-
-    pxor      xmm7, xmm7	; NOTE(review): this zero is never read; xmm7 is reloaded with the rounding constant before its first use
-    movd   xmm5, [eax]	; xmm5 = A B C D (low four bytes)
-    punpcklwd xmm5, xmm5	; AB AB CD CD (byte pairs)
-    punpckldq xmm5, xmm5	; AB x4, CD x4
-    movdqa    xmm6, xmm5
-    punpcklqdq xmm5, xmm5	; xmm5 = the (A, B) pair repeated eight times
-    punpckhqdq xmm6, xmm6	; xmm6 = the (C, D) pair repeated eight times
-
-	mov eax, [esp + 12 + 4]	; pSrc
-	mov edx, [esp + 12 + 8]	; iSrcStride
-	mov esi, [esp + 12 + 12]	; pDst
-	mov edi, [esp + 12 + 16]	; iDstStride
-    mov ecx, [esp + 12 + 24]	; row count; the loop only ends when it reaches exactly 0, so it must be even and non-zero
-
-    sub esi, edi
-    sub esi, edi	; pre-bias pDst by two rows; the lea at the top of the loop adds them back
-	movdqa xmm7, [h264_d0x20_sse2]	; xmm7 = eight words of 32 (rounding term)
-
-	movdqu xmm0, [eax]	; 16 bytes of the first source row are read
-	movdqa xmm1, xmm0
-	psrldq xmm1, 1	; the same row shifted by one pixel
-	punpcklbw xmm0, xmm1	; interleave into (p[x], p[x+1]) byte pairs for eight outputs
-
-.hloop_chroma:
-	lea	esi, [esi+2*edi]	; pDst for this pair of output rows
-
-	movdqu xmm2, [eax+edx]	; source row y+1
-	movdqa xmm3, xmm2
-	psrldq xmm3, 1
-	punpcklbw xmm2, xmm3	; (p[x], p[x+1]) pairs of row y+1
-	movdqa      xmm4, xmm2	; keep a copy: it is the upper row of the second output row
-
-    pmaddubsw  xmm0, xmm5	; A*p[y][x] + B*p[y][x+1] as words
-    pmaddubsw  xmm2, xmm6	; C*p[y+1][x] + D*p[y+1][x+1] as words
-    paddw      xmm0, xmm2
-    paddw      xmm0, xmm7	; + 32 for rounding
-	psrlw      xmm0, 6	; divide by 64
-    packuswb   xmm0, xmm0	; saturate to unsigned bytes
-    movq       [esi],xmm0	; first output row of this pass
-
-    lea eax, [eax+2*edx]	; advance pSrc by two rows
-    movdqu xmm2, [eax]	; source row y+2
-    movdqa xmm3, xmm2
-    psrldq xmm3, 1
-    punpcklbw xmm2, xmm3	; (p[x], p[x+1]) pairs of row y+2
-    movdqa      xmm0, xmm2	; becomes the upper row of the next pass
-
-    pmaddubsw  xmm4, xmm5	; A, B applied to row y+1
-    pmaddubsw  xmm2, xmm6	; C, D applied to row y+2
-    paddw      xmm4, xmm2
-    paddw      xmm4, xmm7	; + 32 for rounding
-	psrlw      xmm4, 6	; divide by 64
-    packuswb   xmm4, xmm4	; saturate to unsigned bytes
-    movq       [esi+edi],xmm4	; second output row of this pass
-
-	sub ecx, 2
-	jnz .hloop_chroma
-	pop edi
-	pop esi
-	pop ebx
-
-	ret
-
-
--- a/codec/encoder/core/asm/mc_luma.asm
+++ /dev/null
@@ -1,1052 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mc_luma.asm
-;*
-;*  Abstract
-;*      sse2 motion compensation
-;*
-;*  History
-;*      17/08/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-%macro SSE_LOAD_8P 3	; %1 = destination xmm, %2 = xmm that must hold zero, %3 = memory operand
-	movq %1, %3	; load 8 bytes
-	punpcklbw %1, %2	; interleave with %2, widening the 8 bytes to 8 words when %2 is zero
-%endmacro
-
-%macro FILTER_HV_W8 9	; %1..%6 = six inputs a..f as words, %7/%8 = scratch xmm, %9 = 8-byte destination; stores clip((a + f - 5*(b+e) + 20*(c+d) + 16) / 32)
-	paddw	%1, %6	; a + f (input %1 is consumed as the accumulator)
-	movdqa	%8, %3	; %8 = c
-	movdqa	%7, %2	; %7 = b
-	paddw	%1, [h264_w0x10_1]	; + 16 for rounding (constant defined later in this file)
-	paddw	%8, %4	; c + d
-	paddw	%7, %5	; b + e
-	psllw	%8, 2	; 4*(c+d)
-	psubw	%8, %7	; 4*(c+d) - (b+e)
-	paddw	%1, %8
-	psllw	%8, 2	; 16*(c+d) - 4*(b+e)
-	paddw	%1, %8	; total: a + f + 16 + 20*(c+d) - 5*(b+e)
-	psraw   %1, 5	; arithmetic shift right: divide by 32
-	WELS_Zero %8	; macro from asm_inc.asm (not shown); presumably clears %8, which callers then reuse as the zero register - confirm
-	packuswb %1, %8	; saturate the words to unsigned bytes
-	movq    %9, %1	; store eight output pixels
-%endmacro
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN McHorVer22Width8HorFirst_sse2
-WELS_EXTERN McHorVer02WidthEq8_sse2
-WELS_EXTERN McHorVer20WidthEq16_sse2
-
-ALIGN 16
-;***********************************************************************
-; void McHorVer20WidthEq16_sse2(  uint8_t *pSrc,
-;								int32_t iSrcStride,
-;								uint8_t *pDst,
-;								int32_t iDstStride,
-;								int32_t iHeight
-;                      );
-;***********************************************************************
-McHorVer20WidthEq16_sse2:	; horizontal 6-tap filter (1, -5, 20, 20, -5, 1), + 16, divide by 32, 16 pixels per row as two 8-pixel halves
-	push	esi
-	push	edi	; two registers saved, so the first argument now sits at [esp+12]
-
-
-	mov esi, [esp + 12]	; pSrc
-	mov eax, [esp + 16]	; iSrcStride
-	mov edi, [esp + 20]	; pDst
-	mov ecx, [esp + 28]	; iHeight; the loop body runs before it is tested, so it must be at least 1
-	mov edx, [esp + 24]	; iDstStride
-	sub esi, 2	; the filter window starts two pixels to the left of each output pixel
-
-	WELS_Zero  xmm7	; macro from asm_inc.asm (not shown); presumably clears xmm7, used as the zero for byte-to-word widening - confirm
-	movdqa xmm6, [h264_w0x10_1]	; xmm6 = eight words of 16 (rounding term)
-.y_loop:
-
-	movq xmm0, [esi]	; tap a (x-2), left 8 outputs
-	punpcklbw xmm0, xmm7
-	movq xmm1, [esi+5]	; tap f (x+3)
-	punpcklbw xmm1, xmm7
-	movq xmm2, [esi+1]	; tap b (x-1)
-	punpcklbw xmm2, xmm7
-	movq xmm3, [esi+4]	; tap e (x+2)
-	punpcklbw xmm3, xmm7
-	movq xmm4, [esi+2]	; tap c (x)
-	punpcklbw xmm4, xmm7
-	movq xmm5, [esi+3]	; tap d (x+1)
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3	; b + e
-	paddw xmm4, xmm5	; c + d
-	psllw xmm4, 2	; 4*(c+d)
-	psubw xmm4, xmm2	; 4*(c+d) - (b+e)
-	paddw xmm0, xmm1	; a + f
-	paddw xmm0, xmm4
-	psllw xmm4, 2	; 16*(c+d) - 4*(b+e)
-	paddw xmm0, xmm4	; a + f + 20*(c+d) - 5*(b+e)
-	paddw xmm0, xmm6	; + 16 for rounding
-	psraw xmm0, 5	; divide by 32
-	packuswb xmm0, xmm7	; saturate to unsigned bytes
-	movq [edi], xmm0	; left 8 output pixels
-
-	movq xmm0, [esi+8]	; same taps for the right 8 outputs, 8 bytes further along
-	punpcklbw xmm0, xmm7
-	movq xmm1, [esi+5+8]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [esi+1+8]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [esi+4+8]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [esi+2+8]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [esi+3+8]
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3	; b + e
-	paddw xmm4, xmm5	; c + d
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1	; a + f
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4	; a + f + 20*(c+d) - 5*(b+e)
-	paddw xmm0, xmm6	; + 16 for rounding
-	psraw xmm0, 5	; divide by 32
-	packuswb xmm0, xmm7	; saturate to unsigned bytes
-	movq [edi+8], xmm0	; right 8 output pixels
-
-
-	add esi, eax	; next source row
-	add edi, edx	; next destination row
-	dec ecx
-	jnz .y_loop
-	pop edi
-	pop esi
-	ret
-
-
-ALIGN 16
-;***********************************************************************
-; void McHorVer22Width8HorFirst_sse2( uint8_t*pSrc,
-;									int32_t iSrcStride,
-;									uint8_t* pTap,
-;									int32_t iTapStride,
-;									int32_t iHeight);
-;***********************************************************************
-McHorVer22Width8HorFirst_sse2:	; horizontal 6-tap pass for 8 pixels per row; writes the unrounded 16-bit sums to pTap
-	push esi
-	push edi
-	push ebx	; three registers saved, so the first argument now sits at [esp+16]
-	mov esi, [esp+16]     ;pSrc
-	mov eax, [esp+20]	;src_stride
-	mov edi, [esp+24]		;tap
-	mov edx, [esp+28]	;tap_stride
-	mov ebx, [esp+32]	;i_height; the loop body runs before it is tested, so it must be at least 1
-	pxor xmm7, xmm7	; zero register for byte-to-word widening
-
-	sub esi, eax				; start two rows above pSrc; presumably the later vertical 6-tap pass needs 5 extra rows, included in i_height by the caller - confirm
-	sub esi, eax
-
-.yloop_width_8:	; NOTE(review): taps are read from [esi]..[esi+5] with no -2 adjustment here; presumably the caller passes pSrc already moved two pixels left - confirm
-	movq xmm0, [esi]	; tap a
-	punpcklbw xmm0, xmm7
-	movq xmm1, [esi+5]	; tap f
-	punpcklbw xmm1, xmm7
-	movq xmm2, [esi+1]	; tap b
-	punpcklbw xmm2, xmm7
-	movq xmm3, [esi+4]	; tap e
-	punpcklbw xmm3, xmm7
-	movq xmm4, [esi+2]	; tap c
-	punpcklbw xmm4, xmm7
-	movq xmm5, [esi+3]	; tap d
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3	; b + e
-	paddw xmm4, xmm5	; c + d
-	psllw xmm4, 2	; 4*(c+d)
-	psubw xmm4, xmm2	; 4*(c+d) - (b+e)
-	paddw xmm0, xmm1	; a + f
-	paddw xmm0, xmm4
-	psllw xmm4, 2	; 16*(c+d) - 4*(b+e)
-	paddw xmm0, xmm4	; a + f + 20*(c+d) - 5*(b+e), no rounding or shift applied here
-	movdqa [edi], xmm0	; aligned store of eight words: each pTap row must be 16-byte aligned
-
-	add esi, eax	; next source row
-	add edi, edx	; next tap row
-	dec ebx
-	jnz .yloop_width_8
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-;***********************************************************************
-; void McHorVer02WidthEq8_sse2( uint8_t *pSrc,
-;                       int32_t iSrcStride,
-;                       uint8_t *pDst,
-;                       int32_t iDstStride,
-;                       int32_t iHeight )
-;***********************************************************************
-ALIGN 16
-McHorVer02WidthEq8_sse2:	; vertical 6-tap filter for 8 pixels per row; a sliding window of six rows rotates through xmm0..xmm7
-	push esi
-	push edi	; two registers saved, so the first argument now sits at [esp+12]
-
-	mov esi, [esp + 12]	; pSrc
-	mov edx, [esp + 16]	; iSrcStride
-	mov edi, [esp + 20]	; pDst
-	mov eax, [esp + 24]	; iDstStride
-	mov ecx, [esp + 28]	; iHeight; the first row is filtered before the count is tested, so it must be at least 1
-
-	sub esi, edx
-	sub esi, edx	; the window starts two rows above the first output row
-
-	WELS_Zero xmm7	; macro from asm_inc.asm (not shown); presumably clears xmm7 for the widening loads - confirm
-
-	SSE_LOAD_8P xmm0, xmm7, [esi]	; rows -2 and -1
-	SSE_LOAD_8P xmm1, xmm7, [esi+edx]
-	lea esi, [esi+2*edx]
-	SSE_LOAD_8P xmm2, xmm7, [esi]	; rows 0 and 1
-	SSE_LOAD_8P xmm3, xmm7, [esi+edx]
-	lea esi, [esi+2*edx]
-	SSE_LOAD_8P xmm4, xmm7, [esi]	; rows 2 and 3
-	SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-
-.start:	; eight unrolled steps; each FILTER_HV_W8 leaves its 8th operand zeroed, and the following SSE_LOAD_8P uses that register as its zero
-	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
-	dec ecx
-	jz near .xx_exit
-
-	lea esi, [esi+2*edx]	; source rows are fetched in pairs: [esi] now, [esi+edx] in the next step
-	SSE_LOAD_8P xmm6, xmm7, [esi]
-	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
-	dec ecx
-	jz near .xx_exit
-
-	lea edi, [edi+2*eax]	; destination rows are written in pairs: [edi] now, [edi+eax] in the next step
-	SSE_LOAD_8P xmm7, xmm0, [esi+edx]
-	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
-	dec ecx
-	jz near .xx_exit
-
-	lea esi, [esi+2*edx]
-	SSE_LOAD_8P xmm0, xmm1, [esi]
-	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [edi+eax]
-	dec ecx
-	jz near .xx_exit
-
-	lea edi, [edi+2*eax]
-	SSE_LOAD_8P xmm1, xmm2, [esi+edx]
-	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [edi]
-	dec ecx
-	jz near .xx_exit
-
-	lea esi, [esi+2*edx]
-	SSE_LOAD_8P xmm2, xmm3, [esi]
-	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [edi+eax]
-	dec ecx
-	jz near .xx_exit
-
-	lea edi, [edi+2*eax]
-	SSE_LOAD_8P xmm3, xmm4, [esi+edx]
-	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [edi]
-	dec ecx
-	jz near .xx_exit
-
-	lea esi, [esi+2*edx]
-	SSE_LOAD_8P xmm4, xmm5, [esi]
-	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [edi+eax]
-	dec ecx
-	jz near .xx_exit
-
-	lea edi, [edi+2*eax]
-	SSE_LOAD_8P xmm5, xmm6, [esi+edx]	; after eight steps the register roles are back where .start expects them
-	jmp near .start
-
-.xx_exit:
-	pop edi
-	pop esi
-	ret
-
-
-
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-ALIGN 16
-h264_w0x10_1:
-	dw 16, 16, 16, 16, 16, 16, 16, 16
-ALIGN 16
-h264_mc_hc_32:
-	dw 32, 32, 32, 32, 32, 32, 32, 32
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN McHorVer20_sse2
-WELS_EXTERN McHorVer02_sse2
-WELS_EXTERN McHorVer22VerLastAlign_sse2
-WELS_EXTERN McHorVer22VerLastUnAlign_sse2
-WELS_EXTERN McHorVer22HorFirst_sse2
-
-
-;***********************************************************************
-; void McHorVer02_sse2(	uint8_t *pSrc,
-;                       int32_t iSrcStride,
-;                       uint8_t *pDst,
-;                       int32_t iDstStride,
-;						int32_t iWidth,
-;                       int32_t iHeight )
-;***********************************************************************
-ALIGN 16
-McHorVer02_sse2:
-	push esi
-	push edi
-	push ebx
-
-	mov esi, [esp + 16]
-	mov edx, [esp + 20]
-	mov edi, [esp + 24]
-	mov eax, [esp + 28]
-	mov ecx, [esp + 36]
-	mov ebx, [esp + 32]
-	shr ebx, 3
-	sub esi, edx
-	sub esi, edx
-
-.xloop:
-	WELS_Zero xmm7
-	SSE_LOAD_8P xmm0, xmm7, [esi]
-	SSE_LOAD_8P xmm1, xmm7, [esi+edx]
-	lea esi, [esi+2*edx]
-	SSE_LOAD_8P xmm2, xmm7, [esi]
-	SSE_LOAD_8P xmm3, xmm7, [esi+edx]
-	lea esi, [esi+2*edx]
-	SSE_LOAD_8P xmm4, xmm7, [esi]
-	SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-
-	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
-	dec ecx
-	lea esi, [esi+2*edx]
-	SSE_LOAD_8P xmm6, xmm7, [esi]
-	movdqa xmm0,xmm1
-	movdqa xmm1,xmm2
-	movdqa xmm2,xmm3
-	movdqa xmm3,xmm4
-	movdqa xmm4,xmm5
-	movdqa xmm5,xmm6
-	add edi, eax
-	sub esi, edx
-
-.start:
-	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea esi, [esi+2*edx]
-	SSE_LOAD_8P xmm6, xmm7, [esi]
-	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea edi, [edi+2*eax]
-	SSE_LOAD_8P xmm7, xmm0, [esi+edx]
-	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea esi, [esi+2*edx]
-	SSE_LOAD_8P xmm0, xmm1, [esi]
-	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [edi+eax]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea edi, [edi+2*eax]
-	SSE_LOAD_8P xmm1, xmm2, [esi+edx]
-	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [edi]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea esi, [esi+2*edx]
-	SSE_LOAD_8P xmm2, xmm3, [esi]
-	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [edi+eax]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea edi, [edi+2*eax]
-	SSE_LOAD_8P xmm3, xmm4, [esi+edx]
-	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [edi]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea esi, [esi+2*edx]
-	SSE_LOAD_8P xmm4, xmm5, [esi]
-	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [edi+eax]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea edi, [edi+2*eax]
-	SSE_LOAD_8P xmm5, xmm6, [esi+edx]
-	jmp near .start
-
-.x_loop_dec:
-	dec ebx
-	jz  near .xx_exit
-	mov esi, [esp + 16]
-	mov edi, [esp + 24]
-	sub esi, edx
-	sub esi, edx
-	add esi, 8
-	add edi, 8
-	mov ecx, [esp + 36]
-	jmp near .xloop
-
-.xx_exit:
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-ALIGN 16
-;***********************************************************************
-; void McHorVer20_sse2(		uint8_t *pSrc,
-;                       int32_t iSrcStride,
-;						uint8_t *pDst,
-;						int32_t iDstStride,
-;						int32_t iWidth,
-;						int32_t iHeight
-;                      );
-;***********************************************************************
-McHorVer20_sse2:
-	push esi
-	push edi
-	push ebx
-	mov esi, [esp+16]
-	mov eax, [esp+20]
-	mov edi, [esp+24]
-	mov edx, [esp+28]
-	mov ecx, [esp+32]
-	mov ebx, [esp+36]
-	sub esi, 2
-	pxor xmm7, xmm7
-
-	cmp ecx, 9
-	jne near .width_17
-
-.yloop_width_9:
-	movq xmm0, [esi]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [esi+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [esi+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [esi+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [esi+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [esi+3]
-	punpcklbw xmm5, xmm7
-
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	paddw xmm0, [h264_w0x10_1]
-	psraw  xmm0, 5
-	packuswb xmm0, xmm0
-	movd [edi], xmm0
-
-	pxor  xmm7, xmm7
-	movq xmm0, [esi+6]
-	punpcklbw xmm0, xmm7
-
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	paddw xmm2, [h264_w0x10_1]
-	psraw  xmm2, 5
-	packuswb xmm2, xmm2
-	movq [edi+1], xmm2
-
-	add esi, eax
-	add edi, edx
-	dec ebx
-	jnz .yloop_width_9
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-.width_17:
-.yloop_width_17:
-	movq xmm0, [esi]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [esi+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [esi+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [esi+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [esi+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [esi+3]
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, [h264_w0x10_1]
-	psraw  xmm0, 5
-	packuswb xmm0, xmm0
-	movq [edi], xmm0
-
-	movq xmm0, [esi+8]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [esi+5+8]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [esi+1+8]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [esi+4+8]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [esi+2+8]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [esi+3+8]
-	punpcklbw xmm5, xmm7
-
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	paddw xmm0, [h264_w0x10_1]
-	psraw  xmm0, 5
-	packuswb xmm0, xmm0
-	movd [edi+8], xmm0
-
-
-	pxor  xmm7, xmm7
-	movq xmm0, [esi+6+8]
-	punpcklbw xmm0, xmm7
-
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	paddw xmm2, [h264_w0x10_1]
-	psraw  xmm2, 5
-	packuswb xmm2, xmm2
-	movq [edi+9], xmm2
-	add esi, eax
-	add edi, edx
-	dec ebx
-	jnz .yloop_width_17
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-
-ALIGN 16
-;***********************************************************************
-;void McHorVer22HorFirst_sse2
-;							(uint8_t *pSrc,
-;							int32_t iSrcStride,
-;							uint8_t * pTap,
-;							int32_t iTapStride,
-;							int32_t iWidth,int32_t iHeight);
-;***********************************************************************
-McHorVer22HorFirst_sse2:
-	push esi
-	push edi
-	push ebx
-	mov esi, [esp+16]
-	mov eax, [esp+20]
-	mov edi, [esp+24]
-	mov edx, [esp+28]
-	mov ecx, [esp+32]
-	mov ebx, [esp+36]
-	pxor xmm7, xmm7
-
-	sub esi, eax				;;;;;;;;need more 5 lines.
-	sub esi, eax
-
-	cmp ecx, 9
-	jne near .width_17
-
-.yloop_width_9:
-	movq xmm0, [esi]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [esi+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [esi+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [esi+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [esi+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [esi+3]
-	punpcklbw xmm5, xmm7
-
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	movd [edi], xmm0
-
-	pxor  xmm7, xmm7
-	movq xmm0, [esi+6]
-	punpcklbw xmm0, xmm7
-
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	movq [edi+2], xmm2
-	movhps [edi+2+8], xmm2
-
-	add esi, eax
-	add edi, edx
-	dec ebx
-	jnz .yloop_width_9
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-.width_17:
-.yloop_width_17:
-	movq xmm0, [esi]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [esi+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [esi+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [esi+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [esi+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [esi+3]
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	movdqa [edi], xmm0
-
-	movq xmm0, [esi+8]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [esi+5+8]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [esi+1+8]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [esi+4+8]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [esi+2+8]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [esi+3+8]
-	punpcklbw xmm5, xmm7
-
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	movd [edi+16], xmm0
-
-
-	pxor  xmm7, xmm7
-	movq xmm0, [esi+6+8]
-	punpcklbw xmm0, xmm7
-
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	movq [edi+18], xmm2
-	movhps [edi+18+8], xmm2
-
-	add esi, eax
-	add edi, edx
-	dec ebx
-	jnz .yloop_width_17
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-%macro FILTER_VER 9
-	paddw  %1, %6
-	movdqa %7, %2
-	movdqa %8, %3
-
-
-	paddw %7, %5
-	paddw %8, %4
-
-	psubw  %1, %7
-	psraw   %1, 2
-	paddw  %1, %8
-	psubw  %1, %7
-	psraw   %1, 2
-	paddw  %8, %1
-	paddw  %8, [h264_mc_hc_32]
-	psraw   %8, 6
-	packuswb %8, %8
-	movq %9, %8
-%endmacro
-;***********************************************************************
-;void McHorVer22VerLastAlign_sse2(
-;											uint8_t *pTap,
-;											int32_t iTapStride,
-;											uint8_t * pDst,
-;											int32_t iDstStride,
-;											int32_t iWidth,
-;											int32_t iHeight);
-;***********************************************************************
-
- McHorVer22VerLastAlign_sse2:
-	push esi
-	push edi
-	push ebx
-	push ebp
-
-	mov esi, [esp+20]
-	mov eax, [esp+24]
-	mov edi, [esp+28]
-	mov edx, [esp+32]
-	mov ebx, [esp+36]
-	mov ecx, [esp+40]
-	shr ebx, 3
-
-.width_loop:
-	movdqa xmm0, [esi]
-	movdqa xmm1, [esi+eax]
-	lea esi, [esi+2*eax]
-	movdqa xmm2, [esi]
-	movdqa xmm3, [esi+eax]
-	lea esi, [esi+2*eax]
-	movdqa xmm4, [esi]
-	movdqa xmm5, [esi+eax]
-
-	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
-	dec ecx
-	lea esi, [esi+2*eax]
-	movdqa xmm6, [esi]
-
-	movdqa xmm0, xmm1
-	movdqa xmm1, xmm2
-	movdqa xmm2, xmm3
-	movdqa xmm3, xmm4
-	movdqa xmm4, xmm5
-	movdqa xmm5, xmm6
-
-	add edi, edx
-	sub esi, eax
-
-.start:
-	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea esi, [esi+2*eax]
-	movdqa xmm6, [esi]
-	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea edi, [edi+2*edx]
-	movdqa xmm7, [esi+eax]
-	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea esi, [esi+2*eax]
-	movdqa xmm0, [esi]
-	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea edi, [edi+2*edx]
-	movdqa xmm1, [esi+eax]
-	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea esi, [esi+2*eax]
-	movdqa xmm2, [esi]
-	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea edi, [edi+2*edx]
-	movdqa xmm3, [esi+eax]
-	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea esi, [esi+2*eax]
-	movdqa xmm4, [esi]
-	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea edi, [edi+2*edx]
-	movdqa xmm5, [esi+eax]
-	jmp near .start
-
-.x_loop_dec:
-	dec ebx
-	jz near .exit
-	mov esi, [esp+20]
-	mov edi, [esp+28]
-	mov ecx, [esp+40]
-	add esi, 16
-	add edi, 8
-	jmp .width_loop
-
-
-
-.exit:
-	pop ebp
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-;***********************************************************************
-;void McHorVer22VerLastUnAlign_sse2(
-;											uint8_t *pTap,
-;											int32_t iTapStride,
-;											uint8_t * pDst,
-;											int32_t iDstStride,
-;											int32_t iWidth,
-;											int32_t iHeight);
-;***********************************************************************
-
- McHorVer22VerLastUnAlign_sse2:
-	push esi
-	push edi
-	push ebx
-	push ebp
-
-	mov esi, [esp+20]
-	mov eax, [esp+24]
-	mov edi, [esp+28]
-	mov edx, [esp+32]
-	mov ebx, [esp+36]
-	mov ecx, [esp+40]
-	shr ebx, 3
-
-.width_loop:
-	movdqu xmm0, [esi]
-	movdqu xmm1, [esi+eax]
-	lea esi, [esi+2*eax]
-	movdqu xmm2, [esi]
-	movdqu xmm3, [esi+eax]
-	lea esi, [esi+2*eax]
-	movdqu xmm4, [esi]
-	movdqu xmm5, [esi+eax]
-
-	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
-	dec ecx
-	lea esi, [esi+2*eax]
-	movdqu xmm6, [esi]
-
-	movdqa xmm0, xmm1
-	movdqa xmm1, xmm2
-	movdqa xmm2, xmm3
-	movdqa xmm3, xmm4
-	movdqa xmm4, xmm5
-	movdqa xmm5, xmm6
-
-	add edi, edx
-	sub esi, eax
-
-.start:
-	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea esi, [esi+2*eax]
-	movdqu xmm6, [esi]
-	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea edi, [edi+2*edx]
-	movdqu xmm7, [esi+eax]
-	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea esi, [esi+2*eax]
-	movdqu xmm0, [esi]
-	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea edi, [edi+2*edx]
-	movdqu xmm1, [esi+eax]
-	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea esi, [esi+2*eax]
-	movdqu xmm2, [esi]
-	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea edi, [edi+2*edx]
-	movdqu xmm3, [esi+eax]
-	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea esi, [esi+2*eax]
-	movdqu xmm4, [esi]
-	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
-	dec ecx
-	jz near .x_loop_dec
-
-	lea edi, [edi+2*edx]
-	movdqu xmm5, [esi+eax]
-	jmp near .start
-
-.x_loop_dec:
-	dec ebx
-	jz near .exit
-	mov esi, [esp+20]
-	mov edi, [esp+28]
-	mov ecx, [esp+40]
-	add esi, 16
-	add edi, 8
-	jmp .width_loop
-
-
-
-.exit:
-	pop ebp
-	pop ebx
-	pop edi
-	pop esi
-	ret
--- a/codec/encoder/core/asm/memzero.asm
+++ b/codec/encoder/core/asm/memzero.asm
@@ -32,8 +32,8 @@
 ;*  memzero.asm
 ;*
 ;*  Abstract
+;*      
 ;*
-;*
 ;*  History
 ;*      9/16/2009 Created
 ;*
@@ -40,15 +40,13 @@
 ;*
 ;*************************************************************************/
 
-BITS 32
-
 %include "asm_inc.asm"
 ;***********************************************************************
 ; Code
 ;***********************************************************************
 
-SECTION .text
-
+SECTION .text			
+		
 ALIGN 16
 ;***********************************************************************
 ;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
@@ -55,9 +53,11 @@
 ;***********************************************************************
 WELS_EXTERN WelsPrefetchZero_mmx
 WelsPrefetchZero_mmx:
-	mov  eax,[esp+4]
-	prefetchnta [eax]
-	ret
+	%assign  push_num 0
+	LOAD_1_PARA
+	;mov  eax,[esp+4]
+	prefetchnta [r0]
+	ret 			
 
 
 ALIGN 16
@@ -66,23 +66,25 @@
 ;***********************************************************************
 WELS_EXTERN WelsSetMemZeroAligned64_sse2
 WelsSetMemZeroAligned64_sse2:
-		mov		eax,	[esp + 4]          ; dst
-		mov		ecx,	[esp + 8]
-		neg		ecx
 
+		%assign  push_num 0
+		LOAD_2_PARA
+		SIGN_EXTENTION r1, r1d
+		neg		r1
+			
 		pxor	xmm0,		xmm0
 .memzeroa64_sse2_loops:
-		movdqa	[eax],		xmm0
-		movdqa	[eax+16],	xmm0
-		movdqa	[eax+32],	xmm0
-		movdqa	[eax+48],	xmm0
-		add		eax, 0x40
-
-		add ecx, 0x40
+		movdqa	[r0],		xmm0
+		movdqa	[r0+16],	xmm0
+		movdqa	[r0+32],	xmm0
+		movdqa	[r0+48],	xmm0
+		add		r0, 0x40
+		
+		add r1, 0x40
 		jnz near .memzeroa64_sse2_loops
+			
+		ret	
 
-		ret
-
 ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -89,47 +91,51 @@
 ;***********************************************************************
 WELS_EXTERN WelsSetMemZeroSize64_mmx
 WelsSetMemZeroSize64_mmx:
-		mov		eax,	[esp + 4]          ; dst
-		mov		ecx,	[esp + 8]
-		neg		ecx
 
+		%assign  push_num 0
+		LOAD_2_PARA
+		SIGN_EXTENTION r1, r1d
+		neg		r1
+			
 		pxor	mm0,		mm0
 .memzero64_mmx_loops:
-		movq	[eax],		mm0
-		movq	[eax+8],	mm0
-		movq	[eax+16],	mm0
-		movq	[eax+24],	mm0
-		movq	[eax+32],	mm0
-		movq	[eax+40],	mm0
-		movq	[eax+48],	mm0
-		movq	[eax+56],	mm0
-		add		eax,		0x40
-
-		add ecx, 0x40
+		movq	[r0],		mm0
+		movq	[r0+8],	mm0
+		movq	[r0+16],	mm0
+		movq	[r0+24],	mm0
+		movq	[r0+32],	mm0
+		movq	[r0+40],	mm0
+		movq	[r0+48],	mm0
+		movq	[r0+56],	mm0		
+		add		r0,		0x40
+		
+		add r1, 0x40
 		jnz near .memzero64_mmx_loops
-
-		WELSEMMS
-		ret
-
-ALIGN 16
+			
+		WELSEMMS	
+		ret	
+	
+ALIGN 16		
 ;***********************************************************************
 ;   void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
 ;***********************************************************************
 WELS_EXTERN WelsSetMemZeroSize8_mmx
 WelsSetMemZeroSize8_mmx:
-		mov		eax,	[esp + 4]		; dst
-		mov		ecx,	[esp + 8]		; size
-		neg		ecx
-		pxor	mm0,		mm0
 
+		%assign  push_num 0
+		LOAD_2_PARA
+		SIGN_EXTENTION r1, r1d
+		neg		r1			
+		pxor	mm0,		mm0
+		
 .memzero8_mmx_loops:
-		movq	[eax],		mm0
-		add		eax,		0x08
-
-		add		ecx,		0x08
+		movq	[r0],		mm0
+		add		r0,		0x08
+	
+		add		r1,		0x08
 		jnz near .memzero8_mmx_loops
+		
+		WELSEMMS	
+		ret	
 
-		WELSEMMS
-		ret
-
-
+							
--- a/codec/encoder/core/asm/quant.asm
+++ b/codec/encoder/core/asm/quant.asm
@@ -42,7 +42,6 @@
 
 %include "asm_inc.asm"
 
-BITS 32
 
 SECTION .text
 ;************************************************
@@ -86,14 +85,16 @@
 WELS_EXTERN WelsQuant4x4_sse2
 align 16
 WelsQuant4x4_sse2:
-		mov		eax,  [ff]
-		mov		ecx,  [mf]
-		MOVDQ	xmm2, [eax]
-		MOVDQ	xmm3, [ecx]
+		%assign push_num 0
+                LOAD_3_PARA
+		;mov		eax,  [ff]
+		;mov		ecx,  [mf]
+		movdqa	xmm2, [r1]
+		movdqa	xmm3, [r2]
 
-		mov		edx,  [pDct]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
+		;mov		edx,  [pDct]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
 
 		ret
 
@@ -103,15 +104,21 @@
 WELS_EXTERN WelsQuant4x4Dc_sse2
 align 16
 WelsQuant4x4Dc_sse2:
-		mov		ax,		[mf]
-		SSE2_Copy8Times xmm3, eax
+ 		%assign push_num 0
+		LOAD_3_PARA
+		%ifndef X86_32
+		movsx r1, r1w
+		movsx r2, r2w
+		%endif
+		;mov		ax,		[mf]
+		SSE2_Copy8Times xmm3, r2d
 
-		mov		cx, [ff]
-		SSE2_Copy8Times xmm2, ecx
+		;mov		cx, [ff]
+		SSE2_Copy8Times xmm2, r1d
 
-		mov		edx,  [pDct]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
+		;mov		edx,  [pDct]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
 
 		ret
 
@@ -121,20 +128,22 @@
 WELS_EXTERN WelsQuantFour4x4_sse2
 align 16
 WelsQuantFour4x4_sse2:
-		mov		eax,  [ff]
-		mov		ecx,  [mf]
-		MOVDQ	xmm2, [eax]
-		MOVDQ	xmm3, [ecx]
+		%assign push_num 0
+		LOAD_3_PARA
+		;mov		eax,  [ff]
+		;mov		ecx,  [mf]
+		MOVDQ	xmm2, [r1]
+		MOVDQ	xmm3, [r2]
 
-		mov		edx,  [pDct]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x20]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x30]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x40]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x50]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x60]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x70]
+		;mov		edx,  [pDct]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x70]
 
 		ret
 
@@ -144,24 +153,26 @@
 WELS_EXTERN WelsQuantFour4x4Max_sse2
 align 16
 WelsQuantFour4x4Max_sse2:
-		mov		eax,  [ff]
-		mov		ecx,  [mf]
-		MOVDQ	xmm2, [eax]
-		MOVDQ	xmm3, [ecx]
+		%assign push_num 0
+		LOAD_4_PARA
+		;mov		eax,  [ff]
+		;mov		ecx,  [mf]
+		MOVDQ	xmm2, [r1]
+		MOVDQ	xmm3, [r2]
 
-		mov		edx,  [pDct]
+		;mov		edx,  [pDct]
 		pxor	xmm4, xmm4
 		pxor	xmm5, xmm5
 		pxor	xmm6, xmm6
 		pxor	xmm7, xmm7
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx	   ], xmm4
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10], xmm4
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x20], xmm5
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x30], xmm5
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x40], xmm6
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x50], xmm6
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x60], xmm7
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x70], xmm7
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0	  ], xmm4
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7
 
 		SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
 		pmaxsw  xmm0,  xmm4
@@ -171,9 +182,9 @@
 		punpckhqdq	xmm0, xmm1
 		pmaxsw	xmm0, xmm1
 
-		mov		edx,  [max]
-		movq	[edx], xmm0
-
+		;mov		r0,  [r3]
+		movq	[r3], xmm0
+		LOAD_4_PARA_POP
 		ret
 
 %macro  MMX_Copy4Times 2
@@ -203,21 +214,20 @@
 WELS_EXTERN WelsHadamardQuant2x2_mmx
 align 16
 WelsHadamardQuant2x2_mmx:
-
-		mov			eax,			[pDct]
-		movd		mm0,			[eax]
-		movd		mm1,			[eax + 0x20]
+		%assign push_num 0
+		LOAD_5_PARA
+		%ifndef X86_32
+		movsx r1, r1w
+		movsx r2, r2w
+		%endif
+		;mov			eax,			[pDct]
+		movd		mm0,			[r0]
+		movd		mm1,			[r0 + 0x20]
 		punpcklwd	mm0,			mm1
-		movd		mm3,			[eax + 0x40]
-		movd		mm1,			[eax + 0x60]
+		movd		mm3,			[r0 + 0x40]
+		movd		mm1,			[r0 + 0x60]
 		punpcklwd	mm3,			mm1
 
-		mov			cx,				0
-		mov			[eax],			cx
-		mov			[eax + 0x20],	cx
-		mov			[eax + 0x40],	cx
-		mov			[eax + 0x60],	cx
-
 		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
 		movq		mm5,			mm3
 		paddw		mm3,			mm0
@@ -231,17 +241,17 @@
 		punpcklwd	mm1,			mm3
 
 		;quant_2x2_dc
-		mov			ax,				[mf]
-		MMX_Copy4Times	mm3,		eax
-		mov			cx,				[ff]
-		MMX_Copy4Times	mm2,		ecx
+		;mov			ax,				[mf]
+		MMX_Copy4Times	mm3,		r2d
+		;mov			cx,				[ff]
+		MMX_Copy4Times	mm2,		r1d
 		MMX_Quant4		mm1,	mm0,	mm2,	mm3
 
 		; store dct_2x2
-		mov			edx,			[dct2x2]
-		movq		[edx],			mm1
-		mov			ecx,			[iChromaDc]
-		movq		[ecx],			mm1
+		;mov			edx,			[dct2x2]
+		movq		[r3],			mm1
+		;mov			ecx,			[iChromaDc]
+		movq		[r4],			mm1
 
 		; pNonZeroCount of dct_2x2
 		pcmpeqb		mm2,			mm2		; mm2 = FF
@@ -250,9 +260,17 @@
 		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
 		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
 		psadbw		mm1,			mm3		;
-		movd		eax,			mm1
+		mov			r1w,				0
+		mov			[r0],			r1w
+		mov			[r0 + 0x20],	r1w
+		mov			[r0 + 0x40],	r1w
+		mov			[r0 + 0x60],	r1w
 
+
+		movd		retrd,		mm1
+
 		WELSEMMS
+		LOAD_5_PARA_POP
 		ret
 
 ;***********************************************************************
@@ -261,13 +279,18 @@
 WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
 align 16
 WelsHadamardQuant2x2Skip_mmx:
-
-		mov			eax,			[pDct]
-		movd		mm0,			[eax]
-		movd		mm1,			[eax + 0x20]
+		%assign push_num 0
+		LOAD_3_PARA
+		%ifndef X86_32
+		movsx r1, r1w
+		movsx r2, r2w
+		%endif
+		;mov			eax,			[pDct]
+		movd		mm0,			[r0]
+		movd		mm1,			[r0 + 0x20]
 		punpcklwd	mm0,			mm1
-		movd		mm3,			[eax + 0x40]
-		movd		mm1,			[eax + 0x60]
+		movd		mm3,			[r0 + 0x40]
+		movd		mm1,			[r0 + 0x60]
 		punpcklwd	mm3,			mm1
 
 		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
@@ -283,10 +306,10 @@
 		punpcklwd	mm1,			mm3
 
 		;quant_2x2_dc
-		mov			ax,				[mf]
-		MMX_Copy4Times	mm3,		eax
-		mov			cx,				[ff]
-		MMX_Copy4Times	mm2,		ecx
+		;mov			ax,				[mf]
+		MMX_Copy4Times	mm3,		r2d
+		;mov			cx,				[ff]
+		MMX_Copy4Times	mm2,		r1d
 		MMX_Quant4		mm1,	mm0,	mm2,	mm3
 
 		; pNonZeroCount of dct_2x2
@@ -296,7 +319,7 @@
 		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
 		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
 		psadbw		mm1,			mm3		;
-		movd		eax,			mm1
+		movd		retrd,			mm1
 
 		WELSEMMS
 		ret
@@ -317,12 +340,14 @@
 WELS_EXTERN WelsDequant4x4_sse2
 WelsDequant4x4_sse2:
 	;ecx = dequant_mf[qp], edx = pDct
-	mov		ecx,  [esp + 8]
-	mov		edx,  [esp + 4]
+	%assign push_num 0
+	LOAD_2_PARA
+	;mov		ecx,  [esp + 8]
+	;mov		edx,  [esp + 4]
 
-	movdqa  xmm1, [ecx]
-	SSE2_DeQuant8 [edx		],  xmm0, xmm1
-	SSE2_DeQuant8 [edx+0x10	],  xmm0, xmm1
+	movdqa  xmm1, [r1]
+	SSE2_DeQuant8 [r0	],  xmm0, xmm1
+	SSE2_DeQuant8 [r0 + 0x10],  xmm0, xmm1
 
     ret
 
@@ -335,18 +360,20 @@
 WELS_EXTERN WelsDequantFour4x4_sse2
 WelsDequantFour4x4_sse2:
     ;ecx = dequant_mf[qp], edx = pDct
-	mov		ecx,  [esp + 8]
-	mov		edx,  [esp + 4]
+	%assign push_num 0
+	LOAD_2_PARA
+	;mov		ecx,  [esp + 8]
+	;mov		edx,  [esp + 4]
 
-	movdqa  xmm1, [ecx]
-	SSE2_DeQuant8 [edx		],  xmm0, xmm1
-	SSE2_DeQuant8 [edx+0x10	],  xmm0, xmm1
-	SSE2_DeQuant8 [edx+0x20	],  xmm0, xmm1
-	SSE2_DeQuant8 [edx+0x30	],  xmm0, xmm1
-	SSE2_DeQuant8 [edx+0x40	],  xmm0, xmm1
-	SSE2_DeQuant8 [edx+0x50	],  xmm0, xmm1
-	SSE2_DeQuant8 [edx+0x60	],  xmm0, xmm1
-	SSE2_DeQuant8 [edx+0x70	],  xmm0, xmm1
+	movdqa  xmm1, [r1]
+	SSE2_DeQuant8 [r0	],  xmm0, xmm1
+	SSE2_DeQuant8 [r0+0x10	],  xmm0, xmm1
+	SSE2_DeQuant8 [r0+0x20	],  xmm0, xmm1
+	SSE2_DeQuant8 [r0+0x30	],  xmm0, xmm1
+	SSE2_DeQuant8 [r0+0x40	],  xmm0, xmm1
+	SSE2_DeQuant8 [r0+0x50	],  xmm0, xmm1
+	SSE2_DeQuant8 [r0+0x60	],  xmm0, xmm1
+	SSE2_DeQuant8 [r0+0x70	],  xmm0, xmm1
 
     ret
 
@@ -356,14 +383,19 @@
 WELS_EXTERN WelsDequantIHadamard4x4_sse2
 align 16
 WelsDequantIHadamard4x4_sse2:
-		mov			eax,			[esp + 4]
-		mov			cx,				[esp + 8]
+		%assign push_num 0
+		LOAD_2_PARA
+		%ifndef X86_32
+		movzx r1, r1w
+		%endif
+		;mov			eax,			[esp + 4]
+		;mov			cx,				[esp + 8]
 
 		; WelsDequantLumaDc4x4
-		SSE2_Copy8Times	xmm1,		ecx
+		SSE2_Copy8Times	xmm1,		r1d
 		;psrlw		xmm1,		2		; for the (>>2) in ihdm
-		MOVDQ		xmm0,		[eax]
-		MOVDQ		xmm2,		[eax+0x10]
+		MOVDQ		xmm0,		[r0]
+		MOVDQ		xmm2,		[r0+0x10]
 		pmullw		xmm0,		xmm1
 		pmullw		xmm2,		xmm1
 
@@ -386,8 +418,8 @@
 		SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
 
 		punpcklqdq	xmm0,		xmm1
-		MOVDQ		[eax],		xmm0
+		MOVDQ		[r0],		xmm0
 
 		punpcklqdq	xmm2,		xmm3
-		MOVDQ		[eax+16],	xmm2
+		MOVDQ		[r0+16],	xmm2
 		ret
--- a/codec/encoder/core/asm/satd_sad.asm
+++ b/codec/encoder/core/asm/satd_sad.asm
@@ -1,2189 +1,2344 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  satd_sad.asm
-;*
-;*  Abstract
-;*      WelsSampleSatd4x4_sse2
-;*      WelsSampleSatd8x8_sse2
-;*      WelsSampleSatd16x8_sse2
-;*      WelsSampleSatd8x16_sse2
-;*      WelsSampleSatd16x16_sse2
-;*
-;*      WelsSampleSad16x8_sse2
-;*      WelsSampleSad16x16_sse2
-;*
-;*  History
-;*      8/5/2009 Created
-;*     24/9/2009 modified
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-
-;***********************************************************************
-; Data
-;***********************************************************************
-SECTION .rodata align=16
-
-align 16
-HSumSubDB1:   db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
-align 16
-HSumSubDW1:   dw 1,-1,1,-1,1,-1,1,-1
-align 16
-PDW1:  dw 1,1,1,1,1,1,1,1
-align 16
-PDQ2:  dw 2,0,0,0,2,0,0,0
-align 16
-HSwapSumSubDB1:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
-
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse2 BEGIN
-;
-;***********************************************************************
-%macro MMX_DW_1_2REG 2
-      pxor %1, %1
-      pcmpeqw %2, %2
-      psubw %1, %2
-%endmacro
-
-%macro  SSE2_SumWHorizon1 2
-	movdqa      %2, %1
-	psrldq      %2, 8
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 4
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 2
-	paddusw     %1, %2
-%endmacro
-
-%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
-   SSE2_SumSub %1, %2, %5
-   SSE2_SumSub %3, %4, %5
-   SSE2_SumSub %2, %4, %5
-   SSE2_SumSub %1, %3, %5
-%endmacro
-
-%macro SSE2_SumAbs4 7
-	WELS_AbsW %1, %3
-	WELS_AbsW %2, %3
-	WELS_AbsW %4, %6
-	WELS_AbsW %5, %6
-	paddusw       %1, %2
-	paddusw       %4, %5
-	paddusw       %7, %1
-	paddusw       %7, %4
-%endmacro
-
-%macro  SSE2_SumWHorizon 3
-	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
-	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
-	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
-	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
-	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
-%endmacro
-
-%macro SSE2_GetSatd8x8 0
-	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[eax],[ecx]
-	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[eax+ebx],[ecx+edx]
-	lea                 eax, [eax+2*ebx]
-	lea                 ecx, [ecx+2*edx]
-	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[eax],[ecx]
-	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[eax+ebx],[ecx+edx]
-
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
-	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-
-	lea					eax,    [eax+2*ebx]
-    lea					ecx,    [ecx+2*edx]
-	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[eax],[ecx]
-	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[eax+ebx],[ecx+edx]
-	lea                 eax, [eax+2*ebx]
-	lea                 ecx, [ecx+2*edx]
-	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[eax],[ecx]
-	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[eax+ebx],[ecx+edx]
-
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
-	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-%endmacro
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd4x4_sse2
-align 16
-WelsSampleSatd4x4_sse2:
-	push      ebx
-	mov       eax,  [esp+8]
-	mov       ebx,  [esp+12]
-	mov       ecx,  [esp+16]
-	mov       edx,  [esp+20]
-
-    movd      xmm0, [eax]
-    movd      xmm1, [eax+ebx]
-    lea       eax , [eax+2*ebx]
-    movd      xmm2, [eax]
-    movd      xmm3, [eax+ebx]
-    punpckldq xmm0, xmm2
-    punpckldq xmm1, xmm3
-
-    movd      xmm4, [ecx]
-    movd      xmm5, [ecx+edx]
-    lea       ecx , [ecx+2*edx]
-    movd      xmm6, [ecx]
-    movd      xmm7, [ecx+edx]
-    punpckldq xmm4, xmm6
-    punpckldq xmm5, xmm7
-
-    pxor      xmm6, xmm6
-    punpcklbw xmm0, xmm6
-    punpcklbw xmm1, xmm6
-    punpcklbw xmm4, xmm6
-    punpcklbw xmm5, xmm6
-
-    psubw     xmm0, xmm4
-    psubw     xmm1, xmm5
-
-    movdqa    xmm2, xmm0
-    paddw     xmm0, xmm1
-    psubw     xmm2, xmm1
-    SSE2_XSawp qdq, xmm0, xmm2, xmm3
-
-    movdqa     xmm4, xmm0
-    paddw      xmm0, xmm3
-    psubw      xmm4, xmm3
-
-    movdqa         xmm2, xmm0
-    punpcklwd      xmm0, xmm4
-    punpckhwd      xmm4, xmm2
-
-	SSE2_XSawp     dq,  xmm0, xmm4, xmm3
-	SSE2_XSawp     qdq, xmm0, xmm3, xmm5
-
-    movdqa         xmm7, xmm0
-    paddw          xmm0, xmm5
-    psubw          xmm7, xmm5
-
-	SSE2_XSawp     qdq,  xmm0, xmm7, xmm1
-
-    movdqa         xmm2, xmm0
-    paddw          xmm0, xmm1
-    psubw          xmm2, xmm1
-
-    WELS_AbsW  xmm0, xmm3
-    paddusw        xmm6, xmm0
-	WELS_AbsW  xmm2, xmm4
-    paddusw        xmm6, xmm2
-    SSE2_SumWHorizon1  xmm6, xmm4
-	movd           eax,  xmm6
-    and            eax,  0xffff
-    shr            eax,  1
-	pop            ebx
-	ret
-
- ;***********************************************************************
- ;
- ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
- ;
- ;***********************************************************************
- WELS_EXTERN WelsSampleSatd8x8_sse2
-align 16
- WelsSampleSatd8x8_sse2:
-	 push   ebx
-	 mov    eax,    [esp+8]
-	 mov    ebx,    [esp+12]
-	 mov    ecx,    [esp+16]
-	 mov    edx,    [esp+20]
-	 pxor   xmm6,   xmm6
-     pxor   xmm7,   xmm7
-     SSE2_GetSatd8x8
-     psrlw   xmm6,  1
-	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	 movd    eax,   xmm6
-	 pop     ebx
-	 ret
-
- ;***********************************************************************
- ;
- ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
- ;
- ;***********************************************************************
- WELS_EXTERN WelsSampleSatd8x16_sse2
-align 16
- WelsSampleSatd8x16_sse2:
-	 push   ebx
-	 mov    eax,    [esp+8]
-	 mov    ebx,    [esp+12]
-	 mov    ecx,    [esp+16]
-	 mov    edx,    [esp+20]
-	 pxor   xmm6,   xmm6
-     pxor   xmm7,   xmm7
-
-	 SSE2_GetSatd8x8
-     lea    eax,    [eax+2*ebx]
-     lea    ecx,    [ecx+2*edx]
-	 SSE2_GetSatd8x8
-
-	 psrlw   xmm6,  1
-	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	 movd    eax,   xmm6
-	 pop     ebx
-	 ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x8_sse2
-align 16
-WelsSampleSatd16x8_sse2:
-	push   ebx
-	mov    eax,    [esp+8]
-	mov    ebx,    [esp+12]
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-	pxor   xmm6,   xmm6
-    pxor   xmm7,   xmm7
-
-	SSE2_GetSatd8x8
-	mov    eax,    [esp+8]
-    mov    ecx,    [esp+16]
-    add    eax,    8
-    add    ecx,    8
-	SSE2_GetSatd8x8
-
-	psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    eax,   xmm6
-	pop     ebx
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x16_sse2
-align 16
-WelsSampleSatd16x16_sse2:
-	push   ebx
-	mov    eax,    [esp+8]
-	mov    ebx,    [esp+12]
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-	pxor   xmm6,   xmm6
-    pxor   xmm7,   xmm7
-
-	SSE2_GetSatd8x8
-	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-	SSE2_GetSatd8x8
-
-	mov    eax,    [esp+8]
-	mov    ecx,    [esp+16]
-	add    eax,    8
-	add    ecx,    8
-
-	SSE2_GetSatd8x8
-	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-	SSE2_GetSatd8x8
-
- ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
-    psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    eax,   xmm6
-	pop     ebx
-	ret
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse2 END
-;
-;***********************************************************************
-
-;***********************************************************************
-;
-;Pixel_satd_intra_sse2 BEGIN
-;
-;***********************************************************************
-
-%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
-	pmaddubsw    %1, xmm5
-	movdqa       %2, %1
-	pmaddwd      %1, xmm7
-	pmaddwd      %2, xmm6
-	movdqa       %3, %1
-	punpckldq    %1, %2
-	punpckhdq    %2, %3
-	movdqa       %3, %1
-	punpcklqdq   %1, %2
-	punpckhqdq   %3, %2
-	paddd        xmm4, %1 ;for dc
-	paddd        xmm4, %3 ;for dc
-	packssdw     %1, %3
-	psllw        %1, 2
-%endmacro
-%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
-	pmaddubsw    %1, xmm5
-	movdqa       %2, %1
-	pmaddwd      %1, xmm7
-	pmaddwd      %2, xmm6
-	movdqa       %3, %1
-	punpckldq    %1, %2
-	punpckhdq    %2, %3
-	movdqa       %3, %1
-	punpcklqdq   %1, %2
-	punpckhqdq   %3, %2
-;    paddd        xmm4, %1 ;for dc
-;	 paddd        xmm4, %3 ;for dc
-	movdqa       %4, %1
-	punpcklqdq   %4, %3
-	packssdw     %1, %3
-	psllw        %1, 2
-%endmacro
-
-%macro SSE41_GetX38x4SatdDec 0
-	pxor        xmm7,   xmm7
-	movq        xmm0,   [eax]
-	movq        xmm1,   [eax+ebx]
-	lea         eax,    [eax+2*ebx]
-	movq        xmm2,   [eax]
-	movq        xmm3,   [eax+ebx]
-	lea         eax,    [eax+2*ebx]
-	punpcklbw   xmm0,   xmm7
-	punpcklbw   xmm1,   xmm7
-	punpcklbw   xmm2,   xmm7
-	punpcklbw   xmm3,   xmm7
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm7
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm7
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
-	;doesn't need another transpose
-%endmacro
-%macro SSE41_GetX38x4SatdV 2
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2],   0
-	pinsrw      xmm0,   word[esi+%2+8], 4
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2+2],  0
-	pinsrw      xmm0,   word[esi+%2+10], 4
-	psubsw      xmm0,   xmm1
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2+4],  0
-	pinsrw      xmm0,   word[esi+%2+12], 4
-	psubsw      xmm0,   xmm3
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2+6],  0
-	pinsrw      xmm0,   word[esi+%2+14], 4
-	psubsw      xmm0,   xmm2
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-%endmacro
-%macro SSE41_GetX38x4SatdH  3
-	movq        xmm0,   [esi+%3+8*%1]
-	punpcklqdq  xmm0,   xmm0
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm5,   xmm0
-	pabsw       xmm1,   xmm1
-	pabsw       xmm2,   xmm2
-	pabsw       xmm3,   xmm3
-	paddw       xmm2,   xmm1;for DC
-	paddw       xmm2,   xmm3;for DC
-	paddw       xmm5,   xmm2
-%endmacro
-%macro SSE41_I16X16GetX38x4SatdDC 0
-	pxor        xmm0,   xmm0
-	movq2dq     xmm0,   mm4
-	punpcklqdq  xmm0,   xmm0
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2
-%endmacro
-%macro SSE41_ChromaGetX38x4SatdDC 1
-	shl         %1,     4
-	movdqa      xmm0,   [esi+32+%1]
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2
-%endmacro
-%macro SSE41_I16x16GetX38x4Satd 2
-	SSE41_GetX38x4SatdDec
-	SSE41_GetX38x4SatdV   %1, %2
-	SSE41_GetX38x4SatdH   %1, %2, 32
-	SSE41_I16X16GetX38x4SatdDC
-%endmacro
-%macro SSE41_ChromaGetX38x4Satd 2
-	SSE41_GetX38x4SatdDec
-	SSE41_GetX38x4SatdV   %1, %2
-	SSE41_GetX38x4SatdH   %1, %2, 16
-	SSE41_ChromaGetX38x4SatdDC %1
-%endmacro
-%macro SSE41_HSum8W 3
-	pmaddwd     %1, %2
-	movhlps     %3, %1
-	paddd       %1, %3
-	pshuflw     %3, %1,0Eh
-	paddd       %1, %3
-%endmacro
-
-WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
-WelsIntra16x16Combined3Satd_sse41:
-	push   ebx
-	push   esi
-	push   edi
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]
-	mov    esi,    [esp+40] ;temp_satd
-	pxor        xmm4,   xmm4
-	movdqa      xmm5,   [HSumSubDB1]
-	movdqa      xmm6,   [HSumSubDW1]
-	movdqa      xmm7,   [PDW1]
-	sub         ecx,    edx
-	movdqu 		xmm0,   [ecx]
-	movhlps		xmm1,   xmm0
-	punpcklqdq  xmm0,   xmm0
-	punpcklqdq  xmm1,   xmm1
-	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
-	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
-	movdqa      [esi],  xmm0 ;V
-	movdqa      [esi+16], xmm1
-	add         ecx,    edx
-	pinsrb      xmm0,   byte[ecx-1], 0
-	pinsrb      xmm0,   byte[ecx+edx-1], 1
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     2
-	pinsrb      xmm0,   byte[ecx+edx-1], 3
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     4
-	pinsrb      xmm0,   byte[ecx+edx-1], 5
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     6
-	pinsrb      xmm0,   byte[ecx+edx-1], 7
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     8
-	pinsrb      xmm0,   byte[ecx+edx-1], 9
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     10
-	pinsrb      xmm0,   byte[ecx+edx-1], 11
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     12
-	pinsrb      xmm0,   byte[ecx+edx-1], 13
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     14
-	pinsrb      xmm0,   byte[ecx+edx-1], 15
-	movhlps		xmm1,   xmm0
-	punpcklqdq  xmm0,   xmm0
-	punpcklqdq  xmm1,   xmm1
-	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
-	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
-	movdqa      [esi+32], xmm0 ;H
-	movdqa      [esi+48], xmm1
-	movd        ecx,    xmm4 ;dc
-	add         ecx,    16   ;(sum+16)
-	shr         ecx,    5    ;((sum+16)>>5)
-	shl         ecx,    4    ;
-	movd        mm4,    ecx  ; mm4 copy DC
-	pxor        xmm4,   xmm4 ;V
-	pxor        xmm5,   xmm5 ;H
-	pxor        xmm6,   xmm6 ;DC
-	mov         ecx,    0
-	mov         edi,    0
-.loop16x16_get_satd:
-.loopStart1:
-	SSE41_I16x16GetX38x4Satd ecx, edi
-	inc          ecx
-	cmp         ecx, 4
-	jl          .loopStart1
-	cmp         edi, 16
-	je          .loop16x16_get_satd_end
-	mov         eax, [esp+24]
-	add         eax, 8
-	mov         ecx, 0
-	add         edi, 16
-	jmp         .loop16x16_get_satd
- .loop16x16_get_satd_end:
-	MMX_DW_1_2REG    xmm0, xmm1
-	psrlw       xmm4, 1 ;/2
-	psrlw       xmm5, 1 ;/2
-	psrlw       xmm6, 1 ;/2
-	SSE41_HSum8W     xmm4, xmm0, xmm1
-	SSE41_HSum8W     xmm5, xmm0, xmm1
-	SSE41_HSum8W     xmm6, xmm0, xmm1
-
-	; comparing order: DC H V
-	movd      ebx, xmm6 ;DC
-	movd      edi, xmm5 ;H
-	movd      ecx, xmm4 ;V
-	mov      edx, [esp+36]
-	shl       edx, 1
-	add       edi, edx
-	add       ebx, edx
-	mov       edx, [esp+32]
-	cmp       ebx, edi
-	jge near   not_dc_16x16
-	cmp        ebx, ecx
-	jge near   not_dc_h_16x16
-
-	; for DC mode
-	mov       dword[edx], 2;I16_PRED_DC
-	mov       eax, ebx
-	jmp near return_satd_intra_16x16_x3
-not_dc_16x16:
-	; for H mode
-	cmp       edi, ecx
-	jge near   not_dc_h_16x16
-	mov       dword[edx], 1;I16_PRED_H
-	mov       eax, edi
-	jmp near return_satd_intra_16x16_x3
-not_dc_h_16x16:
-	; for V mode
-	mov       dword[edx], 0;I16_PRED_V
-	mov       eax, ecx
-return_satd_intra_16x16_x3:
-	WELSEMMS
-	pop         edi
-	pop         esi
-	pop         ebx
-ret
-
-%macro SSE41_ChromaGetX38x8Satd 0
-	movdqa      xmm5,   [HSumSubDB1]
-	movdqa      xmm6,   [HSumSubDW1]
-	movdqa      xmm7,   [PDW1]
-	sub         ecx,    edx
-	movq 		xmm0,   [ecx]
-	punpcklqdq  xmm0,   xmm0
-	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
-	movdqa      [esi],  xmm0 ;V
-	add         ecx,    edx
-	pinsrb      xmm0,   byte[ecx-1], 0
-	pinsrb      xmm0,   byte[ecx+edx-1], 1
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     2
-	pinsrb      xmm0,   byte[ecx+edx-1], 3
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     4
-	pinsrb      xmm0,   byte[ecx+edx-1], 5
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     6
-	pinsrb      xmm0,   byte[ecx+edx-1], 7
-	punpcklqdq  xmm0,   xmm0
-	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
-	movdqa      [esi+16], xmm0 ;H
-;(sum+2)>>2
-	movdqa      xmm6,   [PDQ2]
-	movdqa      xmm5,   xmm4
-	punpckhqdq  xmm5,   xmm1
-	paddd       xmm5,   xmm6
-	psrld       xmm5,   2
-;(sum1+sum2+4)>>3
-	paddd       xmm6,   xmm6
-	paddd       xmm4,   xmm1
-	paddd       xmm4,   xmm6
-	psrld       xmm4,   3
-;satd *16
-	pslld       xmm5,   4
-	pslld       xmm4,   4
-;temp satd
-	movdqa      xmm6,   xmm4
-	punpcklqdq  xmm4,   xmm5
-	psllq       xmm4,   32
-	psrlq       xmm4,   32
-	movdqa      [esi+32], xmm4
-	punpckhqdq  xmm5,   xmm6
-	psllq       xmm5,   32
-	psrlq       xmm5,   32
-	movdqa      [esi+48], xmm5
-
-	pxor        xmm4,   xmm4 ;V
-	pxor        xmm5,   xmm5 ;H
-	pxor        xmm6,   xmm6 ;DC
-	mov         ecx,    0
-loop_chroma_satdx3_cb_cr:
-	SSE41_ChromaGetX38x4Satd ecx, 0
-	inc             ecx
-	cmp             ecx, 2
-	jl              loop_chroma_satdx3_cb_cr
-%endmacro
-
-%macro SSEReg2MMX 3
-	movdq2q     %2, %1
-	movhlps     %1, %1
-	movdq2q     %3, %1
-%endmacro
-%macro MMXReg2SSE 4
-	movq2dq     %1, %3
-	movq2dq     %2, %4
-	punpcklqdq  %1, %2
-%endmacro
-;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
-
-WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
-WelsIntraChroma8x8Combined3Satd_sse41:
-	push   ebx
-	push   esi
-	push   edi
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]
-	mov    esi,    [esp+40] ;temp_satd
-	xor    edi,    edi
-loop_chroma_satdx3:
-	SSE41_ChromaGetX38x8Satd
-	cmp             edi, 1
-	je              loop_chroma_satdx3end
-	inc             edi
-	SSEReg2MMX  xmm4, mm0,mm1
-	SSEReg2MMX  xmm5, mm2,mm3
-	SSEReg2MMX  xmm6, mm5,mm6
-	mov         ecx,  [esp+44]
-	mov         eax,  [esp+48]
-	jmp         loop_chroma_satdx3
-loop_chroma_satdx3end:
-	MMXReg2SSE  xmm0, xmm3, mm0, mm1
-	MMXReg2SSE  xmm1, xmm3, mm2, mm3
-	MMXReg2SSE  xmm2, xmm3, mm5, mm6
-
-	paddw       xmm4, xmm0
-	paddw       xmm5, xmm1
-	paddw       xmm6, xmm2
-
-	MMX_DW_1_2REG    xmm0, xmm1
-	psrlw       xmm4, 1 ;/2
-	psrlw       xmm5, 1 ;/2
-	psrlw       xmm6, 1 ;/2
-	SSE41_HSum8W     xmm4, xmm0, xmm1
-	SSE41_HSum8W     xmm5, xmm0, xmm1
-	SSE41_HSum8W     xmm6, xmm0, xmm1
-	; comparing order: DC H V
-	movd      ebx, xmm6 ;DC
-	movd      edi, xmm5 ;H
-	movd      ecx, xmm4 ;V
-	mov       edx, [esp+36]
-	shl       edx, 1
-	add       edi, edx
-	add       ecx, edx
-	mov       edx, [esp+32]
-	cmp       ebx, edi
-	jge near   not_dc_8x8
-	cmp        ebx, ecx
-	jge near   not_dc_h_8x8
-
-	; for DC mode
-	mov       dword[edx], 0;I8_PRED_DC
-	mov       eax, ebx
-	jmp near return_satd_intra_8x8_x3
-not_dc_8x8:
-	; for H mode
-	cmp       edi, ecx
-	jge near   not_dc_h_8x8
-	mov       dword[edx], 1;I8_PRED_H
-	mov       eax, edi
-	jmp near return_satd_intra_8x8_x3
-not_dc_h_8x8:
-	; for V mode
-	mov       dword[edx], 2;I8_PRED_V
-	mov       eax, ecx
-return_satd_intra_8x8_x3:
-	WELSEMMS
-	pop         edi
-	pop         esi
-	pop         ebx
-ret
-
-
-;***********************************************************************
-;
-;Pixel_satd_intra_sse2 END
-;
-;***********************************************************************
-%macro SSSE3_Get16BSadHVDC 2
-  movd        xmm6,%1
-  pshufb      xmm6,xmm1
-  movdqa      %1,  xmm6
-  movdqa      xmm0,%2
-  psadbw      xmm0,xmm7
-  paddw       xmm4,xmm0
-  movdqa      xmm0,%2
-  psadbw      xmm0,xmm5
-  paddw       xmm2,xmm0
-  psadbw      xmm6,%2
-  paddw       xmm3,xmm6
-%endmacro
-%macro WelsAddDCValue 4
-    movzx   %2, byte %1
-    mov    %3, %2
-    add     %4, %2
-%endmacro
-
-;***********************************************************************
-;
-;Pixel_sad_intra_ssse3 BEGIN
-;
-;***********************************************************************
-WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
-WelsIntra16x16Combined3Sad_ssse3:
-	push   ebx
-	push   esi
-	push   edi
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-	mov    edi,    [esp+40] ;temp_sad
-	sub    ecx,    edx
-    movdqa      xmm5,[ecx]
-    pxor        xmm0,xmm0
-    psadbw      xmm0,xmm5
-    movhlps     xmm1,xmm0
-    paddw       xmm0,xmm1
-    movd        eax,xmm0
-
-    add         ecx,edx
-    lea         ebx, [edx+2*edx]
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    lea         ecx, [ecx+4*edx]
-    add         edi, 64
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    lea         ecx, [ecx+4*edx]
-    add         edi, 64
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    lea         ecx, [ecx+4*edx]
-    add         edi, 64
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    sub        edi, 192
-    add         eax,10h
-    shr         eax,5
-    movd        xmm7,eax
-    pxor        xmm1,xmm1
-    pshufb      xmm7,xmm1
-    pxor        xmm4,xmm4
-    pxor        xmm3,xmm3
-    pxor        xmm2,xmm2
-;sad begin
-	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]
-    lea         esi, [ebx+2*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64
-    lea         eax, [eax+4*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64
-    lea         eax, [eax+4*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64
-    lea         eax, [eax+4*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-
-    pslldq      xmm3,4
-    por         xmm3,xmm2
-    movhlps     xmm1,xmm3
-    paddw       xmm3,xmm1
-    movhlps     xmm0,xmm4
-    paddw       xmm4,xmm0
-; comparing order: DC H V
-	movd        ebx, xmm4 ;DC
-	movd        ecx, xmm3 ;V
-	psrldq      xmm3, 4
-	movd        esi, xmm3 ;H
-	mov         eax, [esp+36] ;lamda
-	shl         eax, 1
-	add         esi, eax
-	add         ebx, eax
-	mov         edx, [esp+32]
-	cmp         ebx, esi
-	jge near   not_dc_16x16_sad
-	cmp        ebx, ecx
-	jge near   not_dc_h_16x16_sad
-	; for DC mode
-	mov       dword[edx], 2;I16_PRED_DC
-	mov       eax, ebx
-    sub        edi, 192
-%assign x 0
-%rep 16
-    movdqa    [edi+16*x], xmm7
-%assign x x+1
-%endrep
-	jmp near return_sad_intra_16x16_x3
-not_dc_16x16_sad:
-	; for H mode
-	cmp       esi, ecx
-	jge near   not_dc_h_16x16_sad
-	mov       dword[edx], 1;I16_PRED_H
-	mov       eax, esi
-	jmp near return_sad_intra_16x16_x3
-not_dc_h_16x16_sad:
-	; for V mode
-	mov       dword[edx], 0;I16_PRED_V
-	mov       eax, ecx
-    sub       edi, 192
-%assign x 0
-%rep 16
-    movdqa    [edi+16*x], xmm5
-%assign x x+1
-%endrep
-return_sad_intra_16x16_x3:
-	pop    edi
-	pop    esi
-	pop    ebx
-	ret
-
-;***********************************************************************
-;
-;Pixel_sad_intra_ssse3 END
-;
-;***********************************************************************
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse41 BEGIN
-;
-;***********************************************************************
-
-;SSE4.1
-%macro SSE41_GetSatd8x4 0
-	movq             xmm0, [eax]
-	punpcklqdq       xmm0, xmm0
-	pmaddubsw        xmm0, xmm7
-	movq             xmm1, [eax+ebx]
-	punpcklqdq       xmm1, xmm1
-	pmaddubsw        xmm1, xmm7
-	movq             xmm2, [ecx]
-	punpcklqdq       xmm2, xmm2
-	pmaddubsw        xmm2, xmm7
-	movq             xmm3, [ecx+edx]
-	punpcklqdq       xmm3, xmm3
-	pmaddubsw        xmm3, xmm7
-	psubsw           xmm0, xmm2
-	psubsw           xmm1, xmm3
-	movq             xmm2, [eax+2*ebx]
-	punpcklqdq       xmm2, xmm2
-	pmaddubsw        xmm2, xmm7
-	movq             xmm3, [eax+esi]
-	punpcklqdq       xmm3, xmm3
-	pmaddubsw        xmm3, xmm7
-	movq             xmm4, [ecx+2*edx]
-	punpcklqdq       xmm4, xmm4
-	pmaddubsw        xmm4, xmm7
-	movq             xmm5, [ecx+edi]
-	punpcklqdq       xmm5, xmm5
-	pmaddubsw        xmm5, xmm7
-	psubsw           xmm2, xmm4
-	psubsw           xmm3, xmm5
-	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4
-	pabsw            xmm0, xmm0
-	pabsw            xmm2, xmm2
-	pabsw            xmm1, xmm1
-	pabsw            xmm3, xmm3
-	movdqa           xmm4, xmm3
-	pblendw          xmm3, xmm1, 0xAA
-	pslld            xmm1, 16
-	psrld            xmm4, 16
-	por              xmm1, xmm4
-	pmaxuw           xmm1, xmm3
-	paddw            xmm6, xmm1
-	movdqa           xmm4, xmm0
-	pblendw          xmm0, xmm2, 0xAA
-	pslld            xmm2, 16
-	psrld            xmm4, 16
-	por              xmm2, xmm4
-	pmaxuw           xmm0, xmm2
-	paddw            xmm6, xmm0
-%endmacro
-
-%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
-	MMX_DW_1_2REG    %3, %4
-	pmaddwd     %2, %3
-	movhlps     %4, %2
-	paddd       %2, %4
-	pshuflw     %4, %2,0Eh
-	paddd       %2, %4
-	movd		%1, %2
-%endmacro
-;***********************************************************************
-;
-;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd4x4_sse41
-WelsSampleSatd4x4_sse41:
-	push        ebx
-	mov         eax,[esp+8]
-	mov         ebx,[esp+12]
-	mov         ecx,[esp+16]
-	mov         edx,[esp+20]
-	movdqa      xmm4,[HSwapSumSubDB1]
-	movd        xmm2,[ecx]
-	movd        xmm5,[ecx+edx]
-	shufps      xmm2,xmm5,0
-	movd        xmm3,[ecx+edx*2]
-	lea         ecx, [edx*2+ecx]
-	movd        xmm5,[ecx+edx]
-	shufps      xmm3,xmm5,0
-	movd        xmm0,[eax]
-	movd        xmm5,[eax+ebx]
-	shufps      xmm0,xmm5,0
-	movd        xmm1,[eax+ebx*2]
-	lea         eax, [ebx*2+eax]
-	movd        xmm5,[eax+ebx]
-	shufps      xmm1,xmm5,0
-	pmaddubsw   xmm0,xmm4
-	pmaddubsw   xmm1,xmm4
-	pmaddubsw   xmm2,xmm4
-	pmaddubsw   xmm3,xmm4
-	psubw       xmm0,xmm2
-	psubw       xmm1,xmm3
-	movdqa      xmm2,xmm0
-	paddw       xmm0,xmm1
-	psubw       xmm1,xmm2
-	movdqa      xmm2,xmm0
-	punpcklqdq  xmm0,xmm1
-	punpckhqdq  xmm2,xmm1
-	movdqa      xmm1,xmm0
-	paddw       xmm0,xmm2
-	psubw       xmm2,xmm1
-	movdqa      xmm1,xmm0
-	pblendw     xmm0,xmm2,0AAh
-	pslld       xmm2,16
-	psrld       xmm1,16
-	por         xmm2,xmm1
-	pabsw       xmm0,xmm0
-	pabsw       xmm2,xmm2
-	pmaxsw      xmm0,xmm2
-	SSSE3_SumWHorizon eax, xmm0, xmm5, xmm7
-	pop         ebx
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x8_sse41
-align 16
-WelsSampleSatd8x8_sse41:
-	push   ebx
-	push   esi
-	push   edi
-	mov    eax,    [esp+16]
-	mov    ebx,    [esp+20]
-	mov    ecx,    [esp+24]
-	mov    edx,    [esp+28]
-	movdqa      xmm7, [HSumSubDB1]
-	lea         esi,  [ebx+ebx*2]
-	lea         edi,  [edx+edx*2]
-	pxor		xmm6, xmm6
-	SSE41_GetSatd8x4
-	lea			eax,	[eax+4*ebx]
-	lea			ecx,    [ecx+4*edx]
-	SSE41_GetSatd8x4
-	SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7
-	pop 		edi
-	pop 		esi
-	pop 		ebx
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x16_sse41
-align 16
-WelsSampleSatd8x16_sse41:
-	push   ebx
-	push   esi
-	push   edi
-	push   ebp
-%define pushsize   16
-	mov    eax,    [esp+pushsize+4]
-	mov    ebx,    [esp+pushsize+8]
-	mov    ecx,    [esp+pushsize+12]
-	mov    edx,    [esp+pushsize+16]
-	movdqa      xmm7, [HSumSubDB1]
-	lea         esi,  [ebx+ebx*2]
-	lea         edi,  [edx+edx*2]
-	pxor        xmm6, xmm6
-	mov         ebp,    0
-loop_get_satd_8x16:
-	SSE41_GetSatd8x4
-	lea			eax,  [eax+4*ebx]
-	lea			ecx,  [ecx+4*edx]
-	inc         ebp
-	cmp         ebp,  4
-	jl          loop_get_satd_8x16
-	SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7
-	pop         ebp
-	pop 		edi
-	pop 		esi
-	pop 		ebx
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x8_sse41
-align 16
-WelsSampleSatd16x8_sse41:
-	push   ebx
-	push   esi
-	push   edi
-	mov    eax,    [esp+16]
-	mov    ebx,    [esp+20]
-	mov    ecx,    [esp+24]
-	mov    edx,    [esp+28]
-	movdqa      xmm7, [HSumSubDB1]
-	lea         esi,  [ebx+ebx*2]
-	lea         edi,  [edx+edx*2]
-	pxor		xmm6,   xmm6
-	SSE41_GetSatd8x4
-	lea			eax,  [eax+4*ebx]
-	lea			ecx,  [ecx+4*edx]
-	SSE41_GetSatd8x4
-	mov			eax,    [esp+16]
-	mov			ecx,    [esp+24]
-	add			eax,    8
-	add			ecx,    8
-	SSE41_GetSatd8x4
-	lea			eax,    [eax+4*ebx]
-	lea			ecx,    [ecx+4*edx]
-	SSE41_GetSatd8x4
-	SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7
-	pop 		edi
-	pop 		esi
-	pop 		ebx
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-
-WELS_EXTERN WelsSampleSatd16x16_sse41
-align 16
-WelsSampleSatd16x16_sse41:
-	push   ebx
-	push   esi
-	push   edi
-	push   ebp
-	%define pushsize   16
-	mov    eax,    [esp+pushsize+4]
-	mov    ebx,    [esp+pushsize+8]
-	mov    ecx,    [esp+pushsize+12]
-	mov    edx,    [esp+pushsize+16]
-	movdqa      xmm7, [HSumSubDB1]
-	lea         esi,  [ebx+ebx*2]
-	lea         edi,  [edx+edx*2]
-	pxor		xmm6,   xmm6
-	mov         ebp,    0
-loop_get_satd_16x16_left:
-	SSE41_GetSatd8x4
-	lea			eax,  [eax+4*ebx]
-	lea			ecx,  [ecx+4*edx]
-	inc         ebp
-	cmp         ebp,  4
-	jl          loop_get_satd_16x16_left
-	mov			eax,    [esp+pushsize+4]
-	mov			ecx,    [esp+pushsize+12]
-	add			eax,    8
-	add			ecx,    8
-	mov         ebp,    0
-loop_get_satd_16x16_right:
-	SSE41_GetSatd8x4
-	lea			eax,  [eax+4*ebx]
-	lea			ecx,  [ecx+4*edx]
-	inc         ebp
-	cmp         ebp,  4
-	jl          loop_get_satd_16x16_right
-	SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7
-	%undef pushsize
-	pop         ebp
-	pop 		edi
-	pop 		esi
-	pop 		ebx
-	ret
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse41 END
-;
-;***********************************************************************
-
-;***********************************************************************
-;
-;Pixel_sad_wxh_sse2 BEGIN
-;
-;***********************************************************************
-
-%macro SSE2_GetSad2x16 0
-	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-	movdqu xmm1,   [ecx]
-	MOVDQ  xmm2,   [eax];[eax] must aligned 16
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
-	movdqu xmm1,   [ecx+edx]
-	MOVDQ  xmm2,   [eax+ebx]
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
-%endmacro
-
-
-%macro SSE2_GetSad4x16 0
-	movdqu xmm0,   [ecx]
-	MOVDQ  xmm2,   [eax]
-	psadbw xmm0,   xmm2
-	paddw  xmm7,   xmm0
-	movdqu xmm1,   [ecx+edx]
-	MOVDQ  xmm2,   [eax+ebx]
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
-	movdqu xmm1,   [ecx+2*edx]
-	MOVDQ  xmm2,   [eax+2*ebx];[eax] must aligned 16
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
-	movdqu xmm1,   [ecx+edi]
-	MOVDQ  xmm2,   [eax+esi]
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
-%endmacro
-
-
-%macro SSE2_GetSad8x4 0
-	movq   xmm0,   [eax]
-	movq   xmm1,   [eax+ebx]
-	lea    eax,    [eax+2*ebx]
-	movhps xmm0,   [eax]
-	movhps xmm1,   [eax+ebx]
-
-	movq   xmm2,   [ecx]
-	movq   xmm3,   [ecx+edx]
-	lea    ecx,    [ecx+2*edx]
-	movhps xmm2,   [ecx]
-	movhps xmm3,   [ecx+edx]
-	psadbw xmm0,   xmm2
-	psadbw xmm1,   xmm3
-	paddw  xmm6,   xmm0
-	paddw  xmm6,   xmm1
-%endmacro
-
-;***********************************************************************
-;
-;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
-;First parameter can align to 16 bytes,
-;In wels, the third parameter can't align to 16 bytes.
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSad16x16_sse2
-align 16
-WelsSampleSad16x16_sse2:
-	push ebx
-	push edi
-	push esi
-
-	%define _STACK_SIZE		12
-
-	mov eax, [esp+_STACK_SIZE+4 ]
-	mov	ebx, [esp+_STACK_SIZE+8 ]
-	lea esi, [3*ebx]
-	mov ecx, [esp+_STACK_SIZE+12]
-	mov edx, [esp+_STACK_SIZE+16]
-	lea edi, [3*edx]
-
-	pxor   xmm7,   xmm7
-	SSE2_GetSad4x16
-	lea   eax,    [eax+4*ebx]
-	lea   ecx,    [ecx+4*edx]
-	SSE2_GetSad4x16
-	lea   eax,    [eax+4*ebx]
-	lea   ecx,    [ecx+4*edx]
-	SSE2_GetSad4x16
-	lea   eax,    [eax+4*ebx]
-	lea   ecx,    [ecx+4*edx]
-	SSE2_GetSad4x16
-	movhlps xmm0, xmm7
-	paddw xmm0, xmm7
-	movd eax, xmm0
-
-	%undef _STACK_SIZE
-
-	pop esi
-	pop edi
-	pop ebx
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
-;First parameter can align to 16 bytes,
-;In wels, the third parameter can't align to 16 bytes.
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSad16x8_sse2
-align 16
-WelsSampleSad16x8_sse2:
-	push   ebx
-	mov    eax,    [esp+8]
-	mov    ebx,    [esp+12]
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-	movdqu xmm0,   [ecx]
-	MOVDQ  xmm2,   [eax]
-	psadbw xmm0,   xmm2
-	movdqu xmm1,   [ecx+edx]
-	MOVDQ  xmm2,   [eax+ebx]
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
-
-	SSE2_GetSad2x16
-	SSE2_GetSad2x16
-	SSE2_GetSad2x16
-
-	movhlps     xmm1, xmm0
-	paddw       xmm0, xmm1
-	movd        eax,  xmm0
-	pop         ebx
-	ret
-
-
-
-WELS_EXTERN WelsSampleSad8x16_sse2
-WelsSampleSad8x16_sse2:
-	push   ebx
-	mov    eax,    [esp+8]
-	mov    ebx,    [esp+12]
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-    pxor   xmm6,   xmm6
-
-	SSE2_GetSad8x4
-    lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-    SSE2_GetSad8x4
-    lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-	SSE2_GetSad8x4
-    lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-    SSE2_GetSad8x4
-
-    movhlps    xmm0, xmm6
-	paddw      xmm0, xmm6
-	movd       eax,  xmm0
-	pop        ebx
-	ret
-
-
-%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
-and    %1,  0x1f|(%3>>1)
-cmp    %1,  (32-%2)|(%3>>1)
-%endmacro
-
-WELS_EXTERN WelsSampleSad8x8_sse21
-WelsSampleSad8x8_sse21:
-    mov    ecx,    [esp+12]
-	mov    edx,    ecx
-    CACHE_SPLIT_CHECK edx, 8, 64
-	jle    near   .pixel_sad_8x8_nsplit
-	push   ebx
-	push   edi
-	mov    eax,    [esp+12]
-	mov    ebx,    [esp+16]
-
-    pxor   xmm7,   xmm7
-
-    mov    edi,    ecx
-    and    edi,    0x07
-    sub    ecx,    edi
-    mov    edx,    8
-    sub    edx,    edi
-
-    shl    edi,    3
-    shl    edx,    3
-    movd   xmm5,   edi
-    movd   xmm6,   edx
-	mov    edi,    8
-	add    edi,    ecx
-    mov    edx,    [esp+24]
-
-    movq   xmm0,   [eax]
-	movhps xmm0,   [eax+ebx]
-
-	movq   xmm1,   [ecx]
-	movq   xmm2,   [edi]
-	movhps xmm1,   [ecx+edx]
-	movhps xmm2,   [edi+edx]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-	lea    edi,    [edi+2*edx]
-
-    movq   xmm0,   [eax]
-	movhps xmm0,   [eax+ebx]
-
-	movq   xmm1,   [ecx]
-	movq   xmm2,   [edi]
-	movhps xmm1,   [ecx+edx]
-	movhps xmm2,   [edi+edx]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-	lea    edi,    [edi+2*edx]
-
-    movq   xmm0,   [eax]
-	movhps xmm0,   [eax+ebx]
-
-	movq   xmm1,   [ecx]
-	movq   xmm2,   [edi]
-	movhps xmm1,   [ecx+edx]
-	movhps xmm2,   [edi+edx]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-	lea    edi,    [edi+2*edx]
-
-    movq   xmm0,   [eax]
-	movhps xmm0,   [eax+ebx]
-
-	movq   xmm1,   [ecx]
-	movq   xmm2,   [edi]
-	movhps xmm1,   [ecx+edx]
-	movhps xmm2,   [edi+edx]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-    movhlps    xmm0, xmm7
-	paddw      xmm0, xmm7
-	movd       eax,  xmm0
-	pop        edi
-	jmp        .return
-.pixel_sad_8x8_nsplit:
-    push   ebx
-    mov    eax,    [esp+8]
-	mov    ebx,    [esp+12]
-	mov    edx,    [esp+20]
-	pxor   xmm6,   xmm6
-	SSE2_GetSad8x4
-    lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-    SSE2_GetSad8x4
-    movhlps    xmm0, xmm6
-	paddw      xmm0, xmm6
-	movd       eax,  xmm0
-.return:
-	pop        ebx
-	ret
-
-
-;***********************************************************************
-;
-;Pixel_sad_wxh_sse2 END
-;
-;***********************************************************************
-
-
-;***********************************************************************
-;
-;Pixel_sad_4_wxh_sse2 BEGIN
-;
-;***********************************************************************
-
-
-%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
-	psadbw %1,   %4
-	paddw  xmm5, %1
-	psadbw %4,   %3
-	paddw  xmm4, %4
-	movdqu %4,   [%5-1]
-	psadbw %4,   %2
-	paddw  xmm6, %4
-	movdqu %4,   [%5+1]
-	psadbw %4,   %2
-	paddw  xmm7, %4
-%endmacro
-WELS_EXTERN WelsSampleSadFour16x16_sse2
-WelsSampleSadFour16x16_sse2:
-	push ebx
-	mov    eax,    [esp+8]
-	mov    ebx,    [esp+12]
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movdqa xmm0,   [eax]
-	sub    ecx,    edx
-	movdqu xmm3,   [ecx]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
-
-	movdqa xmm1,   [eax+ebx]
-	movdqu xmm3,   [ecx+edx]
-	psadbw xmm3,   xmm1
-	paddw  xmm4,   xmm3
-
-	movdqu xmm2,   [ecx+edx-1]
-	psadbw xmm2,   xmm0
-	paddw  xmm6,   xmm2
-
-	movdqu xmm3,   [ecx+edx+1]
-	psadbw xmm3,   xmm0
-	paddw  xmm7,   xmm3
-
-	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-	movdqa xmm2,   [eax]
-	movdqu xmm3,   [ecx]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx
-	movdqa xmm0,   [eax+ebx]
-	movdqu xmm3,   [ecx+edx]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx+edx
-	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-	movdqa xmm1,   [eax]
-	movdqu xmm3,   [ecx]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, ecx
-	movdqa xmm2,   [eax+ebx]
-	movdqu xmm3,   [ecx+edx]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx+edx
-	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-	movdqa xmm0,   [eax]
-	movdqu xmm3,   [ecx]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx
-	movdqa xmm1,   [eax+ebx]
-	movdqu xmm3,   [ecx+edx]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, ecx+edx
-	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-	movdqa xmm2,   [eax]
-	movdqu xmm3,   [ecx]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx
-	movdqa xmm0,   [eax+ebx]
-	movdqu xmm3,   [ecx+edx]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx+edx
-	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-	movdqa xmm1,   [eax]
-	movdqu xmm3,   [ecx]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, ecx
-	movdqa xmm2,   [eax+ebx]
-	movdqu xmm3,   [ecx+edx]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx+edx
-	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-	movdqa xmm0,   [eax]
-	movdqu xmm3,   [ecx]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx
-	movdqa xmm1,   [eax+ebx]
-	movdqu xmm3,   [ecx+edx]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, ecx+edx
-	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-	movdqa xmm2,   [eax]
-	movdqu xmm3,   [ecx]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx
-	movdqa xmm0,   [eax+ebx]
-	movdqu xmm3,   [ecx+edx]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx+edx
-	lea    ecx,    [ecx+2*edx]
-	movdqu xmm3,   [ecx]
-	psadbw xmm2,   xmm3
-	paddw xmm5,   xmm2
-
-	movdqu xmm2,   [ecx-1]
-	psadbw xmm2,   xmm0
-	paddw xmm6,   xmm2
-
-	movdqu xmm3,   [ecx+1]
-	psadbw xmm3,   xmm0
-	paddw xmm7,   xmm3
-
-	movdqu xmm3,   [ecx+edx]
-	psadbw xmm0,   xmm3
-	paddw xmm5,   xmm0
-
-	mov        ecx,  [esp+24]
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [ecx],xmm4
-	pop  ebx
-	ret
-
-
-WELS_EXTERN WelsSampleSadFour16x8_sse2
-WelsSampleSadFour16x8_sse2:
-	push ebx
-	push edi
-	mov    eax,    [esp+12]
-	mov    ebx,    [esp+16]
-	mov    edi,    [esp+20]
-	mov    edx,    [esp+24]
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movdqa xmm0,   [eax]
-	sub    edi,    edx
-	movdqu xmm3,   [edi]
-	psadbw xmm3,   xmm0
-	paddw xmm4,   xmm3
-
-	movdqa xmm1,   [eax+ebx]
-	movdqu xmm3,   [edi+edx]
-	psadbw xmm3,   xmm1
-	paddw xmm4,   xmm3
-
-	movdqu xmm2,   [edi+edx-1]
-	psadbw xmm2,   xmm0
-	paddw xmm6,   xmm2
-
-	movdqu xmm3,   [edi+edx+1]
-	psadbw xmm3,   xmm0
-	paddw xmm7,   xmm3
-
-	lea    eax,    [eax+2*ebx]
-	lea    edi,    [edi+2*edx]
-	movdqa xmm2,   [eax]
-	movdqu xmm3,   [edi]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, edi
-	movdqa xmm0,   [eax+ebx]
-	movdqu xmm3,   [edi+edx]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, edi+edx
-	lea    eax,    [eax+2*ebx]
-	lea    edi,    [edi+2*edx]
-	movdqa xmm1,   [eax]
-	movdqu xmm3,   [edi]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, edi
-	movdqa xmm2,   [eax+ebx]
-	movdqu xmm3,   [edi+edx]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, edi+edx
-	lea    eax,    [eax+2*ebx]
-	lea    edi,    [edi+2*edx]
-	movdqa xmm0,   [eax]
-	movdqu xmm3,   [edi]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, edi
-	movdqa xmm1,   [eax+ebx]
-	movdqu xmm3,   [edi+edx]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, edi+edx
-	lea    edi,    [edi+2*edx]
-	movdqu xmm3,   [edi]
-	psadbw xmm0,   xmm3
-	paddw xmm5,   xmm0
-
-	movdqu xmm0,   [edi-1]
-	psadbw xmm0,   xmm1
-	paddw xmm6,   xmm0
-
-	movdqu xmm3,   [edi+1]
-	psadbw xmm3,   xmm1
-	paddw xmm7,   xmm3
-
-	movdqu xmm3,   [edi+edx]
-	psadbw xmm1,   xmm3
-	paddw xmm5,   xmm1
-
-	mov        edi,  [esp+28]
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [edi],xmm4
-	pop  edi
-	pop  ebx
-	ret
-
-WELS_EXTERN WelsSampleSadFour8x16_sse2
-WelsSampleSadFour8x16_sse2:
-	push ebx
-	push edi
-	mov    eax,    [esp+12]
-	mov    ebx,    [esp+16]
-	mov    edi,    [esp+20]
-	mov    edx,    [esp+24]
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movq   xmm0,   [eax]
-	movhps xmm0,   [eax+ebx]
-	sub    edi,    edx
-	movq   xmm3,   [edi]
-	movhps xmm3,   [edi+edx]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
-
-	movq   xmm1,  [edi+edx-1]
-	movq   xmm3,  [edi+edx+1]
-
-	lea    eax,   [eax+2*ebx]
-	lea    edi,   [edi+2*edx]
-	movhps xmm1,  [edi-1]
-	movhps xmm3,  [edi+1]
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [edi]
-	movhps xmm3,  [edi+edx]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [eax]
-	movhps xmm0,  [eax+ebx]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [edi+edx-1]
-	movq   xmm3,  [edi+edx+1]
-
-	lea    eax,   [eax+2*ebx]
-	lea    edi,   [edi+2*edx]
-	movhps xmm1,  [edi-1]
-	movhps xmm3,  [edi+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [edi]
-	movhps xmm3,  [edi+edx]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [eax]
-	movhps xmm0,  [eax+ebx]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [edi+edx-1]
-	movq   xmm3,  [edi+edx+1]
-
-	lea    eax,   [eax+2*ebx]
-	lea    edi,   [edi+2*edx]
-	movhps xmm1,  [edi-1]
-	movhps xmm3,  [edi+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [edi]
-	movhps xmm3,  [edi+edx]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [eax]
-	movhps xmm0,  [eax+ebx]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [edi+edx-1]
-	movq   xmm3,  [edi+edx+1]
-
-	lea    eax,   [eax+2*ebx]
-	lea    edi,   [edi+2*edx]
-	movhps xmm1,  [edi-1]
-	movhps xmm3,  [edi+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [edi]
-	movhps xmm3,  [edi+edx]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [eax]
-	movhps xmm0,  [eax+ebx]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [edi+edx-1]
-	movq   xmm3,  [edi+edx+1]
-
-	lea    eax,   [eax+2*ebx]
-	lea    edi,   [edi+2*edx]
-	movhps xmm1,  [edi-1]
-	movhps xmm3,  [edi+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [edi]
-	movhps xmm3,  [edi+edx]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [eax]
-	movhps xmm0,  [eax+ebx]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [edi+edx-1]
-	movq   xmm3,  [edi+edx+1]
-
-	lea    eax,   [eax+2*ebx]
-	lea    edi,   [edi+2*edx]
-	movhps xmm1,  [edi-1]
-	movhps xmm3,  [edi+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [edi]
-	movhps xmm3,  [edi+edx]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [eax]
-	movhps xmm0,  [eax+ebx]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [edi+edx-1]
-	movq   xmm3,  [edi+edx+1]
-
-	lea    eax,   [eax+2*ebx]
-	lea    edi,   [edi+2*edx]
-	movhps xmm1,  [edi-1]
-	movhps xmm3,  [edi+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [edi]
-	movhps xmm3,  [edi+edx]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [eax]
-	movhps xmm0,  [eax+ebx]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [edi+edx-1]
-	movq   xmm3,  [edi+edx+1]
-
-	lea    eax,   [eax+2*ebx]
-	lea    edi,   [edi+2*edx]
-	movhps xmm1,  [edi-1]
-	movhps xmm3,  [edi+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [edi]
-	movhps xmm3,  [edi+edx]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	mov        edi,  [esp+28]
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [edi],xmm4
-	pop  edi
-	pop  ebx
-	ret
-
-
-WELS_EXTERN WelsSampleSadFour8x8_sse2
-WelsSampleSadFour8x8_sse2:
-	push ebx
-	push edi
-	mov    eax,    [esp+12]
-	mov    ebx,    [esp+16]
-	mov    edi,    [esp+20]
-	mov    edx,    [esp+24]
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movq   xmm0,   [eax]
-	movhps xmm0,   [eax+ebx]
-	sub    edi,    edx
-	movq   xmm3,   [edi]
-	movhps xmm3,   [edi+edx]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
-
-	movq   xmm1,  [edi+edx-1]
-	movq   xmm3,  [edi+edx+1]
-
-	lea    eax,   [eax+2*ebx]
-	lea    edi,   [edi+2*edx]
-	movhps xmm1,  [edi-1]
-	movhps xmm3,  [edi+1]
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [edi]
-	movhps xmm3,  [edi+edx]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [eax]
-	movhps xmm0,  [eax+ebx]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [edi+edx-1]
-	movq   xmm3,  [edi+edx+1]
-
-	lea    eax,   [eax+2*ebx]
-	lea    edi,   [edi+2*edx]
-	movhps xmm1,  [edi-1]
-	movhps xmm3,  [edi+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [edi]
-	movhps xmm3,  [edi+edx]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [eax]
-	movhps xmm0,  [eax+ebx]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [edi+edx-1]
-	movq   xmm3,  [edi+edx+1]
-
-	lea    eax,   [eax+2*ebx]
-	lea    edi,   [edi+2*edx]
-	movhps xmm1,  [edi-1]
-	movhps xmm3,  [edi+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [edi]
-	movhps xmm3,  [edi+edx]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [eax]
-	movhps xmm0,  [eax+ebx]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-
-	movq   xmm1,  [edi+edx-1]
-	movq   xmm3,  [edi+edx+1]
-
-	lea    eax,   [eax+2*ebx]
-	lea    edi,   [edi+2*edx]
-	movhps xmm1,  [edi-1]
-	movhps xmm3,  [edi+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [edi]
-	movhps xmm3,  [edi+edx]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	mov        edi,  [esp+28]
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [edi],xmm4
-	pop  edi
-	pop  ebx
-	ret
-
-WELS_EXTERN WelsSampleSadFour4x4_sse2
-WelsSampleSadFour4x4_sse2:
-	push ebx
-	push edi
-	mov    eax,    [esp+12]
-	mov    ebx,    [esp+16]
-	mov    edi,    [esp+20]
-	mov    edx,    [esp+24]
-	movd   xmm0,   [eax]
-	movd   xmm1,   [eax+ebx]
-	lea        eax,    [eax+2*ebx]
-	movd       xmm2,   [eax]
-	movd       xmm3,   [eax+ebx]
-	punpckldq  xmm0, xmm1
-	punpckldq  xmm2, xmm3
-	punpcklqdq xmm0, xmm2
-	sub        edi,  edx
-	movd       xmm1, [edi]
-	movd       xmm2, [edi+edx]
-	punpckldq  xmm1, xmm2
-	movd       xmm2, [edi+edx-1]
-	movd       xmm3, [edi+edx+1]
-
-	lea        edi,  [edi+2*edx]
-
-	movd       xmm4, [edi]
-	movd       xmm5, [edi-1]
-	punpckldq  xmm2, xmm5
-	movd       xmm5, [edi+1]
-	punpckldq  xmm3, xmm5
-
-	movd       xmm5, [edi+edx]
-	punpckldq  xmm4, xmm5
-
-	punpcklqdq xmm1, xmm4 ;-L
-
-	movd       xmm5, [edi+edx-1]
-	movd       xmm6, [edi+edx+1]
-
-	lea        edi,  [edi+2*edx]
-	movd       xmm7, [edi-1]
-	punpckldq  xmm5, xmm7
-	punpcklqdq xmm2, xmm5 ;-1
-	movd       xmm7, [edi+1]
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm3, xmm6 ;+1
-	movd       xmm6, [edi]
-	movd       xmm7, [edi+edx]
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6 ;+L
-	psadbw     xmm1, xmm0
-	psadbw     xmm2, xmm0
-	psadbw     xmm3, xmm0
-	psadbw     xmm4, xmm0
-
-	movhlps    xmm0, xmm1
-	paddw      xmm1, xmm0
-	movhlps    xmm0, xmm2
-	paddw      xmm2, xmm0
-	movhlps    xmm0, xmm3
-	paddw      xmm3, xmm0
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	mov        edi,  [esp+28]
-	punpckldq  xmm1, xmm4
-	punpckldq  xmm2, xmm3
-	punpcklqdq xmm1, xmm2
-	movdqa     [edi],xmm1
-	pop  edi
-	pop  ebx
-	ret
-
-;***********************************************************************
-;
-;Pixel_sad_4_wxh_sse2 END
-;
-;***********************************************************************
-
-WELS_EXTERN WelsSampleSad4x4_mmx
-
-align 16
-;***********************************************************************
-;   int32_t __cdecl WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
-;***********************************************************************
-WelsSampleSad4x4_mmx:
-    push    ebx
-%define pushsize     4
-%define pix1address	 esp+pushsize+4
-%define pix1stride   esp+pushsize+8
-%define pix2address  esp+pushsize+12
-%define pix2stride   esp+pushsize+16
-
-    mov		  eax, [pix1address]
-    mov		  ebx, [pix1stride ]
-    mov		  ecx, [pix2address]
-    mov		  edx, [pix2stride ]
-
-	movd	  mm0, [eax]
-	movd	  mm1, [eax+ebx]
-	punpckldq mm0, mm1
-
-	movd      mm3, [ecx]
-	movd      mm4, [ecx+edx]
-	punpckldq mm3, mm4
-	psadbw    mm0, mm3
-
-	lea       eax, [eax+2*ebx]
-	lea       ecx, [ecx+2*edx]
-
-	movd      mm1, [eax]
-	movd      mm2, [eax+ebx]
-	punpckldq mm1, mm2
-
-	movd      mm3, [ecx]
-	movd      mm4, [ecx+edx]
-	punpckldq mm3, mm4
-	psadbw    mm1, mm3
-	paddw     mm0, mm1
-
-    movd      eax, mm0
-
-	WELSEMMS
-    pop ebx
-%undef pushsize
-%undef pix1address
-%undef pix1stride
-%undef pix2address
-%undef pix2stride
-    ret
\ No newline at end of file
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  satd_sad.asm
+;*
+;*  Abstract
+;*      WelsSampleSatd4x4_sse2
+;*      WelsSampleSatd8x8_sse2
+;*      WelsSampleSatd16x8_sse2
+;*      WelsSampleSatd8x16_sse2
+;*      WelsSampleSatd16x16_sse2
+;*
+;*      WelsSampleSad16x8_sse2
+;*      WelsSampleSad16x16_sse2
+;*
+;*  History
+;*      8/5/2009 Created
+;*     24/9/2009 modified
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Data
+;***********************************************************************
+SECTION .rodata align=16	; read-only constant tables; each table below is 16 bytes long and 16-byte aligned
+
+align 16
+HSumSubDB1:   db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1	; bytes: eight +1, then four (+1,-1) pairs
+align 16
+HSumSubDW1:   dw 1,-1,1,-1,1,-1,1,-1	; words: four (+1,-1) pairs
+align 16
+PDW1:  dw 1,1,1,1,1,1,1,1	; words: eight +1
+align 16
+PDQ2:  dw 2,0,0,0,2,0,0,0	; the value 2 in the low word of each 64-bit half
+align 16
+HSwapSumSubDB1:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1	; bytes: (four +1, two (+1,-1) pairs), emitted twice
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 BEGIN
+;
+;***********************************************************************
+%macro MMX_DW_1_2REG 2 ; out: %1 = 1 in every 16-bit word, %2 = all bits set
+      pxor %1, %1		; %1 = 0
+      pcmpeqw %2, %2		; %2 = 0xFFFF in every word (a register always equals itself)
+      psubw %1, %2		; %1 = 0 - (-1) = 1 in every word
+%endmacro
+
+%macro  SSE2_SumWHorizon1 2 ; %1 = in: 8 words / out: their sum in the low word, %2 = scratch (clobbered)
+	movdqa      %2, %1
+	psrldq      %2, 8		; bring words 4..7 down to positions 0..3
+	paddusw     %1, %2		; words 0..3 = w[i] + w[i+4] (unsigned saturating)
+	movdqa      %2, %1
+	psrldq      %2, 4		; bring words 2..3 down to positions 0..1
+	paddusw     %1, %2		; words 0..1 now hold the even-index and odd-index partial sums
+	movdqa      %2, %1
+	psrldq      %2, 2		; bring word 1 down to position 0
+	paddusw     %1, %2		; word 0 = saturating sum of all 8 input words; upper words are left-over partial sums
+%endmacro
+
+%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
+   SSE2_SumSub %1, %2, %5	; SSE2_SumSub is defined outside this chunk; NOTE(review): presumably a sum/difference butterfly with %5 as scratch - confirm
+   SSE2_SumSub %3, %4, %5	; first stage: pairs (%1,%2) and (%3,%4)
+   SSE2_SumSub %2, %4, %5	; second stage: pairs (%2,%4) and (%1,%3)
+   SSE2_SumSub %1, %3, %5	; %5 is handed to every call as the third operand
+%endmacro
+
+%macro SSE2_SumAbs4 7 ; %1,%2,%4,%5 = inputs (clobbered), %3,%6 = second operands for WELS_AbsW, %7 = word accumulator
+	WELS_AbsW %1, %3	; WELS_AbsW is defined outside this chunk; NOTE(review): by its name presumably |words| of operand 1 with operand 2 as scratch - confirm
+	WELS_AbsW %2, %3
+	WELS_AbsW %4, %6
+	WELS_AbsW %5, %6
+	paddusw       %1, %2	; %1 = %1 + %2 (unsigned saturating, per word)
+	paddusw       %4, %5	; %4 = %4 + %5
+	paddusw       %7, %1	; fold both partial sums into the accumulator %7
+	paddusw       %7, %4
+%endmacro
+
+%macro  SSE2_SumWHorizon 3 ; %1 = in: 8 words d0..d7 / out: 32-bit total in the low dword, %2 = scratch, %3 = must be zero (supplies the high word of each dword in punpcklwd)
+	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
+	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
+	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
+	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
+	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
+	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
+%endmacro
+
+%macro SSE2_GetSatd8x8 0 ; in: r0,r2 = block addresses, r1,r3 = row offsets; adds into word accumulator xmm6; clobbers xmm0-xmm5; leaves r0/r2 advanced by 6 rows
+	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]	; SSE2_LoadDiff8P is defined outside this chunk; NOTE(review): presumably xmm0 = word differences of 8 pixels at [r0] and [r2], xmm7 expected zero - confirm
+	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]	; next row
+	lea                 r0, [r0+2*r1]	; step both blocks down two rows
+	lea                 r2, [r2+2*r3]
+	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
+	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+
+	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4	; sum/difference stages over the four rows
+	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4	; defined outside this chunk; NOTE(review): presumably transposes the two 4x4 word tiles - confirm
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5	; sum/difference stages again on the rearranged data
+	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6	; fold the four result registers into xmm6
+
+	lea					r0,    [r0+2*r1]	; move on to the second group of four rows
+    lea					r2,    [r2+2*r3]
+	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
+	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+	lea                 r0, [r0+2*r1]	; third and last 2-row step: r0/r2 end 6 rows past their entry values
+	lea                 r2, [r2+2*r3]
+	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
+	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+
+	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4	; same transform sequence as for the first four rows
+	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
+	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+%endmacro
+
+;***********************************************************************
+; Takes the differences of two 4x4 byte blocks, runs sum/difference stages over them, and returns half the sum of the absolute results.
+;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+; NOTE(review): arguments are presumably (block A, row offset A, block B, row offset B) - confirm against the C callers.
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd4x4_sse2
+align 16
+WelsSampleSatd4x4_sse2:
+	; Former x86-32-only prologue, summarised for reference:
+	;   push ebx ; eax=[esp+8] ; ebx=[esp+12] ; ecx=[esp+16] ; edx=[esp+20]
+	; LOAD_4_PARA (defined outside this chunk) is now used instead; the code
+	; below uses r0/r2 as addresses and r1/r3 as the offsets between rows.
+	; NOTE(review): push_num is presumably read by LOAD_4_PARA - confirm in asm_inc.asm.
+	
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d	; NOTE(review): presumably widens the int32_t row offset to full register width - confirm
+	SIGN_EXTENTION r3, r3d
+    movd      xmm0, [r0]		; 4 bytes of row 0, first block
+    movd      xmm1, [r0+r1]		; row 1
+    lea       r0 , [r0+2*r1]
+    movd      xmm2, [r0]		; row 2
+    movd      xmm3, [r0+r1]		; row 3
+    punpckldq xmm0, xmm2		; xmm0 = rows 0 and 2
+    punpckldq xmm1, xmm3		; xmm1 = rows 1 and 3
+
+    movd      xmm4, [r2]		; same layout for the second block
+    movd      xmm5, [r2+r3]
+    lea       r2 , [r2+2*r3]
+    movd      xmm6, [r2]
+    movd      xmm7, [r2+r3]
+    punpckldq xmm4, xmm6		; xmm4 = rows 0 and 2
+    punpckldq xmm5, xmm7		; xmm5 = rows 1 and 3
+
+    pxor      xmm6, xmm6		; zero, used to widen bytes to words
+    punpcklbw xmm0, xmm6
+    punpcklbw xmm1, xmm6
+    punpcklbw xmm4, xmm6
+    punpcklbw xmm5, xmm6
+
+    psubw     xmm0, xmm4		; word differences, rows 0 and 2
+    psubw     xmm1, xmm5		; word differences, rows 1 and 3
+
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1		; xmm0 = row0+row1 | row2+row3
+    psubw     xmm2, xmm1		; xmm2 = row0-row1 | row2-row3
+    SSE2_XSawp qdq, xmm0, xmm2, xmm3	; SSE2_XSawp is defined outside this chunk; NOTE(review): presumably interleaves the two sources at the given granularity - confirm
+
+    movdqa     xmm4, xmm0
+    paddw      xmm0, xmm3		; second sum/difference stage
+    psubw      xmm4, xmm3
+
+    movdqa         xmm2, xmm0
+    punpcklwd      xmm0, xmm4		; word-level interleave of the two halves
+    punpckhwd      xmm4, xmm2
+
+	SSE2_XSawp     dq,  xmm0, xmm4, xmm3
+	SSE2_XSawp     qdq, xmm0, xmm3, xmm5
+
+    movdqa         xmm7, xmm0
+    paddw          xmm0, xmm5		; third sum/difference stage
+    psubw          xmm7, xmm5
+
+	SSE2_XSawp     qdq,  xmm0, xmm7, xmm1
+
+    movdqa         xmm2, xmm0
+    paddw          xmm0, xmm1		; fourth sum/difference stage
+    psubw          xmm2, xmm1
+
+    WELS_AbsW  xmm0, xmm3		; WELS_AbsW is defined outside this chunk; NOTE(review): presumably absolute value per word - confirm
+    paddusw        xmm6, xmm0		; accumulate into xmm6 (zeroed above), unsigned saturating
+	WELS_AbsW  xmm2, xmm4
+    paddusw        xmm6, xmm2
+    SSE2_SumWHorizon1  xmm6, xmm4	; low word of xmm6 = saturating sum of its 8 words
+	movd           retrd,  xmm6
+    and            retrd,  0xffff	; keep only that low word
+    shr            retrd,  1		; the return value is the sum divided by two
+	LOAD_4_PARA_POP
+	ret
+
+ ;***********************************************************************
+ ; Runs SSE2_GetSatd8x8 once on an 8x8 block pair and returns the halved, horizontally summed accumulator.
+ ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+ ; NOTE(review): arguments are presumably (block A, row offset A, block B, row offset B) - confirm against the C callers.
+ ;***********************************************************************
+ WELS_EXTERN WelsSampleSatd8x8_sse2
+align 16
+ WelsSampleSatd8x8_sse2:
+	 ; Former x86-32-only prologue, summarised for reference:
+	 ;   push ebx ; eax=[esp+8] ; ebx=[esp+12] ; ecx=[esp+16] ; edx=[esp+20]
+	 ; LOAD_4_PARA (defined outside this chunk) is now used instead;
+	 ; SSE2_GetSatd8x8 reads r0/r2 as addresses and r1/r3 as row offsets.
+	 ; NOTE(review): push_num is presumably read by LOAD_4_PARA - confirm in asm_inc.asm.
+	 
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d	; NOTE(review): presumably widens the int32_t row offset to full register width - confirm
+	SIGN_EXTENTION r3, r3d	
+	pxor   xmm6,   xmm6	; word accumulator that SSE2_GetSatd8x8 adds into
+    pxor   xmm7,   xmm7	; zero register handed to SSE2_LoadDiff8P and SSE2_SumWHorizon
+    SSE2_GetSatd8x8
+    psrlw   xmm6,  1	; halve every word of the accumulator before summing
+	SSE2_SumWHorizon   xmm6,xmm4,xmm7	; low dword of xmm6 = total; relies on xmm7 still being zero
+	movd    retrd,   xmm6
+	LOAD_4_PARA_POP
+	ret
+
+ ;***********************************************************************
+ ; Runs SSE2_GetSatd8x8 on rows 0-7 and then rows 8-15 of an 8x16 block pair; returns the halved, summed accumulator.
+ ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+ ; NOTE(review): arguments are presumably (block A, row offset A, block B, row offset B) - confirm against the C callers.
+ ;***********************************************************************
+ WELS_EXTERN WelsSampleSatd8x16_sse2
+align 16
+ WelsSampleSatd8x16_sse2:
+	 ; Former x86-32-only prologue, summarised for reference:
+	 ;   push ebx ; eax=[esp+8] ; ebx=[esp+12] ; ecx=[esp+16] ; edx=[esp+20]
+	 ; LOAD_4_PARA (defined outside this chunk) is now used instead;
+	 ; SSE2_GetSatd8x8 reads r0/r2 as addresses and r1/r3 as row offsets.
+	 ; NOTE(review): push_num is presumably read by LOAD_4_PARA - confirm in asm_inc.asm.
+	 
+	 %assign  push_num 0
+	 LOAD_4_PARA
+	 SIGN_EXTENTION r1, r1d	; NOTE(review): presumably widens the int32_t row offset to full register width - confirm
+	 SIGN_EXTENTION r3, r3d	 
+	 pxor   xmm6,   xmm6	; word accumulator that SSE2_GetSatd8x8 adds into
+     pxor   xmm7,   xmm7	; zero register handed to SSE2_LoadDiff8P and SSE2_SumWHorizon
+
+	 SSE2_GetSatd8x8	; rows 0..7; leaves r0/r2 six rows further down
+     lea    r0,    [r0+2*r1]	; two more rows, so r0/r2 now point at row 8
+     lea    r2,    [r2+2*r3]
+	 SSE2_GetSatd8x8	; rows 8..15
+
+	 psrlw   xmm6,  1	; halve every word of the accumulator before summing
+	 SSE2_SumWHorizon   xmm6,xmm4,xmm7	; low dword of xmm6 = total; relies on xmm7 still being zero
+	 movd    retrd,   xmm6
+	 LOAD_4_PARA_POP
+	 ret
+
+;***********************************************************************
+; Runs SSE2_GetSatd8x8 on the left and then the right 8 columns of a 16x8 block pair; returns the halved, summed accumulator.
+;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+; NOTE(review): arguments are presumably (block A, row offset A, block B, row offset B) - confirm against the C callers.
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x8_sse2
+align 16
+WelsSampleSatd16x8_sse2:
+	; Former x86-32-only prologue, summarised for reference:
+	;   push ebx ; eax=[esp+8] ; ebx=[esp+12] ; ecx=[esp+16] ; edx=[esp+20]
+	; LOAD_4_PARA (defined outside this chunk) is now used instead;
+	; SSE2_GetSatd8x8 reads r0/r2 as addresses and r1/r3 as row offsets.
+	; NOTE(review): push_num is presumably read by LOAD_4_PARA - confirm in asm_inc.asm.
+	
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d	; NOTE(review): presumably widens the int32_t row offset to full register width - confirm
+	SIGN_EXTENTION r3, r3d	
+	push r0	; save both start addresses; SSE2_GetSatd8x8 advances r0 and r2
+	push r2	
+	pxor   xmm6,   xmm6	; word accumulator that SSE2_GetSatd8x8 adds into
+    pxor   xmm7,   xmm7	; zero register handed to SSE2_LoadDiff8P and SSE2_SumWHorizon
+
+	SSE2_GetSatd8x8	; left 8 columns
+	
+	pop r2	; restore the start addresses, in reverse push order
+	pop r0
+	; The x86-32 version reloaded eax and ecx from the stack arguments
+    ; ([esp+8] and [esp+16]) at this point instead of using push/pop.
+    add    r0,    8	; step 8 bytes to the right in both blocks
+    add    r2,    8
+	SSE2_GetSatd8x8	; right 8 columns
+
+	psrlw   xmm6,  1	; halve every word of the accumulator before summing
+	SSE2_SumWHorizon   xmm6,xmm4,xmm7	; low dword of xmm6 = total; relies on xmm7 still being zero
+	movd    retrd,   xmm6
+	LOAD_4_PARA_POP
+	ret
+
+;***********************************************************************
+; 16x16 SATD: four SSE2_GetSatd8x8 passes (left column pair, then right column pair).
+;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+; Arguments are (pointer, stride, pointer, stride) and are used as r0..r3.
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x16_sse2
+align 16
+WelsSampleSatd16x16_sse2:
+	;legacy x86-32 prologue kept for reference (LOAD_4_PARA is used instead): push ebx
+	;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    ecx,    [esp+16]
+	;mov    edx,    [esp+20]
+	
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	push r0 ; remember both block origins; they are restored before the right half
+	push r2	
+	pxor   xmm6,   xmm6 ; xmm6 = 0; accumulator shared by all four passes
+    pxor   xmm7,   xmm7 ; xmm7 = 0 (presumably a zero operand for the macro - confirm)
+
+	SSE2_GetSatd8x8
+	lea    r0,    [r0+2*r1] ; two more rows down; assumes the macro left r0/r2 six rows in - TODO confirm
+	lea    r2,    [r2+2*r3]
+	SSE2_GetSatd8x8
+
+	pop r2 ; back to the top-left corner of each block
+	pop r0
+	;legacy reload of the first pointer from the stack: mov eax, [esp+8]
+	;mov    ecx,    [esp+16]
+	add    r0,    8 ; step 8 bytes right to the second 8-column half
+	add    r2,    8
+
+	SSE2_GetSatd8x8
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	SSE2_GetSatd8x8
+
+ ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
+    psrlw   xmm6,  1
+	SSE2_SumWHorizon   xmm6,xmm4,xmm7 ; reduce xmm6; xmm4/xmm7 look like scratch operands (macro not shown here)
+	movd    retrd,   xmm6 ; return value = low dword of xmm6
+	LOAD_4_PARA_POP
+	ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 BEGIN
+;
+;***********************************************************************
+
+%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 ; args 2 and 3 are clobbered, xmm4 collects the DC sum
+	pmaddubsw    %1, xmm5 ; unsigned bytes of arg1 times signed bytes of xmm5, adjacent products added into words
+	movdqa       %2, %1 ; copy so that both word patterns can be applied
+	pmaddwd      %1, xmm7 ; word pairs weighted by xmm7 -> 4 dwords
+	pmaddwd      %2, xmm6 ; word pairs weighted by xmm6 -> 4 dwords
+	movdqa       %3, %1
+	punpckldq    %1, %2 ; interleave the low dwords of the two results
+	punpckhdq    %2, %3 ; interleave the high dwords of the two results
+	movdqa       %3, %1
+	punpcklqdq   %1, %2 ; arg1 = low qwords of both interleaves
+	punpckhqdq   %3, %2 ; arg3 = high qwords of both interleaves
+	paddd        xmm4, %1 ;for dc
+	paddd        xmm4, %3 ;for dc
+	packssdw     %1, %3 ; narrow the 8 dwords to signed-saturated words in arg1
+	psllw        %1, 2 ; multiply every word by 4
+%endmacro
+%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2 ; arg2 is clobbered, xmm4 is not used as DC accumulator here
+	pmaddubsw    %1, xmm5 ; unsigned bytes of arg1 times signed bytes of xmm5, adjacent products added into words
+	movdqa       %2, %1 ; copy so that both word patterns can be applied
+	pmaddwd      %1, xmm7 ; word pairs weighted by xmm7 -> 4 dwords
+	pmaddwd      %2, xmm6 ; word pairs weighted by xmm6 -> 4 dwords
+	movdqa       %3, %1
+	punpckldq    %1, %2 ; interleave the low dwords of the two results
+	punpckhdq    %2, %3 ; interleave the high dwords of the two results
+	movdqa       %3, %1
+	punpcklqdq   %1, %2 ; arg1 = low qwords of both interleaves
+	punpckhqdq   %3, %2 ; arg3 = high qwords of both interleaves
+;    paddd        xmm4, %1 ;for dc (disabled: the caller derives the DC terms from arg4 instead)
+;	 paddd        xmm4, %3 ;for dc
+	movdqa       %4, %1 ; arg4 = dword view kept for the caller: low qword of arg1 ...
+	punpcklqdq   %4, %3 ; ... followed by low qword of arg3
+	packssdw     %1, %3 ; narrow the 8 dwords to signed-saturated words in arg1
+	psllw        %1, 2 ; multiply every word by 4
+%endmacro
+
+%macro SSE41_GetX38x4SatdDec 0 ; load an 8x4 byte tile at [eax] (stride ebx), widen it and transform it; eax ends 4 rows further down
+	pxor        xmm7,   xmm7 ; zero register used for the byte -> word widening
+	movq        xmm0,   [eax] ; row 0
+	movq        xmm1,   [eax+ebx] ; row 1
+	lea         eax,    [eax+2*ebx]
+	movq        xmm2,   [eax] ; row 2
+	movq        xmm3,   [eax+ebx] ; row 3
+	lea         eax,    [eax+2*ebx] ; eax now points at the next 4-row tile
+	punpcklbw   xmm0,   xmm7 ; zero-extend the 8 bytes of each row to words
+	punpcklbw   xmm1,   xmm7
+	punpcklbw   xmm2,   xmm7
+	punpcklbw   xmm3,   xmm7
+	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm7 ; first transform pass (macro defined elsewhere)
+	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm7 ; transpose of the two 4x4 word blocks (macro defined elsewhere)
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
+	;doesn't need another transpose
+%endmacro
+%macro SSE41_GetX38x4SatdV 2 ; adds |stored words - transformed tile| into xmm4 (V cost); only arg2 (byte offset into [esi]) is used, arg1 is ignored
+	pxor        xmm0,   xmm0 ; all lanes zero except the two inserted below
+	pinsrw      xmm0,   word[esi+%2],   0 ; lane 0 <- stored word 0 of the left 4x4 block
+	pinsrw      xmm0,   word[esi+%2+8], 4 ; lane 4 <- stored word 0 of the right 4x4 block
+	psubsw      xmm0,   xmm7 ; difference against the first transformed register
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+	pxor        xmm0,   xmm0 ; same pattern with stored word 1 against xmm1
+	pinsrw      xmm0,   word[esi+%2+2],  0
+	pinsrw      xmm0,   word[esi+%2+10], 4
+	psubsw      xmm0,   xmm1
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+	pxor        xmm0,   xmm0 ; same pattern with stored word 2 against xmm3
+	pinsrw      xmm0,   word[esi+%2+4],  0
+	pinsrw      xmm0,   word[esi+%2+12], 4
+	psubsw      xmm0,   xmm3
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+	pxor        xmm0,   xmm0 ; same pattern with stored word 3 against xmm2
+	pinsrw      xmm0,   word[esi+%2+6],  0
+	pinsrw      xmm0,   word[esi+%2+14], 4
+	psubsw      xmm0,   xmm2
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+%endmacro
+%macro SSE41_GetX38x4SatdH  3 ; adds the H cost of the current tile into xmm5; uses arg1 (tile index) and arg3 (byte offset of the H words in [esi]), arg2 is ignored
+	movq        xmm0,   [esi+%3+8*%1] ; 4 stored words for this tile ...
+	punpcklqdq  xmm0,   xmm0 ; ... duplicated into both halves
+	psubsw      xmm0,   xmm7 ; difference against the first transformed register
+	pabsw       xmm0,   xmm0
+	paddw       xmm5,   xmm0
+	pabsw       xmm1,   xmm1 ; the remaining three registers contribute their plain magnitudes
+	pabsw       xmm2,   xmm2
+	pabsw       xmm3,   xmm3
+	paddw       xmm2,   xmm1;for DC
+	paddw       xmm2,   xmm3;for DC
+	paddw       xmm5,   xmm2 ; xmm2 is left holding |xmm1|+|xmm2|+|xmm3| for the DC macro that follows
+%endmacro
+%macro SSE41_I16X16GetX38x4SatdDC 0 ; adds the DC cost of the current tile into xmm6; expects the DC term in mm4 and the magnitude sum in xmm2
+	pxor        xmm0,   xmm0
+	movq2dq     xmm0,   mm4 ; low qword <- mm4, high qword zero
+	punpcklqdq  xmm0,   xmm0 ; same DC term for both 4x4 blocks of the tile
+	psubsw      xmm0,   xmm7 ; difference against the first transformed register
+	pabsw       xmm0,   xmm0
+	paddw       xmm6,   xmm0
+	paddw       xmm6,   xmm2 ; plus the magnitudes left in xmm2 by SSE41_GetX38x4SatdH
+%endmacro
+%macro SSE41_ChromaGetX38x4SatdDC 1 ; adds the DC cost of the current tile into xmm6; arg1 is a tile-index register that is overwritten
+	shl         %1,     4 ; NOTE: scales the index register in place (index*16 = byte offset); the caller sees the changed value
+	movdqa      xmm0,   [esi+32+%1] ; 16 bytes of stored DC terms for this tile
+	psubsw      xmm0,   xmm7 ; difference against the first transformed register
+	pabsw       xmm0,   xmm0
+	paddw       xmm6,   xmm0
+	paddw       xmm6,   xmm2 ; plus the magnitudes left in xmm2 by SSE41_GetX38x4SatdH
+%endmacro
+%macro SSE41_I16x16GetX38x4Satd 2 ; one 8x4 tile of the 16x16 search: arg1 = tile index register, arg2 = byte offset register into the V words
+	SSE41_GetX38x4SatdDec ; transform the tile at [eax]
+	SSE41_GetX38x4SatdV   %1, %2 ; V cost -> xmm4
+	SSE41_GetX38x4SatdH   %1, %2, 32 ; H cost -> xmm5, H words start at [esi+32]
+	SSE41_I16X16GetX38x4SatdDC ; DC cost -> xmm6
+%endmacro
+%macro SSE41_ChromaGetX38x4Satd 2 ; one 8x4 tile of the chroma search: arg1 = tile index register (modified by the DC step), arg2 = byte offset into the V words
+	SSE41_GetX38x4SatdDec ; transform the tile at [eax]
+	SSE41_GetX38x4SatdV   %1, %2 ; V cost -> xmm4
+	SSE41_GetX38x4SatdH   %1, %2, 16 ; H cost -> xmm5, H words start at [esi+16]
+	SSE41_ChromaGetX38x4SatdDC %1 ; DC cost -> xmm6; shifts arg1 left by 4 in place
+%endmacro
+%macro SSE41_HSum8W 3 ; arg1 = 8 words in / total in its low dword out, arg2 = per-word multipliers, arg3 = scratch
+	pmaddwd     %1, %2 ; multiply words by arg2 and add adjacent pairs -> 4 dwords
+	movhlps     %3, %1 ; bring the upper two dwords down
+	paddd       %1, %3 ; -> 2 partial sums in the low qword
+	pshuflw     %3, %1,0Eh ; move dword 1 into dword 0 of the scratch register
+	paddd       %1, %3 ; low dword of arg1 = sum of all four dwords
+%endmacro
+
+
+%ifdef X86_32
+WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
+WelsIntra16x16Combined3Satd_sse41: ; x86-32 only. Evaluates V, H and DC 16x16 costs together, stores the winning mode to [arg5] and returns its cost in eax
+	push   ebx
+	push   esi
+	push   edi
+	mov    ecx,    [esp+16] ; arg1: pointer whose row above and column to the left supply the border pixels
+	mov    edx,    [esp+20] ; arg2: stride for ecx
+	mov    eax,    [esp+24] ; arg3: pointer to the block that is transformed tile by tile
+	mov    ebx,    [esp+28] ; arg4: stride for eax
+	mov    esi,    [esp+40] ;temp_satd
+	pxor        xmm4,   xmm4 ; DC accumulator for SSE41_I16x16Get8WSumSub
+	movdqa      xmm5,   [HSumSubDB1]
+	movdqa      xmm6,   [HSumSubDW1]
+	movdqa      xmm7,   [PDW1]
+	sub         ecx,    edx ; ecx -> row directly above
+	movdqu 		xmm0,   [ecx] ; 16 top border bytes
+	movhlps		xmm1,   xmm0
+	punpcklqdq  xmm0,   xmm0 ; bytes 0..7 duplicated
+	punpcklqdq  xmm1,   xmm1 ; bytes 8..15 duplicated
+	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+	movdqa      [esi],  xmm0 ;V
+	movdqa      [esi+16], xmm1
+	add         ecx,    edx ; back to row 0
+	pinsrb      xmm0,   byte[ecx-1], 0 ; gather the 16 left border bytes, two rows per step
+	pinsrb      xmm0,   byte[ecx+edx-1], 1
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     2
+	pinsrb      xmm0,   byte[ecx+edx-1], 3
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     4
+	pinsrb      xmm0,   byte[ecx+edx-1], 5
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     6
+	pinsrb      xmm0,   byte[ecx+edx-1], 7
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     8
+	pinsrb      xmm0,   byte[ecx+edx-1], 9
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     10
+	pinsrb      xmm0,   byte[ecx+edx-1], 11
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     12
+	pinsrb      xmm0,   byte[ecx+edx-1], 13
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     14
+	pinsrb      xmm0,   byte[ecx+edx-1], 15
+	movhlps		xmm1,   xmm0
+	punpcklqdq  xmm0,   xmm0 ; bytes 0..7 duplicated
+	punpcklqdq  xmm1,   xmm1 ; bytes 8..15 duplicated
+	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+	movdqa      [esi+32], xmm0 ;H
+	movdqa      [esi+48], xmm1
+	movd        ecx,    xmm4 ;dc
+	add         ecx,    16   ;(sum+16)
+	shr         ecx,    5    ;((sum+16)>>5)
+	shl         ecx,    4    ; scale by 16
+	movd        mm4,    ecx  ; mm4 copy DC
+	pxor        xmm4,   xmm4 ;V
+	pxor        xmm5,   xmm5 ;H
+	pxor        xmm6,   xmm6 ;DC
+	mov         ecx,    0 ; tile index inside the current 8-wide column (0..3)
+	mov         edi,    0 ; byte offset into the V words: 0 for the left column, 16 for the right
+.loop16x16_get_satd:
+.loopStart1:
+	SSE41_I16x16GetX38x4Satd ecx, edi
+	inc          ecx
+	cmp         ecx, 4
+	jl          .loopStart1
+	cmp         edi, 16 ; right column already done?
+	je          .loop16x16_get_satd_end
+	mov         eax, [esp+24] ; restart at the top of the block ...
+	add         eax, 8 ; ... 8 bytes to the right
+	mov         ecx, 0
+	add         edi, 16
+	jmp         .loop16x16_get_satd
+ .loop16x16_get_satd_end:
+	MMX_DW_1_2REG    xmm0, xmm1 ; macro defined elsewhere; xmm0 is then used as the multiplier operand of SSE41_HSum8W
+	psrlw       xmm4, 1 ;/2
+	psrlw       xmm5, 1 ;/2
+	psrlw       xmm6, 1 ;/2
+	SSE41_HSum8W     xmm4, xmm0, xmm1
+	SSE41_HSum8W     xmm5, xmm0, xmm1
+	SSE41_HSum8W     xmm6, xmm0, xmm1
+
+	; comparing order: DC H V
+	movd      ebx, xmm6 ;DC
+	movd      edi, xmm5 ;H
+	movd      ecx, xmm4 ;V
+	mov      edx, [esp+36] ; arg6 (named lambda in the SAD variant of this routine)
+	shl       edx, 1
+	add       edi, edx ; H and DC each carry an extra 2*arg6; V does not
+	add       ebx, edx
+	mov       edx, [esp+32] ; arg5: where the chosen mode is written
+	cmp       ebx, edi
+	jge near   not_dc_16x16
+	cmp        ebx, ecx
+	jge near   not_dc_h_16x16
+
+	; for DC mode
+	mov       dword[edx], 2;I16_PRED_DC
+	mov       eax, ebx
+	jmp near return_satd_intra_16x16_x3
+not_dc_16x16:
+	; for H mode
+	cmp       edi, ecx
+	jge near   not_dc_h_16x16
+	mov       dword[edx], 1;I16_PRED_H
+	mov       eax, edi
+	jmp near return_satd_intra_16x16_x3
+not_dc_h_16x16:
+	; for V mode
+	mov       dword[edx], 0;I16_PRED_V
+	mov       eax, ecx
+return_satd_intra_16x16_x3:
+	WELSEMMS
+	pop         edi
+	pop         esi
+	pop         ebx
+ret
+
+%macro SSE41_ChromaGetX38x8Satd 0 ; one 8x8 chroma plane: border pointer ecx (stride edx), block pointer eax (stride ebx), scratch buffer esi; V/H/DC costs end up in xmm4/xmm5/xmm6
+	movdqa      xmm5,   [HSumSubDB1]
+	movdqa      xmm6,   [HSumSubDW1]
+	movdqa      xmm7,   [PDW1]
+	sub         ecx,    edx ; ecx -> row directly above
+	movq 		xmm0,   [ecx] ; 8 top border bytes
+	punpcklqdq  xmm0,   xmm0
+	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4 ; xmm4 keeps the dword sums of the top border
+	movdqa      [esi],  xmm0 ;V
+	add         ecx,    edx ; back to row 0
+	pinsrb      xmm0,   byte[ecx-1], 0 ; gather the 8 left border bytes, two rows per step
+	pinsrb      xmm0,   byte[ecx+edx-1], 1
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     2
+	pinsrb      xmm0,   byte[ecx+edx-1], 3
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     4
+	pinsrb      xmm0,   byte[ecx+edx-1], 5
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     6
+	pinsrb      xmm0,   byte[ecx+edx-1], 7
+	punpcklqdq  xmm0,   xmm0
+	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1 ; xmm1 keeps the dword sums of the left border
+	movdqa      [esi+16], xmm0 ;H
+;(sum+2)>>2
+	movdqa      xmm6,   [PDQ2]
+	movdqa      xmm5,   xmm4
+	punpckhqdq  xmm5,   xmm1
+	paddd       xmm5,   xmm6
+	psrld       xmm5,   2
+;(sum1+sum2+4)>>3
+	paddd       xmm6,   xmm6
+	paddd       xmm4,   xmm1
+	paddd       xmm4,   xmm6
+	psrld       xmm4,   3
+;satd *16
+	pslld       xmm5,   4
+	pslld       xmm4,   4
+;temp satd
+	movdqa      xmm6,   xmm4
+	punpcklqdq  xmm4,   xmm5
+	psllq       xmm4,   32 ; shift up and back down: clears the upper dword of each qword
+	psrlq       xmm4,   32
+	movdqa      [esi+32], xmm4 ; DC terms read by tile 0 of SSE41_ChromaGetX38x4SatdDC
+	punpckhqdq  xmm5,   xmm6
+	psllq       xmm5,   32 ; shift up and back down: clears the upper dword of each qword
+	psrlq       xmm5,   32
+	movdqa      [esi+48], xmm5 ; DC terms read by tile 1
+
+	pxor        xmm4,   xmm4 ;V
+	pxor        xmm5,   xmm5 ;H
+	pxor        xmm6,   xmm6 ;DC
+	mov         ecx,    0 ; tile index; the DC step shifts ecx left by 4, so it runs 0 -> 1 -> 16 -> 17 and the loop still executes exactly twice
+%%loop_chroma_satdx3_cb_cr: ; macro-local label, so the macro can be expanded more than once without a duplicate-label error
+	SSE41_ChromaGetX38x4Satd ecx, 0
+	inc             ecx
+	cmp             ecx, 2
+	jl              %%loop_chroma_satdx3_cb_cr
+%endmacro
+
+%macro SSEReg2MMX 3 ; park one XMM register (arg1) in two MMX registers: arg2 = low qword, arg3 = high qword; arg1 is modified
+	movdq2q     %2, %1 ; low 64 bits
+	movhlps     %1, %1 ; high qword copied over the low qword of arg1
+	movdq2q     %3, %1 ; former high 64 bits
+%endmacro
+%macro MMXReg2SSE 4 ; rebuild an XMM register: arg1 = result, arg2 = XMM scratch, arg3 = MMX low qword, arg4 = MMX high qword
+	movq2dq     %1, %3 ; low half
+	movq2dq     %2, %4 ; high half staged in the scratch register
+	punpcklqdq  %1, %2 ; arg1 = high:low
+%endmacro
+;the macros above exist to reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
+
+WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
+WelsIntraChroma8x8Combined3Satd_sse41: ; x86-32 only. Runs SSE41_ChromaGetX38x8Satd on two 8x8 planes, adds their costs, stores the winning mode to [arg5] and returns its cost in eax
+	push   ebx
+	push   esi
+	push   edi
+	mov    ecx,    [esp+16] ; arg1: border-source pointer for the first plane
+	mov    edx,    [esp+20] ; arg2: stride for ecx
+	mov    eax,    [esp+24] ; arg3: block pointer for the first plane
+	mov    ebx,    [esp+28] ; arg4: stride for eax
+	mov    esi,    [esp+40] ;temp_satd
+	xor    edi,    edi ; plane counter: 0 = first pass, 1 = second pass
+loop_chroma_satdx3:
+	SSE41_ChromaGetX38x8Satd
+	cmp             edi, 1
+	je              loop_chroma_satdx3end
+	inc             edi
+	SSEReg2MMX  xmm4, mm0,mm1 ; park the first plane's V/H/DC sums in MMX registers
+	SSEReg2MMX  xmm5, mm2,mm3
+	SSEReg2MMX  xmm6, mm5,mm6
+	mov         ecx,  [esp+44] ; arg8: border-source pointer for the second plane
+	mov         eax,  [esp+48] ; arg9: block pointer for the second plane (strides in edx/ebx are reused)
+	jmp         loop_chroma_satdx3
+loop_chroma_satdx3end:
+	MMXReg2SSE  xmm0, xmm3, mm0, mm1 ; bring the first plane's sums back
+	MMXReg2SSE  xmm1, xmm3, mm2, mm3
+	MMXReg2SSE  xmm2, xmm3, mm5, mm6
+
+	paddw       xmm4, xmm0 ; V: both planes
+	paddw       xmm5, xmm1 ; H: both planes
+	paddw       xmm6, xmm2 ; DC: both planes
+
+	MMX_DW_1_2REG    xmm0, xmm1 ; macro defined elsewhere; xmm0 is then used as the multiplier operand of SSE41_HSum8W
+	psrlw       xmm4, 1 ;/2
+	psrlw       xmm5, 1 ;/2
+	psrlw       xmm6, 1 ;/2
+	SSE41_HSum8W     xmm4, xmm0, xmm1
+	SSE41_HSum8W     xmm5, xmm0, xmm1
+	SSE41_HSum8W     xmm6, xmm0, xmm1
+	; comparing order: DC H V
+	movd      ebx, xmm6 ;DC
+	movd      edi, xmm5 ;H
+	movd      ecx, xmm4 ;V
+	mov       edx, [esp+36] ; arg6 (named lambda in the 16x16 SAD routine of this file)
+	shl       edx, 1
+	add       edi, edx ; here H and V each carry an extra 2*arg6; DC does not
+	add       ecx, edx
+	mov       edx, [esp+32] ; arg5: where the chosen mode is written
+	cmp       ebx, edi
+	jge near   not_dc_8x8
+	cmp        ebx, ecx
+	jge near   not_dc_h_8x8
+
+	; for DC mode
+	mov       dword[edx], 0;I8_PRED_DC
+	mov       eax, ebx
+	jmp near return_satd_intra_8x8_x3
+not_dc_8x8:
+	; for H mode
+	cmp       edi, ecx
+	jge near   not_dc_h_8x8
+	mov       dword[edx], 1;I8_PRED_H
+	mov       eax, edi
+	jmp near return_satd_intra_8x8_x3
+not_dc_h_8x8:
+	; for V mode
+	mov       dword[edx], 2;I8_PRED_V
+	mov       eax, ecx
+return_satd_intra_8x8_x3:
+	WELSEMMS
+	pop         edi
+	pop         esi
+	pop         ebx
+ret
+
+
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 END
+;
+;***********************************************************************
+%macro SSSE3_Get16BSadHVDC 2 ; one 16-byte row: arg1 = 16-byte scratch slot whose first byte is the left-border pixel, arg2 = source row; expects xmm1 = 0, xmm5 = V row, xmm7 = DC row
+  movd        xmm6,%1 ; fetch the stored border pixel
+  pshufb      xmm6,xmm1 ; all-zero shuffle mask: byte 0 replicated to all 16 lanes
+  movdqa      %1,  xmm6 ; write the replicated (H) row back into the slot
+  movdqa      xmm0,%2
+  psadbw      xmm0,xmm7 ; SAD of the source row against xmm7 ...
+  paddw       xmm4,xmm0 ; ... accumulated in xmm4 (DC)
+  movdqa      xmm0,%2
+  psadbw      xmm0,xmm5 ; SAD of the source row against xmm5 ...
+  paddw       xmm2,xmm0 ; ... accumulated in xmm2 (V)
+  psadbw      xmm6,%2 ; SAD of the source row against the replicated pixel ...
+  paddw       xmm3,xmm6 ; ... accumulated in xmm3 (H)
+%endmacro
+%macro WelsAddDCValue 4 ; arg1 = byte address, arg2 = scratch register, arg3 = destination operand, arg4 = running-sum register
+    movzx   %2, byte %1 ; zero-extend the byte into the scratch register
+    mov    %3, %2 ; store it to the destination
+    add     %4, %2 ; and add it to the running sum
+%endmacro
+
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 BEGIN
+;
+;***********************************************************************
+WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
+WelsIntra16x16Combined3Sad_ssse3: ; x86-32 only. SAD cost of the V, H and DC 16x16 predictions; stores the winning mode to [arg5], leaves that prediction in temp_sad and returns its cost in eax
+	push   ebx
+	push   esi
+	push   edi
+	mov    ecx,    [esp+16] ; arg1: pointer whose row above and column to the left supply the border pixels
+	mov    edx,    [esp+20] ; arg2: stride for ecx
+	mov    edi,    [esp+40] ;temp_sad
+	sub    ecx,    edx ; ecx -> row directly above
+    movdqa      xmm5,[ecx] ; 16 top border bytes (aligned load); xmm5 doubles as the V prediction row
+    pxor        xmm0,xmm0
+    psadbw      xmm0,xmm5 ; SAD against zero = sum of the bytes, one partial sum per qword
+    movhlps     xmm1,xmm0
+    paddw       xmm0,xmm1
+    movd        eax,xmm0 ; eax = sum of the top border
+
+    add         ecx,edx ; back to row 0
+    lea         ebx, [edx+2*edx] ; ebx = 3*stride
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax ; each left pixel is stored at the start of its 16-byte slot and added to eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    lea         ecx, [ecx+4*edx]
+    add         edi, 64
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    lea         ecx, [ecx+4*edx]
+    add         edi, 64
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    lea         ecx, [ecx+4*edx]
+    add         edi, 64
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    sub        edi, 192 ; rewind to the first slot of temp_sad
+    add         eax,10h ; (sum+16)
+    shr         eax,5 ; >>5
+    movd        xmm7,eax
+    pxor        xmm1,xmm1 ; all-zero shuffle mask, also used by SSSE3_Get16BSadHVDC
+    pshufb      xmm7,xmm1 ; xmm7 = low byte of eax replicated to 16 lanes (DC prediction row)
+    pxor        xmm4,xmm4 ; DC accumulator
+    pxor        xmm3,xmm3 ; H accumulator
+    pxor        xmm2,xmm2 ; V accumulator
+;sad begin
+	mov    eax,    [esp+24] ; arg3: pointer to the block being compared
+	mov    ebx,    [esp+28] ; arg4: stride for eax
+    lea         esi, [ebx+2*ebx] ; esi = 3*stride
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+    add         edi, 64
+    lea         eax, [eax+4*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+    add         edi, 64
+    lea         eax, [eax+4*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+    add         edi, 64
+    lea         eax, [eax+4*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+
+    pslldq      xmm3,4 ; move the H partial sums one dword up ...
+    por         xmm3,xmm2 ; ... and merge the V partial sums below them
+    movhlps     xmm1,xmm3
+    paddw       xmm3,xmm1 ; dword 0 = V total, dword 1 = H total
+    movhlps     xmm0,xmm4
+    paddw       xmm4,xmm0 ; low word = DC total
+; comparing order: DC H V
+	movd        ebx, xmm4 ;DC
+	movd        ecx, xmm3 ;V
+	psrldq      xmm3, 4
+	movd        esi, xmm3 ;H
+	mov         eax, [esp+36] ;lambda
+	shl         eax, 1
+	add         esi, eax ; H and DC each carry an extra 2*lambda; V does not
+	add         ebx, eax
+	mov         edx, [esp+32] ; arg5: where the chosen mode is written
+	cmp         ebx, esi
+	jge near   not_dc_16x16_sad
+	cmp        ebx, ecx
+	jge near   not_dc_h_16x16_sad
+	; for DC mode
+	mov       dword[edx], 2;I16_PRED_DC
+	mov       eax, ebx
+    sub        edi, 192 ; rewind to the first slot, then overwrite all 16 rows with the DC row
+%assign x 0
+%rep 16
+    movdqa    [edi+16*x], xmm7
+%assign x x+1
+%endrep
+	jmp near return_sad_intra_16x16_x3
+not_dc_16x16_sad:
+	; for H mode
+	cmp       esi, ecx
+	jge near   not_dc_h_16x16_sad
+	mov       dword[edx], 1;I16_PRED_H
+	mov       eax, esi ; temp_sad already holds the H rows written by SSSE3_Get16BSadHVDC
+	jmp near return_sad_intra_16x16_x3
+not_dc_h_16x16_sad:
+	; for V mode
+	mov       dword[edx], 0;I16_PRED_V
+	mov       eax, ecx
+    sub       edi, 192 ; rewind to the first slot, then overwrite all 16 rows with the top border row
+%assign x 0
+%rep 16
+    movdqa    [edi+16*x], xmm5
+%assign x x+1
+%endrep
+return_sad_intra_16x16_x3:
+	pop    edi
+	pop    esi
+	pop    ebx
+	ret
+%endif
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 END
+;
+;***********************************************************************
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 BEGIN
+;
+;***********************************************************************
+
+;SSE4.1: 8x4 tile of differences between [r0] (stride r1, 3*stride r4) and [r2] (stride r3, 3*stride r5), transformed and accumulated into xmm6; xmm7 = byte multiplier pattern
+%macro SSE41_GetSatd8x4 0
+	movq             xmm0, [r0] ; first block, row 0
+	punpcklqdq       xmm0, xmm0 ; 8 bytes duplicated into both halves
+	pmaddubsw        xmm0, xmm7 ; byte pairs combined into words using the signed pattern in xmm7
+	movq             xmm1, [r0+r1] ; first block, row 1
+	punpcklqdq       xmm1, xmm1
+	pmaddubsw        xmm1, xmm7
+	movq             xmm2, [r2] ; second block, row 0
+	punpcklqdq       xmm2, xmm2
+	pmaddubsw        xmm2, xmm7
+	movq             xmm3, [r2+r3] ; second block, row 1
+	punpcklqdq       xmm3, xmm3
+	pmaddubsw        xmm3, xmm7
+	psubsw           xmm0, xmm2 ; row 0 difference
+	psubsw           xmm1, xmm3 ; row 1 difference
+	movq             xmm2, [r0+2*r1] ; first block, row 2
+	punpcklqdq       xmm2, xmm2
+	pmaddubsw        xmm2, xmm7
+	movq             xmm3, [r0+r4] ; first block, row 3
+	punpcklqdq       xmm3, xmm3
+	pmaddubsw        xmm3, xmm7
+	movq             xmm4, [r2+2*r3] ; second block, row 2
+	punpcklqdq       xmm4, xmm4
+	pmaddubsw        xmm4, xmm7
+	movq             xmm5, [r2+r5] ; second block, row 3
+	punpcklqdq       xmm5, xmm5
+	pmaddubsw        xmm5, xmm7
+	psubsw           xmm2, xmm4 ; row 2 difference
+	psubsw           xmm3, xmm5 ; row 3 difference
+	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4 ; transform across the four rows (macro defined elsewhere)
+	pabsw            xmm0, xmm0
+	pabsw            xmm2, xmm2
+	pabsw            xmm1, xmm1
+	pabsw            xmm3, xmm3
+	movdqa           xmm4, xmm3
+	pblendw          xmm3, xmm1, 0xAA ; odd words from xmm1, even words from xmm3
+	pslld            xmm1, 16 ; even words of xmm1 moved to odd positions
+	psrld            xmm4, 16 ; odd words of xmm3 moved to even positions
+	por              xmm1, xmm4 ; the complementary mix of the two registers
+	pmaxuw           xmm1, xmm3 ; keep the larger magnitude of each pair
+	paddw            xmm6, xmm1
+	movdqa           xmm4, xmm0
+	pblendw          xmm0, xmm2, 0xAA ; same pairing for xmm0 / xmm2
+	pslld            xmm2, 16
+	psrld            xmm4, 16
+	por              xmm2, xmm4
+	pmaxuw           xmm0, xmm2
+	paddw            xmm6, xmm0
+%endmacro
+
+%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE ; arg1 receives the total of the words in arg2; args 2-4 are clobbered
+	MMX_DW_1_2REG    %3, %4 ; macro defined elsewhere; arg3 is then used as the per-word multiplier
+	pmaddwd     %2, %3 ; multiply words by arg3 and add adjacent pairs -> 4 dwords
+	movhlps     %4, %2 ; bring the upper two dwords down
+	paddd       %2, %4 ; -> 2 partial sums in the low qword
+	pshuflw     %4, %2,0Eh ; move dword 1 into dword 0 of the scratch register
+	paddd       %2, %4 ; low dword of arg2 = sum of all four dwords
+	movd		%1, %2 ; deliver the low dword
+%endmacro
+;***********************************************************************
+; 4x4 SATD computed in registers (no helper tile macro).
+;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+; Arguments are (pointer, stride, pointer, stride) and are used as r0..r3.
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd4x4_sse41
+WelsSampleSatd4x4_sse41:
+	;legacy x86-32 prologue kept for reference (LOAD_4_PARA is used instead): push ebx
+	;mov         eax,[esp+8]
+	;mov         ebx,[esp+12]
+	;mov         ecx,[esp+16]
+	;mov         edx,[esp+20]
+	
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d	
+	movdqa      xmm4,[HSwapSumSubDB1] ; byte multiplier pattern for pmaddubsw (constant defined elsewhere)
+	movd        xmm2,[r2] ; second block, row 0
+	movd        xmm5,[r2+r3] ; second block, row 1
+	shufps      xmm2,xmm5,0 ; xmm2 = row0,row0,row1,row1 (each 4-byte row duplicated)
+	movd        xmm3,[r2+r3*2] ; second block, row 2
+	lea         r2, [r3*2+r2]
+	movd        xmm5,[r2+r3] ; second block, row 3
+	shufps      xmm3,xmm5,0 ; xmm3 = row2,row2,row3,row3
+	movd        xmm0,[r0] ; first block, row 0
+	movd        xmm5,[r0+r1] ; first block, row 1
+	shufps      xmm0,xmm5,0 ; xmm0 = row0,row0,row1,row1
+	movd        xmm1,[r0+r1*2] ; first block, row 2
+	lea         r0, [r1*2+r0]
+	movd        xmm5,[r0+r1] ; first block, row 3
+	shufps      xmm1,xmm5,0 ; xmm1 = row2,row2,row3,row3
+	pmaddubsw   xmm0,xmm4 ; byte pairs combined into words using the signed pattern in xmm4
+	pmaddubsw   xmm1,xmm4
+	pmaddubsw   xmm2,xmm4
+	pmaddubsw   xmm3,xmm4
+	psubw       xmm0,xmm2 ; differences, rows 0-1
+	psubw       xmm1,xmm3 ; differences, rows 2-3
+	movdqa      xmm2,xmm0
+	paddw       xmm0,xmm1 ; butterfly: sum ...
+	psubw       xmm1,xmm2 ; ... and difference of the two row pairs
+	movdqa      xmm2,xmm0
+	punpcklqdq  xmm0,xmm1 ; regroup the low qwords
+	punpckhqdq  xmm2,xmm1 ; regroup the high qwords
+	movdqa      xmm1,xmm0
+	paddw       xmm0,xmm2 ; second butterfly: sum ...
+	psubw       xmm2,xmm1 ; ... and difference
+	movdqa      xmm1,xmm0
+	pblendw     xmm0,xmm2,0AAh ; odd words from xmm2, even words from xmm0
+	pslld       xmm2,16 ; even words of xmm2 moved to odd positions
+	psrld       xmm1,16 ; odd words of the saved copy moved to even positions
+	por         xmm2,xmm1 ; the complementary mix of the two registers
+	pabsw       xmm0,xmm0
+	pabsw       xmm2,xmm2
+	pmaxsw      xmm0,xmm2 ; keep the larger magnitude of each pair
+	SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7 ; return value = sum of the 8 words
+	LOAD_4_PARA_POP
+	ret
+
+;***********************************************************************
+; 8x8 SATD: two SSE41_GetSatd8x4 tiles accumulated in xmm6, then summed.
+;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+; Arguments are (pointer, stride, pointer, stride) and are used as r0..r3; r4/r5 hold 3*stride.
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x8_sse41
+align 16
+WelsSampleSatd8x8_sse41:
+	;legacy x86-32 prologue kept for reference (LOAD_4_PARA is used instead): push ebx
+	;push   esi
+	;push   edi
+	;mov    eax,    [esp+16]
+	;mov    ebx,    [esp+20]
+	;mov    ecx,    [esp+24]
+	;mov    edx,    [esp+28]
+%ifdef X86_32	
+	push  r4 ; 32-bit builds save r4/r5 here and restore them before ret
+	push  r5
+%endif	
+	%assign  push_num 2
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d	
+	movdqa      xmm7, [HSumSubDB1] ; byte multiplier pattern consumed by SSE41_GetSatd8x4
+	lea         r4,  [r1+r1*2] ; r4 = 3 * first stride
+	lea         r5,  [r3+r3*2] ; r5 = 3 * second stride
+	pxor		xmm6, xmm6 ; accumulator
+	SSE41_GetSatd8x4
+	lea			r0,	 [r0+4*r1] ; next 4 rows
+	lea			r2,  [r2+4*r3]
+	SSE41_GetSatd8x4
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 ; return value = sum of the 8 accumulator words
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+; 8x16 SATD: four SSE41_GetSatd8x4 tiles in a counted loop, accumulated in xmm6.
+;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+; Arguments are (pointer, stride, pointer, stride) and are used as r0..r3; r4/r5 hold 3*stride, r6 counts tiles.
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x16_sse41
+align 16
+WelsSampleSatd8x16_sse41:
+	;legacy x86-32 prologue kept for reference (LOAD_4_PARA is used instead): push ebx
+	;push   esi
+	;push   edi
+	;push   ebp
+	;%define pushsize   16
+	;mov    eax,    [esp+pushsize+4]
+	;mov    ebx,    [esp+pushsize+8]
+	;mov    ecx,    [esp+pushsize+12]
+	;mov    edx,    [esp+pushsize+16]
+%ifdef X86_32	
+	push  r4 ; 32-bit builds save r4/r5/r6 here and restore them before ret
+	push  r5
+	push  r6
+%endif	
+	%assign  push_num 3
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d	
+	movdqa      xmm7, [HSumSubDB1] ; byte multiplier pattern consumed by SSE41_GetSatd8x4
+	lea         r4,  [r1+r1*2] ; r4 = 3 * first stride
+	lea         r5,  [r3+r3*2] ; r5 = 3 * second stride
+	pxor        xmm6, xmm6 ; accumulator
+	mov         r6,    0 ; tile counter
+loop_get_satd_8x16:
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1] ; next 4 rows
+	lea			r2,  [r2+4*r3]
+	inc         r6
+	cmp         r6,  4 ; 4 tiles x 4 rows = 16 rows
+	jl          loop_get_satd_8x16
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 ; return value = sum of the 8 accumulator words
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r6
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+; 16x8 SATD: two SSE41_GetSatd8x4 tiles for the left 8 columns, two for the right 8 columns.
+;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+; Arguments are (pointer, stride, pointer, stride) and are used as r0..r3; r4/r5 hold 3*stride.
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x8_sse41
+align 16
+WelsSampleSatd16x8_sse41:
+	;legacy x86-32 prologue kept for reference (LOAD_4_PARA is used instead): push ebx
+	;push   esi
+	;push   edi
+	;mov    eax,    [esp+16]
+	;mov    ebx,    [esp+20]
+	;mov    ecx,    [esp+24]
+	;mov    edx,    [esp+28]
+%ifdef X86_32	
+	push  r4 ; 32-bit builds save r4/r5 here and restore them before ret
+	push  r5
+%endif	
+	%assign  push_num 2
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d	
+	push  r0 ; remember both block origins; they are restored before the right half
+	push  r2
+	
+	movdqa      xmm7, [HSumSubDB1] ; byte multiplier pattern consumed by SSE41_GetSatd8x4
+	lea         r4,  [r1+r1*2] ; r4 = 3 * first stride
+	lea         r5,  [r3+r3*2] ; r5 = 3 * second stride
+	pxor		xmm6,   xmm6 ; accumulator
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1] ; next 4 rows
+	lea			r2,  [r2+4*r3]
+	SSE41_GetSatd8x4
+	
+	pop  r2 ; back to the top-left corner of each block
+	pop  r0
+	;legacy reload of the first pointer from the stack: mov eax, [esp+16]
+	;mov			ecx,    [esp+24]
+	add			r0,    8 ; step 8 bytes right to the second 8-column half
+	add			r2,    8
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	SSE41_GetSatd8x4
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 ; return value = sum of the 8 accumulator words
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+; 16x16 SATD: a 4-tile loop over the left 8 columns, then a 4-tile loop over the right 8 columns.
+;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+; Arguments are (pointer, stride, pointer, stride) and are used as r0..r3; r4/r5 hold 3*stride, r6 counts tiles.
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSatd16x16_sse41
+align 16
+WelsSampleSatd16x16_sse41:
+	;legacy x86-32 prologue kept for reference (LOAD_4_PARA is used instead): push ebx
+	;push   esi
+	;push   edi
+	;push   ebp
+	;%define pushsize   16
+	;mov    eax,    [esp+pushsize+4]
+	;mov    ebx,    [esp+pushsize+8]
+	;mov    ecx,    [esp+pushsize+12]
+	;mov    edx,    [esp+pushsize+16]
+%ifdef X86_32	
+	push  r4 ; 32-bit builds save r4/r5/r6 here and restore them before ret
+	push  r5
+	push  r6
+%endif	
+	%assign  push_num 3
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	
+	push  r0 ; remember both block origins; they are restored before the right half
+	push  r2
+	
+	movdqa      xmm7, [HSumSubDB1] ; byte multiplier pattern consumed by SSE41_GetSatd8x4
+	lea         r4,  [r1+r1*2] ; r4 = 3 * first stride
+	lea         r5,  [r3+r3*2] ; r5 = 3 * second stride
+	pxor		xmm6,   xmm6 ; accumulator shared by both loops
+	mov         r6,    0 ; tile counter
+loop_get_satd_16x16_left:
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1] ; next 4 rows
+	lea			r2,  [r2+4*r3]
+	inc         r6
+	cmp         r6,  4 ; 4 tiles x 4 rows = 16 rows
+	jl          loop_get_satd_16x16_left
+
+	pop  r2 ; back to the top-left corner of each block
+	pop  r0	
+	;legacy reload of the first pointer from the stack: mov eax, [esp+pushsize+4]
+	;mov			ecx,    [esp+pushsize+12]
+	add			r0,    8 ; step 8 bytes right to the second 8-column half
+	add			r2,    8
+	mov         r6,    0 ; restart the tile counter
+loop_get_satd_16x16_right:
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	inc         r6
+	cmp         r6,  4
+	jl          loop_get_satd_16x16_right
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7 ; return value = sum of the 8 accumulator words
+	;%undef pushsize
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r6
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 BEGIN
+;
+;***********************************************************************
+
+%macro SSE2_GetSad2x16 0 ; advances r0/r2 by two rows first, then adds the SAD of the next two 16-byte rows into xmm0 (xmm1/xmm2 are scratch)
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqu xmm1,   [r2] ; unaligned load from the second block
+	MOVDQ  xmm2,   [r0];[r0] (formerly eax) must be 16-byte aligned
+	psadbw xmm1,   xmm2 ; two partial sums, one per qword
+	paddw  xmm0,   xmm1
+	movdqu xmm1,   [r2+r3]
+	MOVDQ  xmm2,   [r0+r1]
+	psadbw xmm1,   xmm2
+	paddw  xmm0,   xmm1
+%endmacro
+
+
+%macro SSE2_GetSad4x16 0 ; adds the SAD of four 16-byte rows into xmm7; r4/r5 must hold 3*r1 / 3*r3; pointers are not advanced
+	movdqu xmm0,   [r2] ; row 0, unaligned load from the second block
+	MOVDQ  xmm2,   [r0] ; row 0 of the first block (MOVDQ is defined elsewhere)
+	psadbw xmm0,   xmm2 ; two partial sums, one per qword
+	paddw  xmm7,   xmm0
+	movdqu xmm1,   [r2+r3] ; row 1
+	MOVDQ  xmm2,   [r0+r1]
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1
+	movdqu xmm1,   [r2+2*r3] ; row 2
+	MOVDQ  xmm2,   [r0+2*r1];[r0] (formerly eax) must be 16-byte aligned
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1
+	movdqu xmm1,   [r2+r5] ; row 3
+	MOVDQ  xmm2,   [r0+r4]
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1
+%endmacro
+
+
+%macro SSE2_GetSad8x4 0 ; adds the SAD of four 8-byte rows into xmm6; leaves r0/r2 advanced by two rows
+	movq   xmm0,   [r0] ; first block: row 0 -> low half of xmm0
+	movq   xmm1,   [r0+r1] ; row 1 -> low half of xmm1
+	lea    r0,     [r0+2*r1]
+	movhps xmm0,   [r0] ; row 2 -> high half of xmm0
+	movhps xmm1,   [r0+r1] ; row 3 -> high half of xmm1
+
+	movq   xmm2,   [r2] ; second block, same packing
+	movq   xmm3,   [r2+r3]
+	lea    r2,     [r2+2*r3]
+	movhps xmm2,   [r2]
+	movhps xmm3,   [r2+r3]
+	psadbw xmm0,   xmm2 ; rows 0 and 2, one partial sum per qword
+	psadbw xmm1,   xmm3 ; rows 1 and 3
+	paddw  xmm6,   xmm0
+	paddw  xmm6,   xmm1
+%endmacro
+
+;***********************************************************************
+; 16x16 SAD: four SSE2_GetSad4x16 passes accumulated in xmm7.
+;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
+;The first pointer can be 16-byte aligned (it is read through MOVDQ);
+;in wels the third parameter is not guaranteed to be 16-byte aligned (read with movdqu).
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSad16x16_sse2
+align 16
+WelsSampleSad16x16_sse2:
+	;legacy x86-32 prologue kept for reference (LOAD_4_PARA is used instead): push ebx
+	;push edi
+	;push esi
+	;%define _STACK_SIZE		12
+	;mov eax, [esp+_STACK_SIZE+4 ]
+	;mov	ebx, [esp+_STACK_SIZE+8 ]	
+	;mov ecx, [esp+_STACK_SIZE+12]
+	;mov edx, [esp+_STACK_SIZE+16]
+%ifdef X86_32
+	push  r4 ; 32-bit builds save r4/r5 here and restore them before ret
+	push  r5
+%endif	
+
+	%assign  push_num 2
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+	lea r4, [3*r1] ; r4 = 3 * first stride
+	lea r5, [3*r3] ; r5 = 3 * second stride
+
+	pxor   xmm7,   xmm7 ; accumulator
+	SSE2_GetSad4x16
+	lea	   r0,  [r0+4*r1] ; next 4 rows
+	lea	   r2,  [r2+4*r3]
+	SSE2_GetSad4x16
+	lea	   r0,  [r0+4*r1]
+	lea	   r2,  [r2+4*r3]
+	SSE2_GetSad4x16
+	lea	   r0,  [r0+4*r1]
+	lea	   r2,  [r2+4*r3]
+	SSE2_GetSad4x16
+	movhlps xmm0, xmm7 ; fold the high-qword partial sum onto the low one
+	paddw xmm0, xmm7
+	movd retrd, xmm0 ; return value = low dword
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+; 16x8 SAD: rows 0-1 inline, rows 2-7 through three SSE2_GetSad2x16 passes, all in xmm0.
+;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
+;The first pointer can be 16-byte aligned (it is read through MOVDQ);
+;in wels the third parameter is not guaranteed to be 16-byte aligned (read with movdqu).
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSad16x8_sse2
+align 16
+WelsSampleSad16x8_sse2:
+	;legacy x86-32 prologue kept for reference (LOAD_4_PARA is used instead): push ebx
+	;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    ecx,    [esp+16]
+	;mov    edx,    [esp+20]
+	
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d	
+	movdqu xmm0,   [r2] ; row 0 of the second block
+	MOVDQ  xmm2,   [r0] ; row 0 of the first block
+	psadbw xmm0,   xmm2 ; xmm0 starts the accumulation (no separate zeroing needed)
+	movdqu xmm1,   [r2+r3] ; row 1
+	MOVDQ  xmm2,   [r0+r1]
+	psadbw xmm1,   xmm2
+	paddw  xmm0,   xmm1
+
+	SSE2_GetSad2x16
+	SSE2_GetSad2x16
+	SSE2_GetSad2x16
+
+	movhlps     xmm1, xmm0 ; fold the high-qword partial sum onto the low one
+	paddw       xmm0, xmm1
+	movd        retrd,  xmm0 ; return value = low dword
+	LOAD_4_PARA_POP
+	ret
+
+
+
+WELS_EXTERN WelsSampleSad8x16_sse2
+WelsSampleSad8x16_sse2: ; 8x16 SAD: four SSE2_GetSad8x4 passes accumulated in xmm6; arguments (pointer, stride, pointer, stride) in r0..r3
+	;legacy x86-32 prologue kept for reference (LOAD_4_PARA is used instead): push ebx
+	;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    ecx,    [esp+16]
+	;mov    edx,    [esp+20]
+	
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d
+    pxor   xmm6,   xmm6 ; accumulator
+
+	SSE2_GetSad8x4
+	lea    r0,    [r0+2*r1] ; the macro already moved two rows; two more reach the next 4-row group
+	lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	SSE2_GetSad8x4
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
+
+    movhlps    xmm0, xmm6 ; fold the high-qword partial sum onto the low one
+	paddw      xmm0, xmm6
+	movd       retrd,  xmm0 ; return value = low dword
+	LOAD_4_PARA_POP
+	ret
+
+
+%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline ; destroys the address register and leaves flags: "greater" means a width-byte read at that address crosses a cache-line boundary
+and    %1,  0x1f|(%3>>1) ; keep the offset inside the line (0x3f for a 64-byte line)
+cmp    %1,  (32-%2)|(%3>>1) ; largest offset that still fits (56 for width 8, line 64)
+%endmacro
+
+WELS_EXTERN WelsSampleSad8x8_sse21
+WelsSampleSad8x8_sse21: ; 8x8 SAD with a special path for a second block (arg3) whose 8-byte rows straddle a 64-byte cache line
+    ;legacy x86-32 version of the split test, kept for reference: mov ecx, [esp+12]
+	;mov    edx,    ecx
+    ;CACHE_SPLIT_CHECK edx, 8, 64
+	;jle    near   .pixel_sad_8x8_nsplit
+	;push   ebx
+	;push   edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+	
+	%assign  push_num 0
+	mov		r2,  arg3 ; second block pointer
+	push	r2 ; saved because CACHE_SPLIT_CHECK destroys r2
+	CACHE_SPLIT_CHECK r2, 8, 64
+	jle    near   .pixel_sad_8x8_nsplit ; offset <= 56: rows stay inside one line, take the plain path
+	pop		r2 ; pop leaves the flags alone; r2 is the pointer again
+%ifdef X86_32	
+	push	r3 ; 32-bit builds save r3/r4/r5 here and restore them before leaving this path
+	push	r4
+	push	r5
+%endif
+	%assign  push_num 3
+	mov		r0,  arg1 ; first block pointer
+	mov		r1,  arg2	
+	SIGN_EXTENTION r1, r1d
+    pxor   xmm7,   xmm7 ; accumulator
+    
+    ;ecx r2, edx r4, edi r5
+
+    mov    r5,    r2
+    and    r5,    0x07 ; r5 = misalignment of r2 within 8 bytes
+    sub    r2,    r5 ; r2 rounded down to an 8-byte boundary
+    mov    r4,    8
+    sub    r4,    r5 ; r4 = 8 - misalignment
+
+    shl    r5,    3 ; both counts converted from bytes to bits
+    shl    r4,    3
+    movd   xmm5,   r5d ; right-shift count for the lower qword
+    movd   xmm6,   r4d ; left-shift count for the upper qword
+	mov    r5,    8
+	add    r5,    r2 ; r5 = address of the following aligned qword
+    mov    r3,    arg4
+	SIGN_EXTENTION r3, r3d
+    movq   xmm0,   [r0] ; rows 0-1 of the first block
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2] ; lower aligned qwords of rows 0-1
+	movq   xmm2,   [r5] ; upper aligned qwords of rows 0-1
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5 ; drop the bytes that precede the wanted data
+	psllq  xmm2,   xmm6 ; move the remaining bytes into place
+	por    xmm1,   xmm2 ; = the 8 unaligned bytes of each row
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    r0,    [r0+2*r1] ; rows 2-3, same sequence
+	lea    r2,    [r2+2*r3]
+	lea    r5,    [r5+2*r3]
+
+    movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    r0,    [r0+2*r1] ; rows 4-5
+	lea    r2,    [r2+2*r3]
+	lea    r5,    [r5+2*r3]
+
+    movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    r0,    [r0+2*r1] ; rows 6-7
+	lea    r2,    [r2+2*r3]
+	lea    r5,    [r5+2*r3]
+
+    movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+    movhlps    xmm0, xmm7 ; fold the high-qword partial sum onto the low one
+	paddw      xmm0, xmm7
+	movd       retrd,  xmm0 ; return value = low dword
+%ifdef X86_32
+	pop	 r5
+	pop	 r4
+	pop	 r3
+%endif
+	jmp        .return
+	
+.pixel_sad_8x8_nsplit:
+    ;legacy x86-32 prologue of the plain path, kept for reference: push ebx
+    ;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    edx,    [esp+20]
+	
+	pop r2 ; undo the push made before the split test
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d	
+	pxor   xmm6,   xmm6 ; accumulator
+	SSE2_GetSad8x4
+    lea    r0,    [r0+2*r1] ; the macro already moved two rows; two more reach rows 4-7
+	lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
+    movhlps    xmm0, xmm6 ; fold the high-qword partial sum onto the low one
+	paddw      xmm0, xmm6
+	movd       retrd,  xmm0 ; return value = low dword
+	LOAD_4_PARA_POP
+.return:
+	ret
+
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 END
+;
+;***********************************************************************
+
+
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 BEGIN
+;
+;***********************************************************************
+
+
+%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address -- source row above / current / below, reference row (register is overwritten), address of that reference row
+	psadbw %1,   %4	; source row above vs. reference row (overwrites %1)
+	paddw  xmm5, %1	; -> xmm5
+	psadbw %4,   %3	; reference row vs. source row below (overwrites %4)
+	paddw  xmm4, %4	; -> xmm4
+	movdqu %4,   [%5-1]	; reference row read one byte to the left
+	psadbw %4,   %2
+	paddw  xmm6, %4	; -> xmm6
+	movdqu %4,   [%5+1]	; reference row read one byte to the right
+	psadbw %4,   %2
+	paddw  xmm7, %4	; -> xmm7
+%endmacro
+WELS_EXTERN WelsSampleSadFour16x16_sse2
+WelsSampleSadFour16x16_sse2:	; four SADs of the 16x16 block at r0 (stride r1) against the reference at r2 (stride r3) moved one row up, one row down, one pixel left, one pixel right; results stored at [r4]
+	;push ebx
+	;mov    eax,    [esp+8]
+	;mov    ebx,    [esp+12]
+	;mov    ecx,    [esp+16]
+	;mov    edx,    [esp+20]
+	
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d		
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movdqa xmm0,   [r0]	; source row 0 (movdqa: the source rows must be 16-byte aligned)
+	sub    r2,    r3	; r2 = reference row -1
+	movdqu xmm3,   [r2]
+	psadbw xmm3,   xmm0	; reference row -1 vs. source row 0
+	paddw  xmm4,   xmm3
+
+	movdqa xmm1,   [r0+r1]	; source row 1
+	movdqu xmm3,   [r2+r3]
+	psadbw xmm3,   xmm1	; reference row 0 vs. source row 1
+	paddw  xmm4,   xmm3
+
+	movdqu xmm2,   [r2+r3-1]	; reference row 0, one pixel left, vs. source row 0
+	psadbw xmm2,   xmm0
+	paddw  xmm6,   xmm2
+
+	movdqu xmm3,   [r2+r3+1]	; reference row 0, one pixel right, vs. source row 0
+	psadbw xmm3,   xmm0
+	paddw  xmm7,   xmm3
+
+	lea    r0,    [r0+2*r1]	; r0 = source row 2
+	lea    r2,    [r2+2*r3]	; r2 = reference row 1
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm1,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+	movdqa xmm2,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm0,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm1,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+	movdqa xmm2,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm0,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	lea    r2,    [r2+2*r3]	; r2 = reference row 15; xmm2 / xmm0 hold source rows 14 / 15
+	movdqu xmm3,   [r2]
+	psadbw xmm2,   xmm3	; source row 14 vs. reference row 15
+	paddw xmm5,   xmm2
+
+	movdqu xmm2,   [r2-1]	; reference row 15, one pixel left, vs. source row 15
+	psadbw xmm2,   xmm0
+	paddw xmm6,   xmm2
+
+	movdqu xmm3,   [r2+1]	; reference row 15, one pixel right, vs. source row 15
+	psadbw xmm3,   xmm0
+	paddw xmm7,   xmm3
+
+	movdqu xmm3,   [r2+r3]	; reference row 16 vs. source row 15
+	psadbw xmm0,   xmm3
+	paddw xmm5,   xmm0
+
+	;mov        ecx,  [esp+24]
+	movhlps    xmm0, xmm4	; for each accumulator: add its upper 64-bit partial sum to the lower one
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5	; low dwords: -stride, +stride
+	punpckldq  xmm6, xmm7	; low dwords: -1, +1
+	punpcklqdq xmm4, xmm6	; xmm4 = { -stride, +stride, -1, +1 }
+	movdqa     [r4],xmm4	; aligned 16-byte store of the four sums (the buffer at r4 must be 16-byte aligned)
+	LOAD_5_PARA_POP
+	ret
+
+
+WELS_EXTERN WelsSampleSadFour16x8_sse2
+WelsSampleSadFour16x8_sse2:	; four SADs of the 16x8 block at r0 (stride r1) against the reference at r2 (stride r3) moved one row up, one row down, one pixel left, one pixel right; results stored at [r4]
+	;push ebx
+	;push edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+	;mov    edi,    [esp+20]
+	;mov    edx,    [esp+24]
+	
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d		
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movdqa xmm0,   [r0]	; source row 0 (movdqa: the source rows must be 16-byte aligned)
+	sub    r2,    r3	; r2 = reference row -1
+	movdqu xmm3,   [r2]
+	psadbw xmm3,   xmm0	; reference row -1 vs. source row 0
+	paddw xmm4,   xmm3
+
+	movdqa xmm1,   [r0+r1]	; source row 1
+	movdqu xmm3,   [r2+r3]
+	psadbw xmm3,   xmm1	; reference row 0 vs. source row 1
+	paddw xmm4,   xmm3
+
+	movdqu xmm2,   [r2+r3-1]	; reference row 0, one pixel left, vs. source row 0
+	psadbw xmm2,   xmm0
+	paddw xmm6,   xmm2
+
+	movdqu xmm3,   [r2+r3+1]	; reference row 0, one pixel right, vs. source row 0
+	psadbw xmm3,   xmm0
+	paddw xmm7,   xmm3
+
+	lea    r0,    [r0+2*r1]	; r0 = source row 2
+	lea    r2,    [r2+2*r3]	; r2 = reference row 1
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm1,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+	movdqa xmm2,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm0,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+	lea    r2,    [r2+2*r3]	; r2 = reference row 7; xmm0 / xmm1 hold source rows 6 / 7
+	movdqu xmm3,   [r2]
+	psadbw xmm0,   xmm3	; source row 6 vs. reference row 7
+	paddw xmm5,   xmm0
+
+	movdqu xmm0,   [r2-1]	; reference row 7, one pixel left, vs. source row 7
+	psadbw xmm0,   xmm1
+	paddw xmm6,   xmm0
+
+	movdqu xmm3,   [r2+1]	; reference row 7, one pixel right, vs. source row 7
+	psadbw xmm3,   xmm1
+	paddw xmm7,   xmm3
+
+	movdqu xmm3,   [r2+r3]	; reference row 8 vs. source row 7
+	psadbw xmm1,   xmm3
+	paddw xmm5,   xmm1
+
+	;mov        edi,  [esp+28]
+	movhlps    xmm0, xmm4	; for each accumulator: add its upper 64-bit partial sum to the lower one
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5	; low dwords: -stride, +stride
+	punpckldq  xmm6, xmm7	; low dwords: -1, +1
+	punpcklqdq xmm4, xmm6	; xmm4 = { -stride, +stride, -1, +1 }
+	movdqa     [r4],xmm4	; aligned 16-byte store of the four sums (the buffer at r4 must be 16-byte aligned)
+	LOAD_5_PARA_POP
+	ret
+
+WELS_EXTERN WelsSampleSadFour8x16_sse2
+WelsSampleSadFour8x16_sse2:	; four SADs of the 8x16 block at r0 (stride r1) against the reference at r2 (stride r3) moved one row up, one row down, one pixel left, one pixel right; results stored at [r4]
+	;push ebx
+	;push edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+	;mov    edi,    [esp+20]
+	;mov    edx,    [esp+24]
+	
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d		
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movq   xmm0,   [r0]	; source rows 0-1, one row per 64-bit half
+	movhps xmm0,   [r0+r1]
+	sub    r2,    r3	; r2 = reference row -1
+	movq   xmm3,   [r2]	; reference rows -1 and 0
+	movhps xmm3,   [r2+r3]
+	psadbw xmm3,   xmm0
+	paddw  xmm4,   xmm3
+
+	movq   xmm1,  [r2+r3-1]	; reference row 0 read one pixel left / right
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]	; r0 = source row 2
+	lea    r2,    [r2+2*r3]	; r2 = reference row 1
+	movhps xmm1,  [r2-1]	; reference row 1 read one pixel left / right
+	movhps xmm3,  [r2+1]
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]	; reference rows 1-2 vs. source rows 0-1
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]	; next two source rows; xmm3 still holds the reference rows one line above them
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]	; the same sequence repeats for every following pair of rows
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]	; last step: the final two source rows against the reference rows one line below them
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	;mov        edi,  [esp+28]
+	movhlps    xmm0, xmm4	; for each accumulator: add its upper 64-bit partial sum to the lower one
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5	; low dwords: -stride, +stride
+	punpckldq  xmm6, xmm7	; low dwords: -1, +1
+	punpcklqdq xmm4, xmm6	; xmm4 = { -stride, +stride, -1, +1 }
+	movdqa     [r4],xmm4	; aligned 16-byte store of the four sums (the buffer at r4 must be 16-byte aligned)
+	LOAD_5_PARA_POP
+	ret
+
+
+WELS_EXTERN WelsSampleSadFour8x8_sse2
+WelsSampleSadFour8x8_sse2:	; four SADs of the 8x8 block at r0 (stride r1) against the reference at r2 (stride r3) moved one row up, one row down, one pixel left, one pixel right; results stored at [r4]
+	;push ebx
+	;push edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+	;mov    edi,    [esp+20]
+	;mov    edx,    [esp+24]
+	
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d		
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movq   xmm0,   [r0]	; source rows 0-1, one row per 64-bit half
+	movhps xmm0,   [r0+r1]
+	sub    r2,    r3	; r2 = reference row -1
+	movq   xmm3,   [r2]	; reference rows -1 and 0
+	movhps xmm3,   [r2+r3]
+	psadbw xmm3,   xmm0
+	paddw  xmm4,   xmm3
+
+	movq   xmm1,  [r2+r3-1]	; reference row 0 read one pixel left / right
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]	; r0 = source row 2
+	lea    r2,    [r2+2*r3]	; r2 = reference row 1
+	movhps xmm1,  [r2-1]	; reference row 1 read one pixel left / right
+	movhps xmm3,  [r2+1]
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]	; reference rows 1-2 vs. source rows 0-1
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]	; next two source rows; xmm3 still holds the reference rows one line above them
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]	; the same sequence repeats for every following pair of rows
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]	; last step: the final two source rows against the reference rows one line below them
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	;mov        edi,  [esp+28]
+	movhlps    xmm0, xmm4	; for each accumulator: add its upper 64-bit partial sum to the lower one
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5	; low dwords: -stride, +stride
+	punpckldq  xmm6, xmm7	; low dwords: -1, +1
+	punpcklqdq xmm4, xmm6	; xmm4 = { -stride, +stride, -1, +1 }
+	movdqa     [r4],xmm4	; aligned 16-byte store of the four sums (the buffer at r4 must be 16-byte aligned)
+	LOAD_5_PARA_POP
+	ret
+
+WELS_EXTERN WelsSampleSadFour4x4_sse2
+WelsSampleSadFour4x4_sse2:	; four SADs of the 4x4 block at r0 (stride r1) against the reference at r2 (stride r3) moved one row up, one row down, one pixel left, one pixel right; results stored at [r4]
+	;push ebx
+	;push edi
+	;mov    eax,    [esp+12]
+	;mov    ebx,    [esp+16]
+	;mov    edi,    [esp+20]
+	;mov    edx,    [esp+24]
+	
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d		
+	movd   xmm0,   [r0]	; gather the four 4-byte source rows ...
+	movd   xmm1,   [r0+r1]
+	lea        r0,    [r0+2*r1]
+	movd       xmm2,   [r0]
+	movd       xmm3,   [r0+r1]
+	punpckldq  xmm0, xmm1
+	punpckldq  xmm2, xmm3
+	punpcklqdq xmm0, xmm2	; ... into xmm0 = source rows 0,1,2,3
+	sub        r2,  r3	; r2 = reference row -1
+	movd       xmm1, [r2]
+	movd       xmm2, [r2+r3]
+	punpckldq  xmm1, xmm2	; xmm1 = reference rows -1,0
+	movd       xmm2, [r2+r3-1]	; reference row 0, one pixel left
+	movd       xmm3, [r2+r3+1]	; reference row 0, one pixel right
+
+	lea        r2,  [r2+2*r3]	; r2 = reference row 1
+
+	movd       xmm4, [r2]
+	movd       xmm5, [r2-1]
+	punpckldq  xmm2, xmm5
+	movd       xmm5, [r2+1]
+	punpckldq  xmm3, xmm5
+
+	movd       xmm5, [r2+r3]
+	punpckldq  xmm4, xmm5	; xmm4 = reference rows 1,2
+
+	punpcklqdq xmm1, xmm4 ;-L  (reference rows -1,0,1,2)
+
+	movd       xmm5, [r2+r3-1]
+	movd       xmm6, [r2+r3+1]
+
+	lea        r2,  [r2+2*r3]	; r2 = reference row 3
+	movd       xmm7, [r2-1]
+	punpckldq  xmm5, xmm7
+	punpcklqdq xmm2, xmm5 ;-1  (reference rows 0..3 read one pixel left)
+	movd       xmm7, [r2+1]
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm3, xmm6 ;+1  (reference rows 0..3 read one pixel right)
+	movd       xmm6, [r2]
+	movd       xmm7, [r2+r3]
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6 ;+L  (reference rows 1,2,3,4)
+	psadbw     xmm1, xmm0	; one psadbw per displaced reference against the whole source block
+	psadbw     xmm2, xmm0
+	psadbw     xmm3, xmm0
+	psadbw     xmm4, xmm0
+
+	movhlps    xmm0, xmm1	; for each result: add its upper 64-bit partial sum to the lower one
+	paddw      xmm1, xmm0
+	movhlps    xmm0, xmm2
+	paddw      xmm2, xmm0
+	movhlps    xmm0, xmm3
+	paddw      xmm3, xmm0
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	;mov        edi,  [esp+28]
+	punpckldq  xmm1, xmm4	; low dwords: -L, +L
+	punpckldq  xmm2, xmm3	; low dwords: -1, +1
+	punpcklqdq xmm1, xmm2	; xmm1 = { -L, +L, -1, +1 }
+	movdqa     [r4],xmm1	; aligned 16-byte store of the four sums (the buffer at r4 must be 16-byte aligned)
+	LOAD_5_PARA_POP
+	ret
+
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 END
+;
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSad4x4_mmx
+
+align 16
+;***********************************************************************
+;   int32_t __cdecl WelsSampleSad4x4_mmx (uint8_t *pix1, int32_t pix1stride, uint8_t *pix2, int32_t pix2stride ) -- returns the SAD of two 4x4 blocks
+;***********************************************************************
+WelsSampleSad4x4_mmx:
+    ;push    ebx
+	;%define pushsize     4
+	;%define pix1address	 esp+pushsize+4
+	;%define pix1stride   esp+pushsize+8
+	;%define pix2address  esp+pushsize+12
+	;%define pix2stride   esp+pushsize+16
+    ;mov		  eax, [pix1address]
+    ;mov		  ebx, [pix1stride ]
+    ;mov		  ecx, [pix2address]
+    ;mov		  edx, [pix2stride ]
+    
+    %assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d	
+	movd	  mm0, [r0]	; rows 0-1 of the first block packed into one MMX register
+	movd	  mm1, [r0+r1]
+	punpckldq mm0, mm1
+
+	movd      mm3, [r2]	; rows 0-1 of the second block
+	movd      mm4, [r2+r3]
+	punpckldq mm3, mm4
+	psadbw    mm0, mm3	; mm0 = SAD of rows 0-1
+
+	lea       r0, [r0+2*r1]	; advance both blocks by two rows
+	lea       r2, [r2+2*r3]
+
+	movd      mm1, [r0]	; rows 2-3 of the first block
+	movd      mm2, [r0+r1]
+	punpckldq mm1, mm2
+
+	movd      mm3, [r2]	; rows 2-3 of the second block
+	movd      mm4, [r2+r3]
+	punpckldq mm3, mm4
+	psadbw    mm1, mm3	; mm1 = SAD of rows 2-3
+	paddw     mm0, mm1	; total over all four rows
+
+    movd      retrd, mm0
+
+	WELSEMMS
+    LOAD_4_PARA_POP
+    ret
--- a/codec/encoder/core/asm/score.asm
+++ b/codec/encoder/core/asm/score.asm
@@ -42,8 +42,6 @@
 
 %include "asm_inc.asm"
 
-bits 32
-
 ;***********************************************************************
 ; Macros
 ;***********************************************************************
@@ -171,25 +169,34 @@
 ALIGN 16
 WELS_EXTERN WelsScan4x4DcAc_sse2
 WelsScan4x4DcAc_sse2:
-
-	mov        eax, [esp+8]
-	movdqa     xmm0, [eax]			; 7 6 5 4 3 2 1 0
-	movdqa     xmm1, [eax+16]		; f e d c b a 9 8
-	pextrw     ecx, xmm0, 7			; ecx = 7
-	pextrw     edx, xmm1, 2			; edx = a
-	pextrw     eax, xmm0, 5			; eax = 5
-	pinsrw     xmm1, ecx, 2			; f e d c b 7 9 8
-	pinsrw     xmm0, eax, 7			; 5 6 5 4 3 2 1 0
-	pextrw     ecx, xmm1, 0			; ecx = 8
-	pinsrw     xmm0, ecx, 5			; 5 6 8 4 3 2 1 0
-	pinsrw     xmm1, edx, 0			; f e d c b 7 9 a
+	%ifdef X86_32
+	push r3	; r3 is used as scratch below; only the X86_32 build saves and restores it
+	%assign push_num 1
+	%else
+	%assign push_num 0
+	%endif
+	LOAD_2_PARA
+	;mov        eax, [esp+8]
+	movdqa     xmm0, [r1]			; 7 6 5 4 3 2 1 0  (r1 replaces the legacy second-argument load: source coefficients)
+	movdqa     xmm1, [r1+16]		; f e d c b a 9 8
+	pextrw     r2d, xmm0, 7			; r2d = 7 (ecx in the legacy code)
+	pextrw     r3d, xmm1, 2			; r3d = a (edx in the legacy code)
+	pextrw     r1d, xmm0, 5			; r1d = 5 (eax in the legacy code); the source pointer in r1 is not needed after this
+	pinsrw     xmm1, r2d, 2			; f e d c b 7 9 8
+	pinsrw     xmm0, r1d, 7			; 5 6 5 4 3 2 1 0
+	pextrw     r2d, xmm1, 0			; r2d = 8
+	pinsrw     xmm0, r2d, 5			; 5 6 8 4 3 2 1 0
+	pinsrw     xmm1, r3d, 0			; f e d c b 7 9 a
 	pshufd     xmm2, xmm0, 0xd8		; 5 6 3 2 8 4 1 0
 	pshufd     xmm3, xmm1, 0xd8		; f e b 7 d c 9 a
 	pshufhw    xmm0, xmm2, 0x93		; 6 3 2 5 8 4 1 0
 	pshuflw    xmm1, xmm3, 0x39		; f e b 7 a d c 9
-	mov        eax,  [esp+4]
-	movdqa     [eax],xmm0
-	movdqa     [eax+16], xmm1
+	;mov        eax,  [esp+4]
+	movdqa     [r0],xmm0	; store the reordered coefficients (r0 replaces the legacy first-argument load: destination)
+	movdqa     [r0+16], xmm1
+	%ifdef X86_32
+	pop r3
+	%endif
 	ret
 
 ;***********************************************************************
@@ -198,19 +205,21 @@
 ALIGN 16
 WELS_EXTERN WelsScan4x4DcAc_ssse3
 WelsScan4x4DcAc_ssse3:
-	mov        eax, [esp+8]
-	movdqa     xmm0, [eax]
-	movdqa     xmm1, [eax+16]
-	pextrw		ecx,  xmm0, 7			; ecx = [7]
-	pextrw		eax,  xmm1, 0			; eax = [8]
-	pinsrw		xmm0, eax, 7			; xmm0[7]	=	[8]
-	pinsrw		xmm1, ecx, 0			; xmm1[0]	=	[7]
+	%assign push_num 0
+	LOAD_2_PARA
+	;mov        eax, [esp+8]
+	movdqa     xmm0, [r1]	; coefficients 0-7 (r1 replaces the legacy second-argument load: source)
+	movdqa     xmm1, [r1+16]	; coefficients 8-15
+	pextrw		r2d,  xmm0, 7			; r2d = [7] (ecx in the legacy code)
+	pextrw		r1d,  xmm1, 0			; r1d = [8] (eax in the legacy code); the source pointer in r1 is not needed after this
+	pinsrw		xmm0, r1d, 7			; xmm0[7]	=	[8]
+	pinsrw		xmm1, r2d, 0			; xmm1[0]	=	[7]
 	pshufb		xmm1, [pb_scanacdc_maskb]
 	pshufb		xmm0, [pb_scanacdc_maska]
 
-	mov        eax,  [esp+4]
-	movdqa     [eax],xmm0
-	movdqa     [eax+16], xmm1
+	;mov        eax,  [esp+4]
+	movdqa     [r0],xmm0	; store the reordered coefficients (r0 replaces the legacy first-argument load: destination)
+	movdqa     [r0+16], xmm1
 	ret
 ;***********************************************************************
 ;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
@@ -218,9 +227,11 @@
 ALIGN 16
 WELS_EXTERN WelsScan4x4Ac_sse2
 WelsScan4x4Ac_sse2:
-	mov        eax, [esp+8]
-	movdqa     xmm0, [eax]
-	movdqa     xmm1, [eax+16]
+	%assign push_num 0
+	LOAD_2_PARA
+	;mov        eax, [esp+8]
+	movdqa     xmm0, [r1]	; coefficients 0-7 (r1 replaces the legacy second-argument load: pDct)
+	movdqa     xmm1, [r1+16]	; coefficients 8-15
 	movdqa     xmm2, xmm0
 	punpcklqdq xmm0, xmm1
 	punpckhqdq xmm2, xmm1
@@ -228,14 +239,14 @@
 	movdqa     xmm3, xmm0
 	punpckldq  xmm0, xmm2
 	punpckhdq  xmm3, xmm2
-	pextrw     eax , xmm0, 3
-	pextrw     edx , xmm0, 7
-	pinsrw     xmm0, eax,  7
-	pextrw     eax,  xmm3, 4
-	pinsrw     xmm3, edx,  4
-	pextrw     edx,  xmm3, 0
-	pinsrw     xmm3, eax,  0
-	pinsrw     xmm0, edx,  3
+	pextrw     r1d , xmm0, 3	; r1d / r2d take over the roles of eax / edx in the legacy code
+	pextrw     r2d , xmm0, 7
+	pinsrw     xmm0, r1d,  7	; xmm0 word 7 = old xmm0 word 3
+	pextrw     r1d,  xmm3, 4
+	pinsrw     xmm3, r2d,  4	; xmm3 word 4 = old xmm0 word 7
+	pextrw     r2d,  xmm3, 0
+	pinsrw     xmm3, r1d,  0	; xmm3 word 0 = old xmm3 word 4
+	pinsrw     xmm0, r2d,  3	; xmm0 word 3 = old xmm3 word 0
 
 	pshufhw    xmm1, xmm0, 0x93
 	pshuflw    xmm2, xmm3, 0x39
@@ -245,9 +256,9 @@
     pslldq     xmm3, 14
     por        xmm1, xmm3
     psrldq     xmm2, 2
-	mov        eax,  [esp+4]
-	movdqa     [eax],xmm1
-	movdqa     [eax+16], xmm2
+	;mov        eax,  [esp+4]
+	movdqa     [r0],xmm1	; store the result (r0 replaces the legacy first-argument load: zig_value)
+	movdqa     [r0+16], xmm2
 	ret
 
 
@@ -257,44 +268,60 @@
 ALIGN 16
 WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
 WelsCalculateSingleCtr4x4_sse2:
-	push      ebx
-	mov       eax,  [esp+8]
-	movdqa    xmm0, [eax]
-	movdqa    xmm1, [eax+16]
+	;push      ebx
+	;mov       eax,  [esp+8]
+	%ifdef X86_32
+	push r3	; r3 is used as scratch below; only the X86_32 build saves and restores it
+	%assign push_num 1
+	%else
+	%assign push_num 0
+	%endif
+	LOAD_1_PARA
+	movdqa    xmm0, [r0]	; the sixteen 16-bit coefficients addressed by r0
+	movdqa    xmm1, [r0+16]
 
 	packsswb  xmm0, xmm1
-
+	; register map (new name - register used by the legacy code): r0 - eax, r1 - ebx, r2 - ecx, r3 - edx
+	xor r3, r3
     pxor      xmm3, xmm3
     pcmpeqb   xmm0, xmm3
-    pmovmskb  edx,  xmm0
+    pmovmskb  r3d,  xmm0	; one mask bit per coefficient, set where the packed byte is zero
 
-    xor       edx,  0xffff
+    xor       r3,  0xffff	; invert: bit i is now set where coefficient i is non-zero
 
-	xor       eax,  eax
-	mov       ecx,  7
-	mov       ebx,  8
+	xor       r0,  r0	; r0b accumulates the result
+	mov       r2,  7
+	mov       r1,  8
 .loop_low8_find1:
-	bt        edx,  ecx
+	bt        r3,  r2	; scan mask bits 7 down to 1 for the first set bit (r2 ends at 0 if none is found)
 	jc        .loop_high8_find1
-	loop      .loop_low8_find1
+	dec		  r2	; dec/jnz replaces the legacy 'loop' instruction
+	jnz      .loop_low8_find1
 .loop_high8_find1:
-	bt        edx, ebx
+	bt        r3, r1	; scan mask bits 8 up to 15 for the first set bit (r1 ends at 16 if none is found)
 	jc        .find1end
-	inc       ebx
-	cmp       ebx,16
+	inc       r1
+	cmp       r1,16
 	jb        .loop_high8_find1
 .find1end:
-	sub       ebx, ecx
-	sub       ebx, 1
-	add       al,  [i_ds_table+ebx]
-	mov       ebx, edx
-	and       edx, 0xff
-	shr       ebx, 8
-	and       ebx, 0xff
-	add       al,  [low_mask_table +edx]
-	add       al,  [high_mask_table+ebx]
-
-	pop       ebx
+	sub       r1, r2
+	sub       r1, 1	; r1 = r1 - r2 - 1, used as the index into i_ds_table
+	lea	  r2,  [i_ds_table]
+	add       r0b,  [r2+r1]
+	mov       r1, r3
+	and       r3, 0xff	; r3 = low byte of the mask
+	shr       r1, 8
+	and       r1, 0xff	; r1 = high byte of the mask
+	lea	  r2 , [low_mask_table]
+	add       r0b,  [r2 +r3]
+	lea	  r2, [high_mask_table]
+	add       r0b,  [r2+r1]
+	%ifdef X86_32
+	pop r3
+	%else
+	mov retrd, r0d	; copy the result into the return register (not done on X86_32, where r0 is presumably already the return register)
+	%endif
+	;pop       ebx
 	ret
 
 
@@ -304,21 +331,29 @@
 ALIGN 16
 WELS_EXTERN WelsGetNoneZeroCount_sse2
 WelsGetNoneZeroCount_sse2:
-	mov       eax,  [esp+4]
-	movdqa    xmm0, [eax]
-	movdqa    xmm1, [eax+16]
+	%assign push_num 0
+	LOAD_1_PARA
+	;mov       eax,  [esp+4]
+	movdqa    xmm0, [r0]	; the sixteen 16-bit values addressed by r0
+	movdqa    xmm1, [r0+16]
 	pxor      xmm2, xmm2
 	pcmpeqw   xmm0, xmm2
 	pcmpeqw   xmm1, xmm2
 	packsswb  xmm1, xmm0
-	pmovmskb  edx,  xmm1
-	xor       edx,  0xffff
-	mov       ecx,  edx
-	and       edx,  0xff
-	shr       ecx,  8
+	xor r1, r1
+	pmovmskb  r1d,  xmm1	; 16-bit mask, one bit per value, set where the value is zero
+	xor       r1d,  0xffff	; invert: a bit is now set where the value is non-zero
+	mov       r2,  r1
+	and       r1,  0xff	; r1 = low byte of the mask
+	shr       r2,  8	; r2 = high byte of the mask
 ;	and       ecx,  0xff	; we do not need this due to high 16bits equal to 0 yet
-	xor       eax,  eax
-	add       al,  [nozero_count_table+ecx]
-	add       al,  [nozero_count_table+edx]
+;	xor       retr,  retr
+	;add       al,  [nozero_count_table+r2]
+	lea 	  r0 , [nozero_count_table]
+	movzx	  r2, byte [r0+r2]	; table entry for the high mask byte
+	movzx	  r1,   byte [r0+r1]	; table entry for the low mask byte
+	mov	  retrq, r2	; return value = sum of the two table entries
+	add	  retrq, r1
+	;add       al,  [nozero_count_table+r1]
 	ret
 
--- a/codec/encoder/core/asm/vaa.asm
+++ /dev/null
@@ -1,403 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2010-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*	vaa.asm
-;*
-;*	Abstract
-;*      sse2 for pVaa routines
-;*
-;*  History
-;*      04/14/2010	Created
-;*		06/07/2010	Added AnalysisVaaInfoIntra_sse2(ssse3)
-;*		06/10/2010	Tune rc_sad_frame_sse2 and got about 40% improvement
-;*		08/11/2010	Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-; by comparing it outperforms than phaddw(SSSE3) sets
-%macro SUM_WORD_8x2_SSE2	2	; dst(pSrc), tmp
-	; @sum_8x2 begin
-	pshufd %2, %1, 04Eh	; 01001110 B
-	paddw %1, %2
-	pshuflw %2, %1, 04Eh	; 01001110 B
-	paddw %1, %2
-	pshuflw %2, %1, 0B1h	; 10110001 B
-	paddw %1, %2
-	; end of @sum_8x2
-%endmacro	; END of SUM_WORD_8x2_SSE2
-
-
-%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
-	movdqa %1, [esi    ]	; line 0
-	movdqa %2, [esi+ecx]	; line 1
-	movdqa %3, %1
-	punpcklbw %1, xmm7
-	punpckhbw %3, xmm7
-	movdqa %4, %2
-	punpcklbw %4, xmm7
-	punpckhbw %2, xmm7
-	paddw %1, %4
-	paddw %2, %3
-	movdqa %3, [esi+ebx]	; line 2
-	movdqa %4, [esi+edx]	; line 3
-	movdqa %5, %3
-	punpcklbw %3, xmm7
-	punpckhbw %5, xmm7
-	movdqa %6, %4
-	punpcklbw %6, xmm7
-	punpckhbw %4, xmm7
-	paddw %3, %6
-	paddw %4, %5
-	paddw %1, %3	; block 0, 1
-	paddw %2, %4	; block 2, 3
-	pshufd %3, %1, 0B1h
-	pshufd %4, %2, 0B1h
-	paddw %1, %3
-	paddw %2, %4
-	movdqa %3, %1
-	movdqa %4, %2
-	pshuflw %5, %1, 0B1h
-	pshufhw %6, %3, 0B1h
-	paddw %1, %5
-	paddw %3, %6
-	pshuflw %5, %2, 0B1h
-	pshufhw %6, %4, 0B1h
-	paddw %2, %5
-	paddw %4, %6
-	punpcklwd %1, %2
-	punpckhwd %3, %4
-	punpcklwd %1, %3
-	psraw %1, $4
-%endmacro
-
-%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
-	movdqa %1, [esi    ]	; line 0
-	movdqa %2, [esi+ecx]	; line 1
-	movdqa %3, %1
-	punpcklbw %1, xmm7
-	punpckhbw %3, xmm7
-	movdqa %4, %2
-	punpcklbw %4, xmm7
-	punpckhbw %2, xmm7
-	paddw %1, %4
-	paddw %2, %3
-	movdqa %3, [esi+ebx]	; line 2
-	movdqa %4, [esi+edx]	; line 3
-	movdqa %5, %3
-	punpcklbw %3, xmm7
-	punpckhbw %5, xmm7
-	movdqa %6, %4
-	punpcklbw %6, xmm7
-	punpckhbw %4, xmm7
-	paddw %3, %6
-	paddw %4, %5
-	paddw %1, %3	; block 0, 1
-	paddw %2, %4	; block 2, 3
-	phaddw %1, %2	; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
-	phaddw %1, xmm7	; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
-	psraw %1, $4
-%endmacro
-
-
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-;SECTION .rodata align=16
-
-;ALIGN 16
-;pack1_8x2:
-;	dw 1, 1, 1, 1, 1, 1, 1, 1
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-; , 6/7/2010
-
-WELS_EXTERN AnalysisVaaInfoIntra_sse2
-;***********************************************************************
-;	int32_t AnalysisVaaInfoIntra_sse2(	uint8_t *pDataY, const int32_t iLineSize );
-;***********************************************************************
-ALIGN 16
-AnalysisVaaInfoIntra_sse2:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov ebp, esp
-	and ebp, 0fh
-	sub esp, ebp
-	sub esp, 32
-	%define PUSH_SIZE	52	; 20 + 32
-
-	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
-	mov ecx, [esp+ebp+PUSH_SIZE+8]	; iLineSize
-
-	mov ebx, ecx
-	sal ebx, $1			; iLineSize x 2 [ebx]
-	mov edx, ebx
-	add edx, ecx		; iLineSize x 3 [edx]
-	mov eax, ebx
-	sal eax, $1			; iLineSize x 4 [eax]
-
-	pxor xmm7, xmm7
-
-	; loops
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+8], xmm0
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+24], xmm0
-
-	movdqa xmm0, [esp]		; block 0~7
-	movdqa xmm1, [esp+16]	; block 8~15
-	movdqa xmm2, xmm0
-	paddw xmm0, xmm1
-	SUM_WORD_8x2_SSE2 xmm0, xmm3
-
-	pmullw xmm1, xmm1
-	pmullw xmm2, xmm2
-	movdqa xmm3, xmm1
-	movdqa xmm4, xmm2
-	punpcklwd xmm1, xmm7
-	punpckhwd xmm3, xmm7
-	punpcklwd xmm2, xmm7
-	punpckhwd xmm4, xmm7
-	paddd xmm1, xmm2
-	paddd xmm3, xmm4
-	paddd xmm1, xmm3
-	pshufd xmm2, xmm1, 01Bh
-	paddd xmm1, xmm2
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
-
-	movd ebx, xmm0
-	and ebx, 0ffffh		; effective low word truncated
-	mov ecx, ebx
-	imul ebx, ecx
-	sar ebx, $4
-	movd eax, xmm1
-	sub eax, ebx
-
-	%undef PUSH_SIZE
-	add esp, 32
-	add esp, ebp
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-WELS_EXTERN AnalysisVaaInfoIntra_ssse3
-;***********************************************************************
-;	int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t iLineSize );
-;***********************************************************************
-ALIGN 16
-AnalysisVaaInfoIntra_ssse3:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov ebp, esp
-	and ebp, 0fh
-	sub esp, ebp
-	sub esp, 32
-	%define PUSH_SIZE	52	; 20 + 32
-
-	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
-	mov ecx, [esp+ebp+PUSH_SIZE+8]	; iLineSize
-
-	mov ebx, ecx
-	sal ebx, $1			; iLineSize x 2 [ebx]
-	mov edx, ebx
-	add edx, ecx		; iLineSize x 3 [edx]
-	mov eax, ebx
-	sal eax, $1			; iLineSize x 4 [eax]
-
-	pxor xmm7, xmm7
-
-	; loops
-	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
-	movq [esp+8], xmm1
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
-	movq [esp+24], xmm1
-
-	movdqa xmm0, [esp]		; block 0~7
-	movdqa xmm1, [esp+16]	; block 8~15
-	movdqa xmm2, xmm0
-	paddw xmm0, xmm1
-	SUM_WORD_8x2_SSE2 xmm0, xmm3	; better performance than that of phaddw sets
-
-	pmullw xmm1, xmm1
-	pmullw xmm2, xmm2
-	movdqa xmm3, xmm1
-	movdqa xmm4, xmm2
-	punpcklwd xmm1, xmm7
-	punpckhwd xmm3, xmm7
-	punpcklwd xmm2, xmm7
-	punpckhwd xmm4, xmm7
-	paddd xmm1, xmm2
-	paddd xmm3, xmm4
-	paddd xmm1, xmm3
-	pshufd xmm2, xmm1, 01Bh
-	paddd xmm1, xmm2
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
-
-	movd ebx, xmm0
-	and ebx, 0ffffh		; effective low work truncated
-	mov ecx, ebx
-	imul ebx, ecx
-	sar ebx, $4
-	movd eax, xmm1
-	sub eax, ebx
-
-	%undef PUSH_SIZE
-	add esp, 32
-	add esp, ebp
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-WELS_EXTERN MdInterAnalysisVaaInfo_sse41
-;***********************************************************************
-;	uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
-;***********************************************************************
-ALIGN 16
-MdInterAnalysisVaaInfo_sse41:
-	mov eax, [esp+4]
-	movdqa xmm0, [eax]	; load 4 sad_8x8
-	pshufd xmm1, xmm0, 01Bh
-	paddd xmm1, xmm0
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
-	psrad xmm1, 02h		; iAverageSad
-	movdqa xmm2, xmm1
-	psrad xmm2, 06h
-	movdqa xmm3, xmm0	; iSadBlock
-	psrad xmm3, 06h
-	psubd xmm3, xmm2
-	pmulld xmm3, xmm3	; [comment]: pmulld from SSE4.1 instruction sets
-	pshufd xmm4, xmm3, 01Bh
-	paddd xmm4, xmm3
-	pshufd xmm3, xmm4, 0B1h
-	paddd xmm3, xmm4
-	movd eax, xmm3
-	cmp eax, 20	; INTER_VARIANCE_SAD_THRESHOLD
-	jb near .threshold_exit
-	pshufd xmm0, xmm0, 0B1h
-	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
-	movmskps eax, xmm0
-	ret
-.threshold_exit:
-	mov eax, 15
-	ret
-
-WELS_EXTERN MdInterAnalysisVaaInfo_sse2
-;***********************************************************************
-;	uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
-;***********************************************************************
-ALIGN 16
-MdInterAnalysisVaaInfo_sse2:
-	mov eax, [esp+4]
-	movdqa xmm0, [eax]	; load 4 sad_8x8
-	pshufd xmm1, xmm0, 01Bh
-	paddd xmm1, xmm0
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
-	psrad xmm1, 02h		; iAverageSad
-	movdqa xmm2, xmm1
-	psrad xmm2, 06h
-	movdqa xmm3, xmm0	; iSadBlock
-	psrad xmm3, 06h
-	psubd xmm3, xmm2
-
-	; to replace pmulld functionality as below
-	movdqa xmm2, xmm3
-	pmuludq xmm2, xmm3
-	pshufd xmm4, xmm3, 0B1h
-	pmuludq xmm4, xmm4
-	movdqa xmm5, xmm2
-	punpckldq xmm5, xmm4
-	punpckhdq xmm2, xmm4
-	punpcklqdq xmm5, xmm2
-
-	pshufd xmm4, xmm5, 01Bh
-	paddd xmm4, xmm5
-	pshufd xmm5, xmm4, 0B1h
-	paddd xmm5, xmm4
-	movd eax, xmm5
-	cmp eax, 20	; INTER_VARIANCE_SAD_THRESHOLD
-	jb near .threshold_exit
-	pshufd xmm0, xmm0, 0B1h
-	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
-	movmskps eax, xmm0
-	ret
-.threshold_exit:
-	mov eax, 15
-	ret
--- a/codec/encoder/core/inc/mc.h
+++ b/codec/encoder/core/inc/mc.h
@@ -61,15 +61,15 @@
 void McCopyWidthEq8_mmx (uint8_t*, int32_t, uint8_t*, int32_t, int32_t);
 void PixelAvgWidthEq8_mmx (uint8_t*,  int32_t, uint8_t*, int32_t, uint8_t*, int32_t, int32_t);
 
-void McHorVer20_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+void McHorVer20Width9Or17_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
                       int32_t iHeight);
-void McHorVer02_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+void McHorVer02Height9Or17_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
                       int32_t iHeight);
 void McHorVer22HorFirst_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride, int32_t iWidth,
                               int32_t iHeight);
-void McHorVer22VerLastAlign_sse2 (uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+void McHorVer22Width8VerLastAlign_sse2 (uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
                                   int32_t iHeight);
-void McHorVer22VerLastUnAlign_sse2 (uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
+void McHorVer22Width8VerLastUnAlign_sse2 (uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
                                     int32_t iWidth, int32_t iHeight);
 void McChromaWidthEq8_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, const uint8_t* kpABCD,
                             int32_t iHeigh);
@@ -80,8 +80,6 @@
                                     int32_t iHeight);
 void PixelAvgWidthEq16_sse2 (uint8_t*,  int32_t, uint8_t*, int32_t, uint8_t*, int32_t, int32_t);
 
-
-void PixelAvgWidthEq16_ssse3 (uint8_t*,  int32_t, uint8_t*, int32_t, uint8_t*, int32_t, int32_t);
 void McChromaWidthEq8_ssse3 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                              const uint8_t* kpABCD, int32_t iHeigh);
 
--- a/codec/encoder/core/inc/sample.h
+++ b/codec/encoder/core/inc/sample.h
@@ -107,9 +107,6 @@
 int32_t WelsIntra16x16Combined3Sad_ssse3 (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
 int32_t WelsIntraChroma8x8Combined3Satd_sse41 (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*,
     uint8_t*, uint8_t*);
-int32_t WelsIntraChroma8x8Combined3Sad_ssse3 (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*,
-    uint8_t*, uint8_t*);
-
 
 #endif//X86_ASM
 
--- a/codec/encoder/core/src/deblocking.cpp
+++ b/codec/encoder/core/src/deblocking.cpp
@@ -994,15 +994,15 @@
 
 
 #ifdef X86_ASM
-  if (iCpu & WELS_CPU_SSE2) {
+  if (iCpu & WELS_CPU_SSE2) {	  
     pFunc->pfLumaDeblockingLT4Ver	= DeblockLumaLt4V_sse2;
     pFunc->pfLumaDeblockingEQ4Ver	= DeblockLumaEq4V_sse2;
-    pFunc->pfLumaDeblockingLT4Hor   = DeblockLumaLt4H_sse2;
-    pFunc->pfLumaDeblockingEQ4Hor   = DeblockLumaEq4H_sse2;
+    pFunc->pfLumaDeblockingLT4Hor       = DeblockLumaLt4H_sse2;
+    pFunc->pfLumaDeblockingEQ4Hor       = DeblockLumaEq4H_sse2;
     pFunc->pfChromaDeblockingLT4Ver	= DeblockChromaLt4V_sse2;
     pFunc->pfChromaDeblockingEQ4Ver	= DeblockChromaEq4V_sse2;
     pFunc->pfChromaDeblockingLT4Hor	= DeblockChromaLt4H_sse2;
-    pFunc->pfChromaDeblockinEQ4Hor	= DeblockChromaEq4H_sse2;
+    pFunc->pfChromaDeblockinEQ4Hor	= DeblockChromaEq4H_sse2; 
   }
 #endif
 }
--- a/codec/encoder/core/src/decode_mb_aux.cpp
+++ b/codec/encoder/core/src/decode_mb_aux.cpp
@@ -261,15 +261,15 @@
 
 #if defined(X86_ASM)
   if (uiCpuFlag & WELS_CPU_MMXEXT) {
-    pFuncList->pfIDctT4		= WelsIDctT4Rec_mmx;
+  //  pFuncList->pfIDctT4		= WelsIDctT4Rec_mmx;
   }
   if (uiCpuFlag & WELS_CPU_SSE2) {
-    pFuncList->pfDequantization4x4			= WelsDequant4x4_sse2;
+   /* pFuncList->pfDequantization4x4			= WelsDequant4x4_sse2;
     pFuncList->pfDequantizationFour4x4		= WelsDequantFour4x4_sse2;
     pFuncList->pfDequantizationIHadamard4x4	= WelsDequantIHadamard4x4_sse2;
 
     pFuncList->pfIDctFourT4		= WelsIDctFourT4Rec_sse2;
-    pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_sse2;
+    pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_sse2;*/
   }
 #endif//X86_ASM
 }
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -569,7 +569,7 @@
   }
 //#ifndef MACOS
   if (uiCpuFlag & WELS_CPU_SSSE3) {
-    pFuncList->pfScan4x4				= WelsScan4x4DcAc_ssse3;
+  //  pFuncList->pfScan4x4				= WelsScan4x4DcAc_ssse3;
   }
 
 //#endif//MACOS
--- a/codec/encoder/core/src/encoder_ext.cpp
+++ b/codec/encoder/core/src/encoder_ext.cpp
@@ -1919,7 +1919,7 @@
            uiCpuCores,
            iCacheLineSize);
 
-#ifdef _DEBUG	// output at console & _debug
+//#ifdef _DEBUG	// output at console & _debug
   fprintf (stderr, "WELS CPU features/capacities (0x%x) detected: \n"	\
            "HTT:      %c, "	\
            "MMX:      %c, "	\
@@ -1962,7 +1962,7 @@
            (uiCpuFeatureFlags & WELS_CPU_AES) ? 'Y' : 'N',
            uiCpuCores,
            iCacheLineSize);
-#endif//_DEBUG
+//#endif//_DEBUG
 }
 
 /*!
--- a/codec/encoder/core/src/expand_pic.cpp
+++ b/codec/encoder/core/src/expand_pic.cpp
@@ -29,14 +29,12 @@
  *     POSSIBILITY OF SUCH DAMAGE.
  *
  */
-
 #include <string.h>
 #include "expand_pic.h"
 #include "cpu_core.h"
 #include "wels_func_ptr_def.h"
 
-namespace WelsSVCEnc {
-
+namespace WelsSVCEnc{
 // rewrite it (split into luma & chroma) that is helpful for mmx/sse2 optimization perform, 9/27/2009
 static inline void ExpandPictureLuma_c (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW,
                                         const int32_t kiPicH) {
@@ -144,6 +142,8 @@
   const int32_t kiWidthUV	= kiWidthY >> 1;
   const int32_t kiHeightUV	= kiHeightY >> 1;
 
+
+
   pExpLuma (pPicY, pPic->iLineSize[0], kiWidthY, kiHeightY);
   if (kiWidthUV >= 16) {
     // fix coding picture size as 16x16
@@ -155,6 +155,7 @@
     ExpandPictureChroma_c (pPicCb, pPic->iLineSize[1], kiWidthUV, kiHeightUV);
     ExpandPictureChroma_c (pPicCr, pPic->iLineSize[2], kiWidthUV, kiHeightUV);
   }
+
 }
 
 }
--- a/codec/encoder/core/src/get_intra_predictor.cpp
+++ b/codec/encoder/core/src/get_intra_predictor.cpp
@@ -80,13 +80,13 @@
 
 #if defined(X86_ASM)
   if (kuiCpuFlag & WELS_CPU_MMXEXT) {
-    WelsFillingPred8to16		= WelsFillingPred8to16_mmx;
-    WelsFillingPred8x2to16	    = WelsFillingPred8x2to16_mmx;
-    WelsFillingPred1to16		= WelsFillingPred1to16_mmx;
+  //  WelsFillingPred8to16		= WelsFillingPred8to16_mmx;
+  //  WelsFillingPred8x2to16	    = WelsFillingPred8x2to16_mmx;
+  //  WelsFillingPred1to16		= WelsFillingPred1to16_mmx;
   }
   if (kuiCpuFlag & WELS_CPU_SSE2) {
-    WelsFillingPred8x2to16	    = WelsFillingPred8x2to16_sse2;
-    WelsFillingPred1to16		= WelsFillingPred1to16_sse2;
+   // WelsFillingPred8x2to16	    = WelsFillingPred8x2to16_sse2;
+   // WelsFillingPred1to16		= WelsFillingPred1to16_sse2;
   }
 #endif//X86_ASM
 }
--- a/codec/encoder/core/src/mc.cpp
+++ b/codec/encoder/core/src/mc.cpp
@@ -426,7 +426,7 @@
     int32_t iHeight) {
   ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 21, 8, 16)
   McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 16, iHeight + 5);
-  McHorVer22VerLastAlign_sse2 ((uint8_t*)pTap, 16, pDst, iDstStride, 8, iHeight);
+  McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)pTap, 16, pDst, iDstStride, 8, iHeight);
 }
 
 //2010.2.5
@@ -441,13 +441,13 @@
   McHorVer22WidthEq8_sse2 (pSrc,     iSrcStride, pDst,     iDstStride, iHeight);
   McHorVer22WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
 }
-void McHorVer22_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+void McHorVer22Width9Or17Height9Or17_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
                       int32_t iHeight) {
   ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16)
   int32_t tmp1 = 2 * (iWidth - 8);
   McHorVer22HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5);
-  McHorVer22VerLastAlign_sse2 ((uint8_t*)pTap,  48, pDst, iDstStride, iWidth - 1, iHeight);
-  McHorVer22VerLastUnAlign_sse2 ((uint8_t*)pTap + tmp1,  48, pDst + iWidth - 8, iDstStride, 8, iHeight);
+  McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)pTap,  48, pDst, iDstStride, iWidth - 1, iHeight);
+  McHorVer22Width8VerLastUnAlign_sse2 ((uint8_t*)pTap + tmp1,  48, pDst + iWidth - 8, iDstStride, 8, iHeight);
 }
 
 typedef void (*McChromaWidthEqx) (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
@@ -523,9 +523,9 @@
   pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16;
 #if defined (X86_ASM)
   if (uiCpuFlag & WELS_CPU_SSE2) {
-    pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20_sse2;
-    pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02_sse2;
-    pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22_sse2;
+    pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_sse2;
+    pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_sse2;
+    pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_sse2;
     pFuncList->sMcFuncs.pfSampleAveraging[0] = PixelAvgWidthEq8_mmx;
     pFuncList->sMcFuncs.pfSampleAveraging[1] = PixelAvgWidthEq16_sse2;
     pFuncList->sMcFuncs.pfChromaMc = McChroma_sse2;
@@ -541,7 +541,6 @@
 
   if (uiCpuFlag & WELS_CPU_SSSE3) {
     pFuncList->sMcFuncs.pfChromaMc = McChroma_ssse3;
-    pFuncList->sMcFuncs.pfSampleAveraging[1] = PixelAvgWidthEq16_ssse3;
   }
 
 #endif //(X86_ASM)
--- a/codec/encoder/core/src/md.cpp
+++ b/codec/encoder/core/src/md.cpp
@@ -439,7 +439,7 @@
   return (uiMbSign);
 }
 
-static inline int32_t AnalysisVaaInfoIntra_c (uint8_t* pDataY, const int32_t kiLineSize) {
+int32_t AnalysisVaaInfoIntra_c (uint8_t* pDataY, const int32_t kiLineSize) {
   ENFORCE_STACK_ALIGN_1D (uint16_t, uiAvgBlock, 16, 16)
   uint16_t* pBlock = &uiAvgBlock[0];
   uint8_t* pEncData	= pDataY;
--- a/codec/encoder/core/src/sample.cpp
+++ b/codec/encoder/core/src/sample.cpp
@@ -465,11 +465,11 @@
     pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_sse2;
     pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_sse2;
     pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_sse2;
-    pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd =  WelsSmpleSatdThree4x4_sse2;
+    //pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd =  WelsSmpleSatdThree4x4_sse2;
   }
 
   if (uiCpuFlag & WELS_CPU_SSSE3) {
-    pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_ssse3;
+    //pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_ssse3;
   }
 
   if (uiCpuFlag & WELS_CPU_SSE41) {
@@ -478,8 +478,8 @@
     pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16] = WelsSampleSatd8x16_sse41;
     pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8] = WelsSampleSatd8x8_sse41;
     pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4] = WelsSampleSatd4x4_sse41;
-    pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_sse41;
-    pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntraChroma8x8Combined3Satd_sse41;
+    //pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_sse41;
+    //pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntraChroma8x8Combined3Satd_sse41;
   }
 
 #endif //(X86_ASM)
--- a/codec/encoder/core/src/set_mb_syn_cavlc.cpp
+++ b/codec/encoder/core/src/set_mb_syn_cavlc.cpp
@@ -207,7 +207,7 @@
 
 #if defined(X86_ASM)
   if (uiCpuFlag & WELS_CPU_SSE2) {
-    sCoeffFunc.pfCavlcParamCal = CavlcParamCal_sse2;
+   // sCoeffFunc.pfCavlcParamCal = CavlcParamCal_sse2;
   }
 #endif
 }
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -38,22 +38,13 @@
 ENCODER_OBJS += $(ENCODER_CPP_SRCS:.cpp=.o)
 ifeq ($(USE_ASM), Yes)
 ENCODER_ASM_SRCS=\
-	$(ENCODER_SRCDIR)/./core/asm/asm_inc.asm\
 	$(ENCODER_SRCDIR)/./core/asm/coeff.asm\
-	$(ENCODER_SRCDIR)/./core/asm/cpuid.asm\
 	$(ENCODER_SRCDIR)/./core/asm/dct.asm\
-	$(ENCODER_SRCDIR)/./core/asm/deblock.asm\
-	$(ENCODER_SRCDIR)/./core/asm/expand_picture.asm\
 	$(ENCODER_SRCDIR)/./core/asm/intra_pred.asm\
-	$(ENCODER_SRCDIR)/./core/asm/intra_pred_util.asm\
-	$(ENCODER_SRCDIR)/./core/asm/mb_copy.asm\
-	$(ENCODER_SRCDIR)/./core/asm/mc_chroma.asm\
-	$(ENCODER_SRCDIR)/./core/asm/mc_luma.asm\
 	$(ENCODER_SRCDIR)/./core/asm/memzero.asm\
 	$(ENCODER_SRCDIR)/./core/asm/quant.asm\
 	$(ENCODER_SRCDIR)/./core/asm/satd_sad.asm\
 	$(ENCODER_SRCDIR)/./core/asm/score.asm\
-	$(ENCODER_SRCDIR)/./core/asm/vaa.asm\
 
 ENCODER_OBJS += $(ENCODER_ASM_SRCS:.asm=.o)
 endif
@@ -158,39 +149,15 @@
 $(ENCODER_SRCDIR)/./plus/src/welsEncoderExt.o: $(ENCODER_SRCDIR)/./plus/src/welsEncoderExt.cpp
 	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(ENCODER_CFLAGS) $(ENCODER_INCLUDES) -c -o $(ENCODER_SRCDIR)/./plus/src/welsEncoderExt.o $(ENCODER_SRCDIR)/./plus/src/welsEncoderExt.cpp
 
-$(ENCODER_SRCDIR)/./core/asm/asm_inc.o: $(ENCODER_SRCDIR)/./core/asm/asm_inc.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/asm_inc.o $(ENCODER_SRCDIR)/./core/asm/asm_inc.asm
-
 $(ENCODER_SRCDIR)/./core/asm/coeff.o: $(ENCODER_SRCDIR)/./core/asm/coeff.asm
 	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/coeff.o $(ENCODER_SRCDIR)/./core/asm/coeff.asm
 
-$(ENCODER_SRCDIR)/./core/asm/cpuid.o: $(ENCODER_SRCDIR)/./core/asm/cpuid.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/cpuid.o $(ENCODER_SRCDIR)/./core/asm/cpuid.asm
-
 $(ENCODER_SRCDIR)/./core/asm/dct.o: $(ENCODER_SRCDIR)/./core/asm/dct.asm
 	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/dct.o $(ENCODER_SRCDIR)/./core/asm/dct.asm
 
-$(ENCODER_SRCDIR)/./core/asm/deblock.o: $(ENCODER_SRCDIR)/./core/asm/deblock.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/deblock.o $(ENCODER_SRCDIR)/./core/asm/deblock.asm
-
-$(ENCODER_SRCDIR)/./core/asm/expand_picture.o: $(ENCODER_SRCDIR)/./core/asm/expand_picture.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/expand_picture.o $(ENCODER_SRCDIR)/./core/asm/expand_picture.asm
-
 $(ENCODER_SRCDIR)/./core/asm/intra_pred.o: $(ENCODER_SRCDIR)/./core/asm/intra_pred.asm
 	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/intra_pred.o $(ENCODER_SRCDIR)/./core/asm/intra_pred.asm
 
-$(ENCODER_SRCDIR)/./core/asm/intra_pred_util.o: $(ENCODER_SRCDIR)/./core/asm/intra_pred_util.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/intra_pred_util.o $(ENCODER_SRCDIR)/./core/asm/intra_pred_util.asm
-
-$(ENCODER_SRCDIR)/./core/asm/mb_copy.o: $(ENCODER_SRCDIR)/./core/asm/mb_copy.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/mb_copy.o $(ENCODER_SRCDIR)/./core/asm/mb_copy.asm
-
-$(ENCODER_SRCDIR)/./core/asm/mc_chroma.o: $(ENCODER_SRCDIR)/./core/asm/mc_chroma.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/mc_chroma.o $(ENCODER_SRCDIR)/./core/asm/mc_chroma.asm
-
-$(ENCODER_SRCDIR)/./core/asm/mc_luma.o: $(ENCODER_SRCDIR)/./core/asm/mc_luma.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/mc_luma.o $(ENCODER_SRCDIR)/./core/asm/mc_luma.asm
-
 $(ENCODER_SRCDIR)/./core/asm/memzero.o: $(ENCODER_SRCDIR)/./core/asm/memzero.asm
 	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/memzero.o $(ENCODER_SRCDIR)/./core/asm/memzero.asm
 
@@ -202,9 +169,6 @@
 
 $(ENCODER_SRCDIR)/./core/asm/score.o: $(ENCODER_SRCDIR)/./core/asm/score.asm
 	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/score.o $(ENCODER_SRCDIR)/./core/asm/score.asm
-
-$(ENCODER_SRCDIR)/./core/asm/vaa.o: $(ENCODER_SRCDIR)/./core/asm/vaa.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/vaa.o $(ENCODER_SRCDIR)/./core/asm/vaa.asm
 
 $(LIBPREFIX)encoder.$(LIBSUFFIX): $(ENCODER_OBJS)
 	rm -f $(LIBPREFIX)encoder.$(LIBSUFFIX)
--- /dev/null
+++ b/codec/processing/build/linux/makefile
@@ -1,0 +1,117 @@
+# Builds the Wels video-processing shared library (libwelsvp.so) with gcc and,
+# when NASM is 1, nasm for the .asm files found in the source directories.
+NASM = 1
+NAME      = libwelsvp
+
+# NOTE(review): these paths are relative to codec/processing/build/linux, so
+# three levels up is codec/ rather than the repository root -- confirm that
+# OUTDIR and TESTDIR still point at the intended bin/ and testbin/ folders.
+OUTDIR    = ../../../bin/linux
+TESTDIR   = ../../../testbin
+BINDIR    = ../../bin
+OBJDIR    = ../../obj
+SRCDIRS   = ../../src/asm \
+            ../../src/common \
+            ../../src/adaptivequantization \
+            ../../src/backgounddetection \
+            ../../src/denoise \
+            ../../src/downsample \
+            ../../src/scenechangedetection \
+            ../../src/vaacalc \
+            ../../src/complexityanalysis
+SRCDIRS  += ../../src/imagerotate
+
+
+TARGETLIB =  $(BINDIR)/$(NAME).so
+
+# Plain tool names are looked up through PATH when a recipe runs, so no shell
+# has to be forked at expansion time and the value can never come back empty.
+CC        := gcc
+AS        := nasm
+GCC       = gcc -m32
+
+CPPFLAGS  = -Wall -g -O3
+ifeq ($(NASM), 1)
+CPPFLAGS += -DX86_ASM
+endif
+# NOTE(review): the Visual Studio project for this library also passes the
+# codec/common directory as a nasm include path and defines X86_32 for most
+# .asm files -- confirm whether the same is required for this build.
+ASMFLAGS  = -f elf -DNOPREFIX  -I ../../src/asm/
+# Linker options belong in LDFLAGS; libraries are kept in LDLIBS so that they
+# can be placed after the object files on the link line.
+LDFLAGS   =
+LDLIBS    = -lstdc++ -ldl
+
+SRCEXTS  = .cpp
+ifeq ($(NASM), 1)
+SRCEXTS += .asm
+endif
+HDREXTS  = .h
+# Simple assignment: the directory scan is done once instead of on every use.
+SOURCES  := $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS))))
+HEADERS  := $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(HDREXTS))))
+SRC_CPP  := $(filter %.cpp,$(SOURCES))
+SRC_ASM  := $(filter %.asm,$(SOURCES))
+OBJS     := $(addsuffix .o, $(basename $(SOURCES)))
+DEPS     := $(OBJS:.o=.d)
+
+# Left lazily expanded so the compiler is probed only when a .d file is made.
+DEP_OPT  = $(shell if $(CC) --version | grep "GCC" >/dev/null; then \
+                  echo "-MM -MP"; else echo "-M"; fi )
+DEPEND_cpp.d  = $(subst -g ,,$(CC) $(DEP_OPT) $(CPPFLAGS))
+DEPEND_asm.d  = $(subst -g ,,$(AS) $(DEP_OPT) $(ASMFLAGS))
+COMPILE.cpp   = $(GCC) $(CPPFLAGS) -c
+COMPILE.asm   = $(AS)  $(ASMFLAGS)
+LINK          = $(GCC) $(LDFLAGS)
+
+.PHONY: all objs tags ctags clean distclean
+
+.SUFFIXES:
+
+# Delete a target whose recipe failed so that a partially written file is
+# never mistaken for an up-to-date one on the next run.
+.DELETE_ON_ERROR:
+
+all: $(TARGETLIB)
+
+# Dependency files: the source directory is written first without a trailing
+# newline, then the dependency generator's output is appended to it.
+%.d:%.cpp
+	@printf '%s' '$(dir $<)' > $@
+	@$(DEPEND_cpp.d) $< >> $@
+
+%.d:%.asm
+	@printf '%s' '$(dir $<)' > $@
+	@$(DEPEND_asm.d) $< >> $@
+
+objs:$(OBJS)
+
+%.o:%.cpp
+	$(COMPILE.cpp) $< -o $@
+
+%.o:%.asm
+	$(COMPILE.asm) $< -o $@
+
+tags: $(HEADERS) $(SOURCES)
+	etags $(HEADERS) $(SOURCES)
+
+ctags: $(HEADERS) $(SOURCES)
+	ctags $(HEADERS) $(SOURCES)
+
+# Link the shared library, then copy it to the output and test directories.
+$(TARGETLIB):$(OBJS)
+	@mkdir -p $(BINDIR)
+	$(LINK) $(OBJS) -shared -Wl,-Bsymbolic $(LDLIBS) -o $@
+	@echo produce the lib to $(TARGETLIB).
+	@mkdir -p $(OUTDIR) $(TESTDIR)
+	@cp -f $(TARGETLIB) $(OUTDIR)
+	@cp -f $(TARGETLIB) $(TESTDIR)
+	@echo copy the lib to $(OUTDIR).
+
+clean:
+	rm -f $(OBJS) $(TARGETLIB)
+
+distclean: clean
+	rm -f $(DEPS) TAGS
+
--- /dev/null
+++ b/codec/processing/build/win32/WelsVP_2008.sln
@@ -1,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual Studio 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP", "WelsVP_2008.vcproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
binary files /dev/null b/codec/processing/build/win32/WelsVP_2008.suo differ
--- /dev/null
+++ b/codec/processing/build/win32/WelsVP_2008.vcproj
@@ -1,0 +1,846 @@
+<?xml version="1.0" encoding="gb2312"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="WelsVP"
+	ProjectGUID="{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+	RootNamespace="WelsVP"
+	Keyword="Win32Proj"
+	TargetFrameworkVersion="196613"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+		<Platform
+			Name="x64"
+		/>
+	</Platforms>
+	<ToolFiles>
+		<DefaultToolFile
+			FileName="masm.rules"
+		/>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory=".\..\..\..\bin\win32\Debug"
+			IntermediateDirectory=".\..\..\..\obj\vp\Debug"
+			ConfigurationType="2"
+			CharacterSet="1"
+			WholeProgramOptimization="0"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+				CommandLine=""
+			/>
+			<Tool
+				Name="MASM"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories=""
+				PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="1"
+				UsePrecompiledHeader="0"
+				AssemblerListingLocation=""
+				WarningLevel="3"
+				DebugInformationFormat="4"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkLibraryDependencies="true"
+				OutputFile="$(OutDir)\welsvp.dll"
+				LinkIncremental="2"
+				ModuleDefinitionFile="../../src/common/WelsVP.def"
+				GenerateDebugInformation="true"
+				GenerateMapFile="true"
+				MapFileName="$(OutDir)\welsvp.map"
+				SubSystem="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+				CommandLine=""
+			/>
+		</Configuration>
+		<Configuration
+			Name="Debug|x64"
+			OutputDirectory=".\..\..\..\..\bin\win64\Debug"
+			IntermediateDirectory=".\..\..\..\obj\vp\Debug"
+			ConfigurationType="2"
+			CharacterSet="1"
+			WholeProgramOptimization="0"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+				CommandLine=""
+			/>
+			<Tool
+				Name="MASM"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				TargetEnvironment="3"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories=""
+				PreprocessorDefinitions="WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="1"
+				UsePrecompiledHeader="0"
+				AssemblerListingLocation=""
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkLibraryDependencies="true"
+				OutputFile="$(OutDir)\welsvp.dll"
+				LinkIncremental="2"
+				ModuleDefinitionFile="../../src/common/WelsVP.def"
+				GenerateDebugInformation="true"
+				GenerateMapFile="true"
+				MapFileName="$(OutDir)\welsvp.map"
+				SubSystem="2"
+				TargetMachine="17"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+				CommandLine=""
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory=".\..\..\..\bin\win32\Release"
+			IntermediateDirectory=".\..\..\..\obj\vp\Release"
+			ConfigurationType="2"
+			CharacterSet="1"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+				CommandLine=""
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+				CommandLine=""
+			/>
+			<Tool
+				Name="MASM"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="3"
+				EnableIntrinsicFunctions="false"
+				FavorSizeOrSpeed="1"
+				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
+				RuntimeLibrary="0"
+				EnableFunctionLevelLinking="false"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="0"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				OutputFile="$(OutDir)\welsvp.dll"
+				LinkIncremental="1"
+				GenerateManifest="false"
+				EnableUAC="false"
+				ModuleDefinitionFile="../../src/common/WelsVP.def"
+				GenerateDebugInformation="false"
+				GenerateMapFile="false"
+				MapFileName=""
+				MapExports="false"
+				SubSystem="2"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+				CommandLine=""
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|x64"
+			OutputDirectory=".\..\..\..\..\bin\win64\Release"
+			IntermediateDirectory=".\..\..\..\obj\vp\Release"
+			ConfigurationType="2"
+			CharacterSet="1"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+				CommandLine=""
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+				CommandLine=""
+			/>
+			<Tool
+				Name="MASM"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				TargetEnvironment="3"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="3"
+				EnableIntrinsicFunctions="false"
+				FavorSizeOrSpeed="1"
+				PreprocessorDefinitions="WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
+				RuntimeLibrary="0"
+				EnableFunctionLevelLinking="false"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="0"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				OutputFile="$(OutDir)\welsvp.dll"
+				LinkIncremental="1"
+				GenerateManifest="false"
+				EnableUAC="false"
+				ModuleDefinitionFile="../../src/common/WelsVP.def"
+				GenerateDebugInformation="false"
+				GenerateMapFile="false"
+				MapFileName=""
+				MapExports="false"
+				SubSystem="2"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="17"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+				CommandLine=""
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+			>
+			<File
+				RelativePath="..\..\src\common\cpu.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\memory.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\thread.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\util.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\WelsFrameWork.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\WelsFrameWorkEx.cpp"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Interface"
+			Filter="h;hpp;hxx;hm;inl;inc;xsd"
+			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+			>
+			<File
+				RelativePath="..\..\interface\IWelsVP.h"
+				>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|x64"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\src\common\resource.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Resource Files"
+			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
+			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
+			>
+			<File
+				RelativePath="..\..\src\common\WelsVP.def"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\WelsVP.rc"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			>
+			<File
+				RelativePath="..\..\src\common\cpu.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\memory.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\thread.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\typedef.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\util.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\version.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\WelsFrameWork.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="ASM"
+			>
+			<File
+				RelativePath="..\..\..\common\cpuid.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Debug|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\src\asm\denoisefilter.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Debug|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\src\asm\downsample_bilinear.asm"
+				><!-- Win32 nasm commands define X86_32, as denoisefilter.asm, intra_pred.asm and sad.asm do -->
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Debug|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\src\asm\intra_pred.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Debug|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\src\asm\sad.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Debug|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\src\asm\vaa.asm"
+				><!-- Win32 nasm commands define X86_32, as denoisefilter.asm, intra_pred.asm and sad.asm do -->
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Debug|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+		</Filter>
+		<Filter
+			Name="SceneChangeDetection"
+			>
+			<File
+				RelativePath="..\..\src\scenechangedetection\SceneChangeDetection.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\scenechangedetection\SceneChangeDetection.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Denoise"
+			>
+			<File
+				RelativePath="..\..\src\denoise\denoise.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\denoise\denoise.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\denoise\denoise_filter.cpp"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="VAACalc"
+			>
+			<File
+				RelativePath="..\..\src\vaacalc\vaacalcfuncs.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\vaacalc\vaacalculation.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\vaacalc\vaacalculation.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="BackgroundDetection"
+			>
+			<File
+				RelativePath="..\..\src\backgounddetection\BackgroundDetection.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\backgounddetection\BackgroundDetection.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="AdaptiveQuantization"
+			>
+			<File
+				RelativePath="..\..\src\adaptivequantization\AdaptiveQuantization.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\adaptivequantization\AdaptiveQuantization.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Downsample"
+			>
+			<File
+				RelativePath="..\..\src\downsample\downsample.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\downsample\downsample.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\downsample\downsamplefuncs.cpp"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="ComplexityAnalysis"
+			>
+			<File
+				RelativePath="..\..\src\complexityanalysis\ComplexityAnalysis.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\complexityanalysis\ComplexityAnalysis.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="ImageRotate"
+			>
+			<File
+				RelativePath="..\..\src\imagerotate\imagerotate.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\imagerotate\imagerotate.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\imagerotate\imagerotatefuncs.cpp"
+				>
+			</File>
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
--- /dev/null
+++ b/codec/processing/build/win32/WelsVP_2010.sln
@@ -1,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP_2010", "WelsVP_2010.vcxproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
binary files /dev/null b/codec/processing/build/win32/WelsVP_2010.suo differ
--- /dev/null
+++ b/codec/processing/build/win32/WelsVP_2010.vcxproj
@@ -1,0 +1,386 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}</ProjectGuid>
+    <RootNamespace>WelsVP</RootNamespace>
+    <Keyword>Win32Proj</Keyword>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>Unicode</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>Unicode</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>Unicode</CharacterSet>
+    <WholeProgramOptimization>false</WholeProgramOptimization>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>Unicode</CharacterSet>
+    <WholeProgramOptimization>false</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <_ProjectFileVersion>10.0.40219.1</_ProjectFileVersion>
+    <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">..\..\..\..\bin\win32\Debug\</OutDir>
+    <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..\..\..\..\bin\win64\Debug\</OutDir>
+    <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">..\..\..\..\obj\vp\Debug\</IntDir>
+    <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..\..\..\..\obj\vp\Debug\</IntDir>
+    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
+    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</LinkIncremental>
+    <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..\..\..\..\bin\win32\Release\</OutDir>
+    <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\..\..\bin\win64\Release\</OutDir>
+    <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..\..\..\..\obj\vp\Release\</IntDir>
+    <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\..\..\obj\vp\Release\</IntDir>
+    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</LinkIncremental>
+    <GenerateManifest Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</GenerateManifest>
+    <GenerateManifest Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</GenerateManifest>
+    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+    <TargetName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">welsvp</TargetName>
+    <TargetName Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">welsvp</TargetName>
+    <TargetName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">welsvp</TargetName>
+    <TargetName Condition="'$(Configuration)|$(Platform)'=='Release|x64'">welsvp</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <CustomBuildStep>
+      <Command>
+      </Command>
+    </CustomBuildStep>
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MinimalRebuild>true</MinimalRebuild>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <AssemblerListingLocation>
+      </AssemblerListingLocation>
+      <WarningLevel>Level3</WarningLevel>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+    </ClCompile>
+    <ProjectReference>
+      <LinkLibraryDependencies>true</LinkLibraryDependencies>
+    </ProjectReference>
+    <Link>
+      <OutputFile>$(OutDir)welsvp.dll</OutputFile>
+      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <GenerateMapFile>true</GenerateMapFile>
+      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+      <SubSystem>Windows</SubSystem>
+      <TargetMachine>MachineX86</TargetMachine>
+      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+    </Link>
+    <PostBuildEvent>
+      <Command>
+      </Command>
+    </PostBuildEvent>
+    <Bscmake>
+      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <CustomBuildStep>
+      <Command>
+      </Command>
+    </CustomBuildStep>
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN64;_DEBUG;X86_ASM;_WINDOWS;_USRDLL;WELSVP_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <AssemblerListingLocation>
+      </AssemblerListingLocation>
+      <WarningLevel>Level3</WarningLevel>
+      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+    </ClCompile>
+    <ProjectReference>
+      <LinkLibraryDependencies>true</LinkLibraryDependencies>
+    </ProjectReference>
+    <Link>
+      <OutputFile>$(OutDir)welsvp.dll</OutputFile>
+      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <GenerateMapFile>true</GenerateMapFile>
+      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+      <SubSystem>Windows</SubSystem>
+      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+    </Link>
+    <PostBuildEvent>
+      <Command>
+      </Command>
+    </PostBuildEvent>
+    <Bscmake>
+      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <PreBuildEvent>
+      <Command>
+      </Command>
+    </PreBuildEvent>
+    <CustomBuildStep>
+      <Command>
+      </Command>
+    </CustomBuildStep>
+    <ClCompile>
+      <Optimization>Full</Optimization>
+      <IntrinsicFunctions>false</IntrinsicFunctions>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+      <FunctionLevelLinking>false</FunctionLevelLinking>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <DebugInformationFormat>
+      </DebugInformationFormat>
+    </ClCompile>
+    <Link>
+      <OutputFile>$(OutDir)welsvp.dll</OutputFile>
+      <EnableUAC>false</EnableUAC>
+      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <GenerateMapFile>true</GenerateMapFile>
+      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+      <MapExports>true</MapExports>
+      <SubSystem>Windows</SubSystem>
+      <OptimizeReferences>true</OptimizeReferences>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <TargetMachine>MachineX86</TargetMachine>
+      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+    </Link>
+    <PostBuildEvent>
+      <Command>
+      </Command>
+    </PostBuildEvent>
+    <Bscmake>
+      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <PreBuildEvent>
+      <Command>
+      </Command>
+    </PreBuildEvent>
+    <CustomBuildStep>
+      <Command>
+      </Command>
+    </CustomBuildStep>
+    <ClCompile>
+      <Optimization>Full</Optimization>
+      <IntrinsicFunctions>false</IntrinsicFunctions>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <PreprocessorDefinitions>WIN64;NDEBUG;X86_ASM;_WINDOWS;_USRDLL;WELSVP_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+      <FunctionLevelLinking>false</FunctionLevelLinking>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <DebugInformationFormat>
+      </DebugInformationFormat>
+    </ClCompile>
+    <Link>
+      <OutputFile>$(OutDir)welsvp.dll</OutputFile>
+      <EnableUAC>false</EnableUAC>
+      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <GenerateMapFile>true</GenerateMapFile>
+      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+      <MapExports>true</MapExports>
+      <SubSystem>Windows</SubSystem>
+      <OptimizeReferences>true</OptimizeReferences>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+    </Link>
+    <PostBuildEvent>
+      <Command>
+      </Command>
+    </PostBuildEvent>
+    <Bscmake>
+      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\src\common\cpu.cpp" />
+    <ClCompile Include="..\..\src\common\memory.cpp" />
+    <ClCompile Include="..\..\src\common\thread.cpp" />
+    <ClCompile Include="..\..\src\common\util.cpp" />
+    <ClCompile Include="..\..\src\common\WelsFrameWork.cpp" />
+    <ClCompile Include="..\..\src\common\WelsFrameWorkEx.cpp" />
+    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetection.cpp" />
+    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp" />
+    <ClCompile Include="..\..\src\denoise\denoise.cpp" />
+    <ClCompile Include="..\..\src\denoise\denoise_filter.cpp" />
+    <ClCompile Include="..\..\src\vaacalc\vaacalcfuncs.cpp" />
+    <ClCompile Include="..\..\src\vaacalc\vaacalculation.cpp" />
+    <ClCompile Include="..\..\src\backgounddetection\BackgroundDetection.cpp" />
+    <ClCompile Include="..\..\src\adaptivequantization\AdaptiveQuantization.cpp" />
+    <ClCompile Include="..\..\src\downsample\downsample.cpp" />
+    <ClCompile Include="..\..\src\downsample\downsamplefuncs.cpp" />
+    <ClCompile Include="..\..\src\complexityanalysis\ComplexityAnalysis.cpp" />
+    <ClCompile Include="..\..\src\imagerotate\imagerotate.cpp" />
+    <ClCompile Include="..\..\src\imagerotate\imagerotatefuncs.cpp" />
+  </ItemGroup>
+  <ItemGroup><!-- Header files: listed as ClInclude so they are not passed to the compiler as translation units -->
+    <ClInclude Include="..\..\interface\IWelsVP.h" />
+    <ClInclude Include="..\..\src\common\resource.h" />
+    <ClInclude Include="..\..\src\common\cpu.h" />
+    <ClInclude Include="..\..\src\common\memory.h" />
+    <ClInclude Include="..\..\src\common\thread.h" />
+    <ClInclude Include="..\..\src\common\typedef.h" />
+    <ClInclude Include="..\..\src\common\util.h" />
+    <ClInclude Include="..\..\src\common\version.h" />
+    <ClInclude Include="..\..\src\common\WelsFrameWork.h" />
+    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetection.h" />
+    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h" />
+    <ClInclude Include="..\..\src\denoise\denoise.h" />
+    <ClInclude Include="..\..\src\vaacalc\vaacalculation.h" />
+    <ClInclude Include="..\..\src\backgounddetection\BackgroundDetection.h" />
+    <ClInclude Include="..\..\src\adaptivequantization\AdaptiveQuantization.h" />
+    <ClInclude Include="..\..\src\downsample\downsample.h" />
+    <ClInclude Include="..\..\src\complexityanalysis\ComplexityAnalysis.h" />
+    <ClInclude Include="..\..\src\imagerotate\imagerotate.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\src\common\WelsVP.def" />
+  </ItemGroup>
+  <ItemGroup>
+    <ResourceCompile Include="..\..\src\common\WelsVP.rc" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="..\..\src\asm\denoisefilter.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\downsample_bilinear.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\intra_pred.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\sad.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\vaa.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="..\..\..\common\cpuid.asm">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\  -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.targets" />
+  </ImportGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/processing/build/win32/WelsVP_2010.vcxproj.filters
@@ -1,0 +1,162 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <ClCompile Include="..\..\interface\IWelsVP.h"> <!-- NOTE(review): a .h file registered as ClCompile; ClInclude looks intended. Presumably this mirrors the item type used in WelsVP_2010.vcxproj, so change both together. -->
+      <Filter>headers</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\adaptivequantization\AdaptiveQuantization.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\common\util.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\backgounddetection\BackgroundDetection.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\complexityanalysis\ComplexityAnalysis.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\common\cpu.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\denoise\denoise.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\denoise\denoise_filter.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\downsample\downsample.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\downsample\downsamplefuncs.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\imagerotate\imagerotate.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\imagerotate\imagerotatefuncs.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\common\memory.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetection.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\common\thread.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\common\WelsFrameWorkEx.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\vaacalc\vaacalcfuncs.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\vaacalc\vaacalculation.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\common\WelsFrameWork.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\src\adaptivequantization\AdaptiveQuantization.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\backgounddetection\BackgroundDetection.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\complexityanalysis\ComplexityAnalysis.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\common\cpu.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\denoise\denoise.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\downsample\downsample.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\imagerotate\imagerotate.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\common\memory.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\common\resource.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetection.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\common\thread.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\common\typedef.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\common\util.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\vaacalc\vaacalculation.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\common\version.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\common\WelsFrameWork.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="..\..\src\asm\denoisefilter.asm">
+      <Filter>ASM</Filter>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\downsample_bilinear.asm">
+      <Filter>ASM</Filter>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\intra_pred.asm">
+      <Filter>ASM</Filter>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\sad.asm">
+      <Filter>ASM</Filter>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\vaa.asm">
+      <Filter>ASM</Filter>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\common\cpuid.asm">
+      <Filter>ASM</Filter>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <Filter Include="ASM">
+      <UniqueIdentifier>{ecef07b7-65e1-45c4-9afc-39f7b07992a2}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="headers">
+      <UniqueIdentifier>{be24742a-75fa-49a4-b77e-a69d626d46c8}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="sources">
+      <UniqueIdentifier>{9f4c2bd3-e8d2-4276-adc6-273c0031971a}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="resources">
+      <UniqueIdentifier>{322f1cbe-435f-402b-8d86-71d023d5d407}</UniqueIdentifier>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\src\common\WelsVP.def">
+      <Filter>resources</Filter>
+    </None>
+  </ItemGroup>
+  <ItemGroup>
+    <ResourceCompile Include="..\..\src\common\WelsVP.rc">
+      <Filter>resources</Filter>
+    </ResourceCompile>
+  </ItemGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/processing/build/win32/WelsVP_2012.sln
@@ -1,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2012
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP_2012", "WelsVP_2012.vcxproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
binary files /dev/null b/codec/processing/build/win32/WelsVP_2012.v11.suo differ
--- /dev/null
+++ b/codec/processing/build/win32/WelsVP_2012.vcxproj
@@ -1,0 +1,427 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}</ProjectGuid>
+    <RootNamespace>WelsVP</RootNamespace>
+    <Keyword>Win32Proj</Keyword>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <PlatformToolset>v110</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <PlatformToolset>v110</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <PlatformToolset>v110</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+    <WholeProgramOptimization>false</WholeProgramOptimization>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <PlatformToolset>v110</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+    <WholeProgramOptimization>false</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <_ProjectFileVersion>11.0.61030.0</_ProjectFileVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <OutDir>.\..\..\..\bin\win32\Debug\</OutDir>
+    <IntDir>.\..\..\..\obj\vp\Debug\</IntDir>
+    <LinkIncremental>true</LinkIncremental>
+    <TargetName>welsvp</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <!-- Output locations and target name for the 64-bit debug build -->
+    <LinkIncremental>true</LinkIncremental>
+    <TargetName>welsvp</TargetName>
+    <OutDir>.\..\..\..\bin\win64\Debug\</OutDir>
+    <IntDir>.\..\..\..\obj\vp\win64\Debug\</IntDir> <!-- own directory: the Win32 debug build writes its objects to obj\vp\Debug\ -->
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <OutDir>.\..\..\..\bin\win32\Release\</OutDir>
+    <IntDir>.\..\..\..\obj\vp\Release\</IntDir>
+    <LinkIncremental>false</LinkIncremental>
+    <GenerateManifest>false</GenerateManifest>
+    <TargetName>welsvp</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> <!-- Output locations and target name for the 64-bit release build -->
+    <LinkIncremental>false</LinkIncremental>
+    <GenerateManifest>false</GenerateManifest>
+    <TargetName>welsvp</TargetName>
+    <OutDir>.\..\..\..\bin\win64\Release\</OutDir>
+    <IntDir>.\..\..\..\obj\vp\win64\Release\</IntDir> <!-- own directory: the Win32 release build writes its objects to obj\vp\Release\ -->
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <CustomBuildStep>
+      <Command />
+    </CustomBuildStep>
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MinimalRebuild>true</MinimalRebuild>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+      <PrecompiledHeader />
+      <AssemblerListingLocation />
+      <WarningLevel>Level3</WarningLevel>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+    </ClCompile>
+    <ProjectReference>
+      <LinkLibraryDependencies>true</LinkLibraryDependencies>
+    </ProjectReference>
+    <Link>
+      <OutputFile>$(OutDir)\welsvp.dll</OutputFile>
+      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <GenerateMapFile>true</GenerateMapFile>
+      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+      <SubSystem>Windows</SubSystem>
+      <TargetMachine>MachineX86</TargetMachine>
+      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+    </Link>
+    <PostBuildEvent>
+      <Command>
+      </Command>
+    </PostBuildEvent>
+    <Bscmake>
+      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <CustomBuildStep>
+      <Command>
+      </Command>
+    </CustomBuildStep>
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <AssemblerListingLocation>
+      </AssemblerListingLocation>
+      <WarningLevel>Level3</WarningLevel>
+      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+    </ClCompile>
+    <ProjectReference>
+      <LinkLibraryDependencies>true</LinkLibraryDependencies>
+    </ProjectReference>
+    <Link>
+      <OutputFile>$(OutDir)\welsvp.dll</OutputFile>
+      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <GenerateMapFile>true</GenerateMapFile>
+      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+      <SubSystem>Windows</SubSystem>
+      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+    </Link>
+    <PostBuildEvent>
+      <Command>
+      </Command>
+    </PostBuildEvent>
+    <Bscmake>
+      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <PreBuildEvent>
+      <Command>
+      </Command>
+    </PreBuildEvent>
+    <CustomBuildStep>
+      <Command />
+    </CustomBuildStep>
+    <ClCompile>
+      <Optimization>Full</Optimization>
+      <IntrinsicFunctions>false</IntrinsicFunctions>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+      <FunctionLevelLinking>false</FunctionLevelLinking>
+      <PrecompiledHeader />
+      <WarningLevel>Level3</WarningLevel>
+      <DebugInformationFormat />
+    </ClCompile>
+    <Link>
+      <OutputFile>$(OutDir)\welsvp.dll</OutputFile>
+      <EnableUAC>false</EnableUAC>
+      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <GenerateMapFile>true</GenerateMapFile>
+      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+      <MapExports>true</MapExports>
+      <SubSystem>Windows</SubSystem>
+      <OptimizeReferences>true</OptimizeReferences>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <TargetMachine>MachineX86</TargetMachine>
+      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+    </Link>
+    <PostBuildEvent>
+      <Command>
+      </Command>
+    </PostBuildEvent>
+    <Bscmake>
+      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <PreBuildEvent>
+      <Command>
+      </Command>
+    </PreBuildEvent>
+    <CustomBuildStep>
+      <Command>
+      </Command>
+    </CustomBuildStep>
+    <ClCompile>
+      <Optimization>Full</Optimization>
+      <IntrinsicFunctions>false</IntrinsicFunctions>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <PreprocessorDefinitions>WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+      <FunctionLevelLinking>false</FunctionLevelLinking>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <DebugInformationFormat>
+      </DebugInformationFormat>
+    </ClCompile>
+    <Link>
+      <OutputFile>$(OutDir)\welsvp.dll</OutputFile>
+      <EnableUAC>false</EnableUAC>
+      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <GenerateMapFile>true</GenerateMapFile>
+      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+      <MapExports>true</MapExports>
+      <SubSystem>Windows</SubSystem>
+      <OptimizeReferences>true</OptimizeReferences>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+    </Link>
+    <PostBuildEvent>
+      <Command>
+      </Command>
+    </PostBuildEvent>
+    <Bscmake>
+      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\src\common\cpu.cpp" />
+    <ClCompile Include="..\..\src\common\memory.cpp" />
+    <ClCompile Include="..\..\src\common\thread.cpp" />
+    <ClCompile Include="..\..\src\common\util.cpp" />
+    <ClCompile Include="..\..\src\common\WelsFrameWork.cpp" />
+    <ClCompile Include="..\..\src\common\WelsFrameWorkEx.cpp" />
+    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetection.cpp" />
+    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp" />
+    <ClCompile Include="..\..\src\denoise\denoise.cpp" />
+    <ClCompile Include="..\..\src\denoise\denoise_filter.cpp" />
+    <ClCompile Include="..\..\src\vaacalc\vaacalcfuncs.cpp" />
+    <ClCompile Include="..\..\src\vaacalc\vaacalculation.cpp" />
+    <ClCompile Include="..\..\src\backgounddetection\BackgroundDetection.cpp" />
+    <ClCompile Include="..\..\src\adaptivequantization\AdaptiveQuantization.cpp" />
+    <ClCompile Include="..\..\src\downsample\downsample.cpp" />
+    <ClCompile Include="..\..\src\downsample\downsamplefuncs.cpp" />
+    <ClCompile Include="..\..\src\complexityanalysis\ComplexityAnalysis.cpp" />
+    <ClCompile Include="..\..\src\imagerotate\imagerotate.cpp" />
+    <ClCompile Include="..\..\src\imagerotate\imagerotatefuncs.cpp" />
+  </ItemGroup>
+  <ItemGroup> <!-- Header files: ClInclude items are listed for the IDE and are not compiled as translation units -->
+    <ClInclude Include="..\..\interface\IWelsVP.h" />
+    <ClInclude Include="..\..\src\common\resource.h" />
+    <ClInclude Include="..\..\src\common\cpu.h" />
+    <ClInclude Include="..\..\src\common\memory.h" />
+    <ClInclude Include="..\..\src\common\thread.h" />
+    <ClInclude Include="..\..\src\common\typedef.h" />
+    <ClInclude Include="..\..\src\common\util.h" />
+    <ClInclude Include="..\..\src\common\version.h" />
+    <ClInclude Include="..\..\src\common\WelsFrameWork.h" />
+    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetection.h" />
+    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h" />
+    <ClInclude Include="..\..\src\denoise\denoise.h" />
+    <ClInclude Include="..\..\src\vaacalc\vaacalculation.h" />
+    <ClInclude Include="..\..\src\backgounddetection\BackgroundDetection.h" />
+    <ClInclude Include="..\..\src\adaptivequantization\AdaptiveQuantization.h" />
+    <ClInclude Include="..\..\src\downsample\downsample.h" />
+    <ClInclude Include="..\..\src\complexityanalysis\ComplexityAnalysis.h" />
+    <ClInclude Include="..\..\src\imagerotate\imagerotate.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\src\common\WelsVP.def" />
+  </ItemGroup>
+  <ItemGroup>
+    <ResourceCompile Include="..\..\src\common\WelsVP.rc" />
+  </ItemGroup>
+  <ItemGroup> <!-- NASM sources, each assembled by a per-configuration CustomBuild command into $(IntDir)%(Filename).obj. NOTE(review): the x64 commands still pass "-f win32 -DPREFIX"; they are inert only because every entry is ExcludedFromBuild for x64. Revisit the flags before enabling x64 assembly. -->
+    <CustomBuild Include="..\..\src\asm\asm_inc.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\cpuid.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\denoisefilter.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm   -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm   -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\downsample_bilinear.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\intra_pred.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\sad.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\vaa.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.targets" />
+  </ImportGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/processing/build/win32/WelsVP_2012.vcxproj.filters
@@ -1,0 +1,165 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <ClCompile Include="..\..\interface\IWelsVP.h"> <!-- NOTE(review): a header registered as ClCompile while every other .h in this file is a ClInclude; confirm it matches the item type used in WelsVP_2012.vcxproj -->
+      <Filter>headers</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\adaptivequantization\AdaptiveQuantization.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\backgounddetection\BackgroundDetection.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\complexityanalysis\ComplexityAnalysis.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\common\cpu.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\denoise\denoise.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\denoise\denoise_filter.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\downsample\downsample.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\downsample\downsamplefuncs.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\imagerotate\imagerotate.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\imagerotate\imagerotatefuncs.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\common\memory.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetection.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\common\thread.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\common\util.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\vaacalc\vaacalcfuncs.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\vaacalc\vaacalculation.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\common\WelsFrameWork.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\common\WelsFrameWorkEx.cpp">
+      <Filter>sources</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\src\adaptivequantization\AdaptiveQuantization.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\backgounddetection\BackgroundDetection.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\complexityanalysis\ComplexityAnalysis.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\common\cpu.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\denoise\denoise.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\downsample\downsample.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\imagerotate\imagerotate.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\common\memory.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\common\resource.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetection.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\common\thread.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\common\typedef.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\common\util.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\vaacalc\vaacalculation.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\common\version.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\src\common\WelsFrameWork.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="..\..\src\asm\asm_inc.asm">
+      <Filter>ASM</Filter>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\cpuid.asm">
+      <Filter>ASM</Filter>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\denoisefilter.asm">
+      <Filter>ASM</Filter>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\downsample_bilinear.asm">
+      <Filter>ASM</Filter>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\intra_pred.asm">
+      <Filter>ASM</Filter>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\sad.asm">
+      <Filter>ASM</Filter>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\vaa.asm">
+      <Filter>ASM</Filter>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <Filter Include="ASM">
+      <UniqueIdentifier>{18a2a593-cf54-452e-bf69-5eaf9aac6518}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="headers">
+      <UniqueIdentifier>{5a921557-4f54-4838-80de-8c517b1d099b}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="sources">
+      <UniqueIdentifier>{0b628696-109b-4a2e-b11f-5e9e006b76ae}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="resources">
+      <UniqueIdentifier>{94dba5f3-1b39-4ccd-891b-6a70cb59f210}</UniqueIdentifier>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ResourceCompile Include="..\..\src\common\WelsVP.rc">
+      <Filter>resources</Filter>
+    </ResourceCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\src\common\WelsVP.def">
+      <Filter>resources</Filter>
+    </None>
+  </ItemGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/processing/build/win32/WelsVideoProcessor.sln
@@ -1,0 +1,29 @@
+
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual Studio 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVideoProcessor", "WelsVideoProcessor.vcproj", "{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}"
+	ProjectSection(ProjectDependencies) = postProject
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562} = {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP", "WelsVP.vcproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Debug|Win32.ActiveCfg = Debug|Win32
+		{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Debug|Win32.Build.0 = Debug|Win32
+		{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Release|Win32.ActiveCfg = Release|Win32
+		{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Release|Win32.Build.0 = Release|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- /dev/null
+++ b/codec/processing/build/win32/WelsVideoProcessor.vcproj
@@ -1,0 +1,213 @@
+<?xml version="1.0" encoding="gb2312"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="WelsVideoProcessor"
+	ProjectGUID="{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}"
+	RootNamespace="WelsVideoProcessor"
+	Keyword="Win32Proj"
+	TargetFrameworkVersion="196613"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="$(SolutionDir)..\..\bin\$(ConfigurationName)"
+			IntermediateDirectory="$(SolutionDir)..\..\obj\$(ConfigurationName)\$(ProjectName)"
+			ConfigurationType="1"
+			CharacterSet="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="1"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="4"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkIncremental="2"
+				GenerateDebugInformation="true"
+				SubSystem="1"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="$(SolutionDir)..\..\bin\$(ConfigurationName)"
+			IntermediateDirectory="$(SolutionDir)..\..\obj\$(ConfigurationName)\$(ProjectName)"
+			ConfigurationType="1"
+			CharacterSet="1"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				EnableIntrinsicFunctions="true"
+				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
+				RuntimeLibrary="0"
+				EnableFunctionLevelLinking="true"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkIncremental="1"
+				GenerateDebugInformation="true"
+				SubSystem="1"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+			>
+			<File
+				RelativePath="..\..\src\testbed\stdafx.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\testbed\wels_process.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\testbed\WelsVideoProcessor.cpp"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			Filter="h;hpp;hxx;hm;inl;inc;xsd"
+			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+			>
+			<File
+				RelativePath="..\..\src\testbed\stdafx.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\testbed\targetver.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\testbed\wels_process.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Resource Files"
+			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
+			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
+			>
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
--- /dev/null
+++ b/codec/processing/interface/IWelsVP.h
@@ -1,0 +1,286 @@
+/*!
+ * \copy
+ *     Copyright (c)  2004-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	    :  IWelsVP.h
+ *
+ * \brief	    :  Interface of wels video processor class
+ *
+ * \date        :  2011/01/04
+ *
+ * \description :  1. should support both C/C++ style interface
+ *                 2. should concern with the feature extension requirement
+ *                 3. should care the usage of "char"==>
+ *                     1) value char  : signed char/unsigned char
+ *                     2) string char : char
+ *
+ *************************************************************************************
+ */
+
+#ifndef IWELSVP_H_
+#define IWELSVP_H_
+
+#ifdef _WIN32
+#define WELSAPI __stdcall   /* calling convention attached to CreateVpInterface / DestroyVpInterface on Windows */
+#else
+#define WELSAPI             /* empty everywhere else */
+#endif
+
+#define WELSVP_MAJOR_VERSION   1
+#define WELSVP_MINOR_VERSION   1
+#define WELSVP_VERSION         ((WELSVP_MAJOR_VERSION << 8) + WELSVP_MINOR_VERSION)  /* major shifted left by 8 plus minor: 0x0101 here */
+
+typedef enum {   /* status code returned by every interface entry point: 0 on success, negative on failure */
+  RET_SUCCESS          =  0,
+  RET_FAILED           = -1,
+  RET_INVALIDPARAM     = -2,
+  RET_OUTOFMEMORY      = -3,
+  RET_NOTSUPPORTED       = -4,
+  RET_UNEXPECTED       = -5,
+  RET_NEEDREINIT		  = -6
+} EResult;
+
+typedef enum {   /* pixel format identifiers; several names below deliberately or accidentally share a value */
+  VIDEO_FORMAT_NULL       = 0,   /* invalid format   */
+  /*rgb color formats*/
+  VIDEO_FORMAT_RGB        = 1,   /* rgb 24bits       */
+  VIDEO_FORMAT_RGBA       = 2,   /* rgba             */
+  VIDEO_FORMAT_RGB555     = 3,   /* rgb555           */
+  VIDEO_FORMAT_RGB565     = 4,   /* rgb565           */
+  VIDEO_FORMAT_BGR        = 5,   /* bgr 24bits       */
+  VIDEO_FORMAT_BGRA       = 6,   /* bgr 32bits       */
+  VIDEO_FORMAT_ABGR       = 7,   /* abgr             */
+  VIDEO_FORMAT_ARGB       = 8,   /* argb             */
+
+  /*yuv color formats*/
+  VIDEO_FORMAT_YUY2       = 20,   /* yuy2             */
+  VIDEO_FORMAT_YVYU       = 21,   /* yvyu             */
+  VIDEO_FORMAT_UYVY       = 22,   /* uyvy             */
+  VIDEO_FORMAT_I420       = 23,   /* yuv 4:2:0 planar */
+  VIDEO_FORMAT_YV12       = 24,   /* yuv 4:2:0 planar */
+  VIDEO_FORMAT_INTERNAL   = 25,   /* Only Used for SVC decoder testbed */
+  VIDEO_FORMAT_NV12		= 26,	/* y planar + uv packed */
+  VIDEO_FORMAT_I422       = 27,   /* yuv 4:2:2 planar */
+  VIDEO_FORMAT_I444       = 28,   /* yuv 4:4:4 planar */
+  VIDEO_FORMAT_YUYV       = 20,   /* yuv 4:2:2 packed; same value as VIDEO_FORMAT_YUY2 */
+
+  VIDEO_FORMAT_RGB24      = 1,   /* same value as VIDEO_FORMAT_RGB  */
+  VIDEO_FORMAT_RGB32      = 2,   /* same value as VIDEO_FORMAT_RGBA */
+  VIDEO_FORMAT_RGB24_INV  = 5,   /* same value as VIDEO_FORMAT_BGR  */
+  VIDEO_FORMAT_RGB32_INV  = 6,   /* same value as VIDEO_FORMAT_BGRA */
+  VIDEO_FORMAT_RGB555_INV = 7,   /* NOTE(review): same value as VIDEO_FORMAT_ABGR -- confirm this is intended */
+  VIDEO_FORMAT_RGB565_INV = 8,   /* NOTE(review): same value as VIDEO_FORMAT_ARGB -- confirm this is intended */
+  VIDEO_FORMAT_YUV2       = 21,  /* same value as VIDEO_FORMAT_YVYU (VIDEO_FORMAT_YUY2 is 20) */
+  VIDEO_FORMAT_420        = 23,  /* same value as VIDEO_FORMAT_I420 */
+
+  VIDEO_FORMAT_VFlip      = 0x80000000   /* only bit 31 set; NOTE(review): not representable as int, which ISO C before C23 requires of an enumerator -- confirm the C compilers in use accept it */
+} EVideoFormat;
+
+typedef enum {   /* type of SPixMap::eProperty; BUFFER_HOSTMEM is 0, BUFFER_SURFACE is 1 */
+  BUFFER_HOSTMEM  = 0,
+  BUFFER_SURFACE
+} EPixMapBufferProperty;
+
+typedef struct {   /* rectangle given by its top/left corner plus width and height */
+  int iRectTop;
+  int iRectLeft;
+  int iRectWidth;    /* CAdaptiveQuantization::Process uses iRectWidth >> 4 as the macroblock column count */
+  int iRectHeight;   /* ... and iRectHeight >> 4 as the macroblock row count */
+} SRect;
+
+typedef struct {   /* picture descriptor handed to Process(): plane pointers, strides, rectangle and format */
+  void*        pPixel[3];   /* plane pointers; index 0 is read as the luma (Y) plane by the adaptive quantization code */
+  int          iSizeInBits;
+  int          iStride[3];  /* stride of the matching pPixel[] entry */
+  SRect        sRect;
+  EVideoFormat eFormat;
+  EPixMapBufferProperty eProperty;//not use? to remove? but how about the size of SPixMap?
+} SPixMap;
+
+typedef enum {   /* processing method identifiers, implicitly numbered 0..10 in declaration order */
+  METHOD_NULL              = 0,
+  METHOD_COLORSPACE_CONVERT    ,//not support yet
+  METHOD_DENOISE              ,
+  METHOD_SCENE_CHANGE_DETECTION ,
+  METHOD_DOWNSAMPLE			  ,
+  METHOD_VAA_STATISTICS        ,
+  METHOD_BACKGROUND_DETECTION  ,
+  METHOD_ADAPTIVE_QUANT ,
+  METHOD_COMPLEXITY_ANALYSIS   ,
+  METHOD_IMAGE_ROTATE		  ,
+  METHOD_MASK   /* last enumerator (value 10) */
+} EMethods;
+
+//-----------------------------------------------------------------//
+//  Algorithm parameters define
+//-----------------------------------------------------------------//
+
+typedef struct {   /* NOTE(review): presumably the result block of METHOD_SCENE_CHANGE_DETECTION -- confirm against the caller */
+  int bSceneChangeFlag; // 0:false ; 1:true
+} SSceneChangeResult;
+
+typedef enum {           /* scene change levels, implicitly numbered 0..2; no trailing comma so C89 / C++03 accept the list */
+  SIMILAR_SCENE,         /* similar scene */
+  MEDIUM_CHANGED_SCENE,  /* medium changed scene */
+  LARGE_CHANGED_SCENE    /* large changed scene */
+} ESceneChangeIdc;
+
+typedef struct {   /* statistics of a current/reference luma pair; CAdaptiveQuantization::Process indexes the arrays by macroblock */
+  unsigned char* pCurY;					// Y data of current frame
+  unsigned char* pRefY;					// Y data of pRef frame for diff calc
+  int (*pSad8x8)[4];				// sad of 8x8, every 4 in the same 16x16 get together
+  int* pSsd16x16;					// sum of square difference of 16x16
+  int* pSum16x16;					// sum of 16x16
+  int* pSumOfSquare16x16;					// sum of square of 16x16
+  int	(*pSumOfDiff8x8)[4];
+  unsigned char (*pMad8x8)[4];
+  int iFrameSad;					// sad of frame
+} SVAACalcResult;
+
+typedef struct {   /* NOTE(review): presumably the parameter block of METHOD_VAA_STATISTICS -- confirm */
+  int iCalcVar;    /* NOTE(review): the three iCalc* fields look like switches selecting which statistics are computed -- confirm */
+  int iCalcBgd;
+  int iCalcSsd;
+  int iReserved;
+  SVAACalcResult*	pCalcResult;
+} SVAACalcParam;
+
+typedef struct {   /* NOTE(review): presumably the parameter block of METHOD_BACKGROUND_DETECTION -- confirm */
+  signed char*		pBackgroundMbFlag;
+  SVAACalcResult*  pCalcRes;
+} SBGDInterface;
+
+typedef enum {       /* values compared against SAdaptiveQuantizationParam::iAdaptiveQuantMode; no trailing comma (C89 / C++03) */
+  AQ_QUALITY_MODE,   /* Quality mode (0) */
+  AQ_BITRATE_MODE    /* Bitrate mode (1) */
+} EAQModes;
+
+typedef struct {   /* per-macroblock motion/texture pair filled in by the adaptive quantization code */
+  unsigned short    uiMotionIndex;
+  unsigned short    uiTextureIndex;
+} SMotionTextureUnit;
+
+typedef struct {   /* parameter block copied by CAdaptiveQuantization::Set and used by its Process */
+  int					iAdaptiveQuantMode; // 0:quality mode, 1:bitrates mode
+  SVAACalcResult*		pCalcResult;   /* reused by Process when its pRefY/pCurY equal the luma planes being processed */
+  SMotionTextureUnit*  pMotionTextureUnit;   /* work buffer: Process writes one entry per 16x16 macroblock */
+
+  signed char*			pMotionTextureIndexToDeltaQp;   /* output: one delta QP per macroblock, row by row */
+  double				dAverMotionTextureIndexToDeltaQp;   /* output: sum of the delta QPs divided by the macroblock count */
+} SAdaptiveQuantizationParam;
+
+typedef enum {   /* NOTE(review): presumably the values of SComplexityAnalysisParam::iComplexityAnalysisMode -- confirm */
+  FRAME_SAD     =  0,
+  GOM_SAD       = -1,
+  GOM_VAR       = -2
+} EComplexityAnalysisMode;
+
+typedef struct {   /* NOTE(review): presumably the parameter block of METHOD_COMPLEXITY_ANALYSIS -- confirm */
+  int  iComplexityAnalysisMode;
+  int  iCalcBgd;
+  int  iMbNumInGom;
+  int  iFrameComplexity;
+  int*  pGomComplexity;
+  int*  pGomForegroundBlockNum;
+  signed char*  pBackgroundMbFlag;
+  unsigned int* uiRefMbType;
+  SVAACalcResult*  pCalcResult;
+} SComplexityAnalysisParam;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef struct {   /* C flavour of the interface: a context pointer plus a table of function pointers */
+  void*    pCtx;   /* context; every entry below takes a pCtx as its first argument */
+  EResult (*Init) (void* pCtx, int iType, void* pCfg);
+  EResult (*Uninit) (void* pCtx, int iType);
+  EResult (*Flush) (void* pCtx, int iType);
+  EResult (*Process) (void* pCtx, int iType, SPixMap* pSrc, SPixMap* dst);
+  EResult (*Get) (void* pCtx, int iType, void* pParam);
+  EResult (*Set) (void* pCtx, int iType, void* pParam);
+  EResult (*SpecialFeature) (void* pCtx, int iType, void* pIn, void* pOut);
+} IWelsVPc;
+
+#if defined(__cplusplus) && !defined(CINTERFACE)  /* C++ style interface */
+
+class IWelsVP {   /* C++ flavour of the interface: the same seven operations as IWelsVPc without the explicit context argument */
+ public:
+  virtual ~IWelsVP() {}
+
+ public:
+  virtual EResult Init (int iType, void* pCfg) = 0;   /* iType: presumably an EMethods value -- confirm */
+  virtual EResult Uninit (int iType) = 0;
+  virtual EResult Flush (int iType) = 0;
+  virtual EResult Process (int iType, SPixMap* pSrc, SPixMap* dst) = 0;
+  virtual EResult Get (int iType, void* pParam) = 0;
+  virtual EResult Set (int iType, void* pParam) = 0;
+  virtual EResult SpecialFeature (int iType, void* pIn, void* pOut) = 0;
+};
+
+/* It is recommended to invoke the interface through these macros for convenience */
+#define IWelsVPFunc_Init(p, a, b)                  (p)->Init(a, b)
+#define IWelsVPFunc_Uninit(p, a)                   (p)->Uninit(a)
+#define IWelsVPFunc_Flush(p, a)                    (p)->Flush(a)
+#define IWelsVPFunc_Process(p, a, b, c)            (p)->Process(a, b, c)
+#define IWelsVPFunc_Get(p, a, b)                   (p)->Get(a, b)
+#define IWelsVPFunc_Set(p, a, b)                   (p)->Set(a, b)
+#define IWelsVPFunc_SpecialFeature(p, a, b, c)     (p)->SpecialFeature(a, b, c)
+
+/* C++ interface version: 0x8000 added to the low 15 bits of WELSVP_VERSION ("VERION" is the spelling of the public macro name) */
+#define WELSVP_INTERFACE_VERION                    (0x8000 + (WELSVP_VERSION & 0x7fff))
+#define WELSVP_EXTERNC_BEGIN                       extern "C" {
+#define WELSVP_EXTERNC_END                         }
+
+#else    /* C style interface */
+
+/* It is recommended to invoke the interface through these macros for convenience; p is an IWelsVPc* and its pCtx member is forwarded as the context argument */
+#define IWelsVPFunc_Init(p, a, b)                  (p)->Init((p)->pCtx, a, b)
+#define IWelsVPFunc_Uninit(p, a)                   (p)->Uninit((p)->pCtx, a)
+#define IWelsVPFunc_Flush(p, a)                    (p)->Flush((p)->pCtx, a)
+#define IWelsVPFunc_Process(p, a, b, c)            (p)->Process((p)->pCtx, a, b, c)
+#define IWelsVPFunc_Get(p, a, b)                   (p)->Get((p)->pCtx, a, b)
+#define IWelsVPFunc_Set(p, a, b)                   (p)->Set((p)->pCtx, a, b)
+#define IWelsVPFunc_SpecialFeature(p, a, b, c)     (p)->SpecialFeature((p)->pCtx, a, b, c)
+
+/* C interface version: 0x0001 added to the low 15 bits of WELSVP_VERSION */
+#define WELSVP_INTERFACE_VERION                    (0x0001 + (WELSVP_VERSION & 0x7fff))
+#define WELSVP_EXTERNC_BEGIN
+#define WELSVP_EXTERNC_END
+
+#endif
+
+WELSVP_EXTERNC_BEGIN   /* extern "C" { for the C++ interface, empty for the C interface */
+EResult WELSAPI CreateVpInterface (void** ppCtx, int iVersion /*= WELSVP_INTERFACE_VERION*/);   /* hands the new interface back through *ppCtx */
+EResult WELSAPI DestroyVpInterface (void* pCtx , int iVersion /*= WELSVP_INTERFACE_VERION*/);   /* counterpart of CreateVpInterface */
+WELSVP_EXTERNC_END
+
+//////////////////////////////////////////////////////////////////////////////////////////////
+#endif // IWELSVP_H_
+
+
--- /dev/null
+++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp
@@ -1,0 +1,256 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include "AdaptiveQuantization.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+
+#define AVERAGE_TIME_MOTION                   (0.3) //0.3046875 // 1/4 + 1/16 - 1/128 ~ 0.3; scales the average motion index in Process()
+#define AVERAGE_TIME_TEXTURE_QUALITYMODE  (1.0) //0.5 // 1/2; scales the average texture index when the mode is AQ_QUALITY_MODE
+#define AVERAGE_TIME_TEXTURE_BITRATEMODE  (0.875) //0.5 // 1/2; scales the average texture index in every other mode
+#define MODEL_ALPHA                           (0.9910) //1.5 //1.1102; used as dQStep = (a - 1) / (a + MODEL_ALPHA)
+#define MODEL_TIME                            (5.8185) //9.0 //5.9842; delta QP = MODEL_TIME * dQStep
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CAdaptiveQuantization::CAdaptiveQuantization (int32_t iCpuFlag) {  // clear the state, then pick the variance routine
+  WelsMemset (&m_sAdaptiveQuantParam, 0, sizeof (m_sAdaptiveQuantParam));
+  m_eMethod = METHOD_ADAPTIVE_QUANT;
+  m_CPUFlag = iCpuFlag;
+  m_pfVar   = NULL;  // placeholder only: WelsInitVarFunc assigns the actual routine on the next line
+  WelsInitVarFunc (m_pfVar, iCpuFlag);
+}
+
+CAdaptiveQuantization::~CAdaptiveQuantization() {  // empty: the constructor acquires nothing that would need releasing
+}
+
+// Computes the per-macroblock delta QP map of the current frame (pSrcPixMap) against the
+// reference frame (pRefPixMap) into the buffers registered through Set(), together with
+// the average of that map. iType is not used here.
+// Returns RET_SUCCESS, or RET_INVALIDPARAM when a frame or work buffer is missing or the
+// frame does not contain a single whole 16x16 macroblock.
+EResult CAdaptiveQuantization::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  SMotionTextureUnit* pMotionTexture  = m_sAdaptiveQuantParam.pMotionTextureUnit;
+  SVAACalcResult*     pVaaCalcResults = m_sAdaptiveQuantParam.pCalcResult;
+  signed char*        pDeltaQpMap     = m_sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp;
+
+  // All of these are dereferenced unconditionally below; the buffers stay NULL until Set() is called.
+  if (pSrcPixMap == NULL || pRefPixMap == NULL || pMotionTexture == NULL
+      || pVaaCalcResults == NULL || pDeltaQpMap == NULL) {
+    return RET_INVALIDPARAM;
+  }
+
+  int32_t iMbWidth    = pSrcPixMap->sRect.iRectWidth  >> 4;
+  int32_t iMbHeight   = pSrcPixMap->sRect.iRectHeight >> 4;
+  int32_t iMbTotalNum = iMbWidth * iMbHeight;
+
+  // With no macroblock the averages below would be 0/0 (NaN).
+  if (iMbWidth <= 0 || iMbHeight <= 0) {
+    return RET_INVALIDPARAM;
+  }
+
+  int8_t   iMotionTextureIndexToDeltaQp = 0;
+  int32_t  iAverMotionTextureIndexToDeltaQp = 0;
+  double_t dAverageMotionIndex = 0.0;
+  double_t dAverageTextureIndex = 0.0;
+
+  double_t dQStep = 0.0;
+  double_t dLumaMotionDeltaQp = 0;
+  double_t dLumaTextureDeltaQp = 0;
+
+  uint8_t* pRefFrameY = (uint8_t*)pRefPixMap->pPixel[0];
+  uint8_t* pCurFrameY = (uint8_t*)pSrcPixMap->pPixel[0];
+  int32_t iRefStride  = pRefPixMap->iStride[0];
+  int32_t iCurStride  = pSrcPixMap->iStride[0];
+
+  uint8_t* pRefFrameTmp = NULL, *pCurFrameTmp = NULL;
+  int32_t i = 0, j = 0;
+
+  /////////////////////////////////////// motion //////////////////////////////////
+  //  motion MB residual variance
+  if (pVaaCalcResults->pRefY == pRefFrameY && pVaaCalcResults->pCurY == pCurFrameY) {
+    // The VAA statistics refer to these very planes: derive the indices from them
+    // (one entry per macroblock, row by row) instead of reading the pixels again.
+    int32_t iSumDiff, iSQDiff, uiSum, iSQSum;
+    for (int32_t iMbIndex = 0; iMbIndex < iMbTotalNum; ++iMbIndex) {
+      iSumDiff =  pVaaCalcResults->pSad8x8[iMbIndex][0];
+      iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][1];
+      iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][2];
+      iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][3];
+
+      iSQDiff = pVaaCalcResults->pSsd16x16[iMbIndex];
+      uiSum = pVaaCalcResults->pSum16x16[iMbIndex];
+      iSQSum = pVaaCalcResults->pSumOfSquare16x16[iMbIndex];
+
+      // index = (sum of squares >> 8) - (sum >> 8)^2, the same form SampleVariance16x16_c uses
+      iSumDiff = iSumDiff >> 8;
+      pMotionTexture->uiMotionIndex = (iSQDiff >> 8) - (iSumDiff * iSumDiff);
+
+      uiSum = uiSum >> 8;
+      pMotionTexture->uiTextureIndex = (iSQSum >> 8) - (uiSum * uiSum);
+
+      dAverageMotionIndex += pMotionTexture->uiMotionIndex;
+      dAverageTextureIndex += pMotionTexture->uiTextureIndex;
+      pMotionTexture++;
+    }
+  } else {
+    // Otherwise the indices are measured directly on the luma planes, which must exist.
+    if (pRefFrameY == NULL || pCurFrameY == NULL) {
+      return RET_INVALIDPARAM;
+    }
+    for (j = 0; j < iMbHeight; j ++) {
+      pRefFrameTmp  = pRefFrameY;
+      pCurFrameTmp  = pCurFrameY;
+      for (i = 0; i < iMbWidth; i++) {
+        m_pfVar (pRefFrameTmp, iRefStride, pCurFrameTmp, iCurStride, pMotionTexture);
+        dAverageMotionIndex += pMotionTexture->uiMotionIndex;
+        dAverageTextureIndex += pMotionTexture->uiTextureIndex;
+        pMotionTexture++;
+        pRefFrameTmp += MB_WIDTH_LUMA;
+        pCurFrameTmp += MB_WIDTH_LUMA;
+      }
+      pRefFrameY += (iRefStride) << 4;
+      pCurFrameY += (iCurStride) << 4;
+    }
+  }
+  dAverageMotionIndex = dAverageMotionIndex / iMbTotalNum;
+  dAverageTextureIndex = dAverageTextureIndex / iMbTotalNum;
+  // A (near) zero average is replaced by 1.0 because it is used as a divisor further down.
+  if ((dAverageMotionIndex <= PESN) && (dAverageMotionIndex >= -PESN)) {
+    dAverageMotionIndex = 1.0;
+  }
+  if ((dAverageTextureIndex <= PESN) && (dAverageTextureIndex >= -PESN)) {
+    dAverageTextureIndex = 1.0;
+  }
+  //  motion mb residual map to QP
+  //  texture mb original map to QP
+  dAverageMotionIndex = AVERAGE_TIME_MOTION * dAverageMotionIndex;
+
+  if (m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_QUALITY_MODE) {
+    dAverageTextureIndex = AVERAGE_TIME_TEXTURE_QUALITYMODE * dAverageTextureIndex;
+  } else {
+    dAverageTextureIndex = AVERAGE_TIME_TEXTURE_BITRATEMODE * dAverageTextureIndex;
+  }
+
+  pMotionTexture = m_sAdaptiveQuantParam.pMotionTextureUnit;
+  for (j = 0; j < iMbHeight; j ++) {
+    for (i = 0; i < iMbWidth; i++) {
+      double_t a = pMotionTexture->uiTextureIndex / dAverageTextureIndex;
+      dQStep = (a - 1) / (a + MODEL_ALPHA);
+      dLumaTextureDeltaQp = MODEL_TIME * dQStep;// range +- 6
+
+      iMotionTextureIndexToDeltaQp = (int8_t)dLumaTextureDeltaQp;
+
+      a = pMotionTexture->uiMotionIndex / dAverageMotionIndex;
+      dQStep = (a - 1) / (a + MODEL_ALPHA);
+      dLumaMotionDeltaQp = MODEL_TIME * dQStep;// range +- 6
+
+      // Quality mode adds the motion term only when it is negative; bitrate mode always adds it.
+      if ((m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_QUALITY_MODE && dLumaMotionDeltaQp < -PESN)
+          || (m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_BITRATE_MODE)) {
+        iMotionTextureIndexToDeltaQp += (int8_t)dLumaMotionDeltaQp;
+      }
+
+      pDeltaQpMap[j * iMbWidth + i] = iMotionTextureIndexToDeltaQp;
+      iAverMotionTextureIndexToDeltaQp += iMotionTextureIndexToDeltaQp;
+      pMotionTexture++;
+    }
+  }
+  m_sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp = (1.0 * iAverMotionTextureIndexToDeltaQp) / iMbTotalNum;
+
+  return RET_SUCCESS;
+}
+
+
+
+// Stores a member-wise copy of the SAdaptiveQuantizationParam behind pParam; iType is ignored.
+EResult CAdaptiveQuantization::Set (int32_t iType, void* pParam) {
+  const SAdaptiveQuantizationParam* pNewParam = (const SAdaptiveQuantizationParam*)pParam;
+  if (NULL == pNewParam) {
+    return RET_INVALIDPARAM;
+  }
+  m_sAdaptiveQuantParam = *pNewParam;
+  return RET_SUCCESS;
+}
+
+// Reports the stored average delta QP through pParam (read as an SAdaptiveQuantizationParam*);
+// no other field of *pParam is written. iType is ignored.
+EResult CAdaptiveQuantization::Get (int32_t iType, void* pParam) {
+  SAdaptiveQuantizationParam* pOutParam = (SAdaptiveQuantizationParam*)pParam;
+  if (NULL == pOutParam) {
+    return RET_INVALIDPARAM;
+  }
+
+  pOutParam->dAverMotionTextureIndexToDeltaQp = m_sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp;
+  return RET_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+
+// Chooses the variance routine: always the C one, since the SSE2 branch below is empty.
+void CAdaptiveQuantization::WelsInitVarFunc (PVarFunc& pfVar,  int32_t iCpuFlag) {
+  pfVar = SampleVariance16x16_c;
+#ifdef X86_ASM
+  if ((iCpuFlag & WELS_CPU_SSE2) != 0) {
+    // SampleVariance16x16_sse2 is not hooked up; the C routine chosen above stays in place
+  }
+#endif
+}
+
+// C routine for one MB_WIDTH_LUMA x MB_WIDTH_LUMA luma block. Stores in *pMotionTexture:
+//   uiMotionIndex  = (sum of squared |ref - src|) >> 8  minus the square of (sum of |ref - src|) >> 8
+//   uiTextureIndex = (sum of squared src) >> 8  minus the square of (sum of src) >> 8
+void SampleVariance16x16_c (uint8_t* pRefY, int32_t iRefStride, uint8_t* pSrcY, int32_t iSrcStride,
+                            SMotionTextureUnit* pMotionTexture) {
+  uint32_t uiSrcSqAcc = 0, uiDiffSqAcc = 0;
+  uint16_t uiSrcAcc = 0, uiDiffAcc = 0;
+
+  for (int32_t iRow = 0; iRow < MB_WIDTH_LUMA; ++iRow) {
+    for (int32_t iCol = 0; iCol < MB_WIDTH_LUMA; ++iCol) {
+      const uint32_t uiAbsDiff = WELS_ABS (pRefY[iCol] - pSrcY[iCol]);
+      uiDiffAcc   += uiAbsDiff;
+      uiDiffSqAcc += uiAbsDiff * uiAbsDiff;
+      uiSrcAcc    += pSrcY[iCol];
+      uiSrcSqAcc  += pSrcY[iCol] * pSrcY[iCol];
+    }
+    pRefY += iRefStride;
+    pSrcY += iSrcStride;
+  }
+  const uint16_t uiDiffMean = uiDiffAcc >> 8;
+  const uint16_t uiSrcMean  = uiSrcAcc >> 8;
+  pMotionTexture->uiMotionIndex  = (uiDiffSqAcc >> 8) - (uiDiffMean * uiDiffMean);
+  pMotionTexture->uiTextureIndex = (uiSrcSqAcc >> 8) - (uiSrcMean * uiSrcMean);
+}
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.h
@@ -1,0 +1,85 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	        :  AdaptiveQuantization.h
+ *
+ * \brief	    :  adaptive quantization class of wels video processor class
+ *
+ * \date         :  2011/03/21
+ *
+ * \description  :  1. rewrite the package code of scene change detection class
+ *
+ */
+
+#ifndef WELSVP_ADAPTIVEQUANTIZATION_H
+#define WELSVP_ADAPTIVEQUANTIZATION_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+typedef void (VarFunc) (uint8_t* pRefY, int32_t iRefStrideY, uint8_t* pSrc, int32_t iSrcStrideY,
+                        SMotionTextureUnit* pMotionTexture);  // function type shared by the variance routines below
+// Pointer to a VarFunc; CAdaptiveQuantization keeps one as m_pfVar.
+typedef VarFunc*   PVarFunc;
+// Plain C routine; it is what WelsInitVarFunc installs by default.
+VarFunc      SampleVariance16x16_c;
+// SSE2-named variant, declared only when X86_ASM is defined, inside the project's extern-C macros.
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+VarFunc      SampleVariance16x16_sse2;
+WELSVP_EXTERN_C_END
+#endif
+
+
+class CAdaptiveQuantization : public IStrategy {
+ public:
+  CAdaptiveQuantization (int32_t iCpuFlag);
+  ~CAdaptiveQuantization();
+  // All three entry points report their outcome as an EResult.
+  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef);
+  EResult Set (int32_t iType, void* pParam);  // pParam is read as SAdaptiveQuantizationParam*; NULL is rejected
+  EResult Get (int32_t iType, void* pParam);  // pParam is written as SAdaptiveQuantizationParam*; NULL is rejected
+
+ private:
+  void WelsInitVarFunc (PVarFunc& pfVar, int32_t iCpuFlag);  // stores the chosen variance routine into pfVar
+
+ private:
+  PVarFunc			                   m_pfVar;  // variance function pointer (see PVarFunc)
+  int32_t                                  m_CPUFlag;  // presumably the iCpuFlag given to the constructor - confirm
+  SAdaptiveQuantizationParam    m_sAdaptiveQuantParam;  // parameter block copied in by Set and read by Get
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/asm/denoisefilter.asm
@@ -1,0 +1,279 @@
+;*!
+;* \copy
+;*     Copyright (c)  2010-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  denoisefilter.asm
+;*
+;*  Abstract
+;*      denoise for SVC2.1
+;*  History
+;*      4/13/2010 Created
+;*      7/30/2010 Modified
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Constant
+;***********************************************************************
+SECTION .rodata align=16
+; Each constant is eight 16-bit words (16 bytes) and is used as a full 128-bit memory operand.
+sse2_32 times 8 dw 32		; loaded into xmm3 by BilateralLumaFilter8_sse2
+sse2_20 times 8 dw 20		; multiplier of the centre tap in WEIGHT_LINE3_UV
+
+
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+%macro	WEIGHT_LINE	9
+		movq		%2,	%9				; arg 9: memory operand, 8 neighbour bytes
+		punpcklbw	%2,	%7				; widen to 8 words (callers pass a zeroed register as arg 7)
+		movdqa		%8,	%2				; arg 8: scratch copy of the neighbours
+; arg 6 holds the centre pixels as words; the two saturating subtractions OR-ed together give |a - b|
+		movdqa		%1,	%6
+		psubusb		%1,	%8
+		psubusb		%8,	%6
+		por			%8,	%1		; ABS(curPixel - centerPixel);
+; arg 1 = arg 3 - diff, saturating at 0 (callers pass the sse2_32 constant in arg 3)
+		movdqa		%1,	%3
+		psubusb		%1,	%8
+; weight = (arg 1 * arg 1) >> 5; arg 4 accumulates the weights, arg 5 accumulates pixel * weight
+		pmullw		%1,	%1
+		psrlw		%1,	5
+		pmullw		%2,	%1
+		paddusw		%4,	%1
+		paddusw		%5,	%2
+%endmacro
+
+%macro	WEIGHT_LINE1_UV	4
+		movdqa		%2,	%1				; arg 1: 16 source bytes of one row, arg 2: scratch
+		punpcklbw	%2,	%4				; arg 4: zeroed register, widens the low 8 bytes to words
+		paddw		%3,	%2				; arg 3: word accumulator; tap at byte offset 0, weight 1
+; tap at byte offset 1, weight 1
+		movdqa		%2,	%1
+		psrldq		%2,	1
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+; tap at byte offset 2, weight 2
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+; tap at byte offset 3, weight 1
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+; tap at byte offset 4, weight 1
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+%endmacro
+
+%macro	WEIGHT_LINE2_UV	4
+		movdqa		%2,	%1				; arg 1: 16 source bytes of one row, arg 2: scratch
+		punpcklbw	%2,	%4				; arg 4: zeroed register, widens the low 8 bytes to words
+		paddw		%3,	%2				; arg 3: word accumulator; tap at byte offset 0, weight 1
+; tap at byte offset 1, weight 2
+		movdqa		%2,	%1
+		psrldq		%2,	1
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+; tap at byte offset 2, weight 4
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		psllw		%2,	2
+		paddw		%3,	%2
+; tap at byte offset 3, weight 2
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+; tap at byte offset 4, weight 1
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+%endmacro
+
+%macro	WEIGHT_LINE3_UV	4
+		movdqa		%2,	%1				; arg 1: 16 source bytes of one row, arg 2: scratch
+		punpcklbw	%2,	%4				; arg 4: zeroed register, widens the low 8 bytes to words
+		psllw		%2,	1				; tap at byte offset 0, weight 2
+		paddw		%3,	%2				; arg 3: word accumulator
+; tap at byte offset 1, weight 4
+		movdqa		%2,	%1
+		psrldq		%2,	1
+		punpcklbw	%2,	%4
+		psllw		%2,	2
+		paddw		%3,	%2
+; tap at byte offset 2 (the centre column), weight 20
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		pmullw		%2,	[sse2_20]
+		paddw		%3,	%2
+; tap at byte offset 3, weight 4
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		psllw		%2,	2
+		paddw		%3,	%2
+; tap at byte offset 4, weight 2
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+%endmacro
+
+ALIGN 16
+WELS_EXTERN BilateralLumaFilter8_sse2
+;***********************************************************************
+;  BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+;	1	2	3
+;	4	0	5
+;	6	7	8
+;	0:	the center point; 1..8 are its neighbours (processed below in the order 4,5,1,2,3,6,7,8)
+%define		pushsize	4
+;%define		pixel		esp + pushsize + 4
+;%define		stride		esp + pushsize + 8
+;%define         pixel  r0
+;%define         stride r1
+; Filters the 8 pixels at [pixels] in place: out = (sum(w_i * p_i) + (256 - sum(w_i)) * center) >> 8
+BilateralLumaFilter8_sse2:
+; NOTE(review): xmm0-xmm7 are overwritten and none is restored here; confirm that suits every target ABI
+        push r3 			; r3 is used below to keep the original pixels pointer
+        %assign push_num 1
+        LOAD_2_PARA			; macro defined outside this file; presumably leaves pixels in r0 and stride in r1 - confirm
+
+		pxor		xmm7,	xmm7		; xmm7 = 0, used to widen bytes to words
+	
+		mov         r3,     r0		; remember where the result has to be stored
+		
+		movq        xmm6,   [r0]		; the 8 center pixels
+		punpcklbw	xmm6,	xmm7		; ... as 8 words
+		movdqa		xmm3,	[sse2_32]	; eight words of 32
+		pxor		xmm4,	xmm4		; nTotWeight
+		pxor		xmm5,	xmm5		; nSum
+
+        dec         r0				; r0 = pixels - 1 (left neighbour column)
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]			; pixel 4
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]		; pixel 5
+
+		sub			r0,	r1				; one row up
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]			; pixel 1
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 1]		; pixel 2
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]		; pixel 3
+
+		lea			r0,	[r0 + r1 * 2]	; two rows down: the row below the center
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]			; pixel 6
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 1]		; pixel 7
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]		; pixel 8
+
+		pcmpeqw		xmm0,	xmm0		; all bits set
+		psrlw		xmm0,	15			; every word = 1
+		psllw		xmm0,	8			; every word = 256
+		psubusw		xmm0,	xmm4		; 256 - nTotWeight (saturating): weight left for the center pixel
+		pmullw		xmm0,	xmm6		; center * remaining weight
+		paddusw		xmm5,	xmm0		; add to nSum
+		psrlw		xmm5,	8			; divide by 256
+		packuswb	xmm5,	xmm5		; back to bytes
+		movq		[r3],	xmm5		; overwrite the 8 input pixels
+
+       
+		pop r3
+		%assign push_num 0
+		
+		ret
+
+WELS_EXTERN WaverageChromaFilter8_sse2
+;***********************************************************************
+; void		WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+;5x5 filter:
+;1	1	2	1	1
+;1	2	4	2	1
+;2	4	20	4	2
+;1	2	4	2	1
+;1	1	2	1	1
+; (the 25 weights add up to 64, hence the final shift right by 6)
+ALIGN 16
+WaverageChromaFilter8_sse2:
+; Filters the 8 pixels at [pixels] in place; reads 16 bytes from each of the 5 rows around them
+        push r3
+       
+        %assign push_num 1
+        
+        LOAD_2_PARA			; macro defined outside this file; presumably leaves pixels in r0 and stride in r1 - confirm
+        
+        mov		r3,	r1
+		add		r3,	r3			; r3 = 2 * stride
+		sub		r0,	r3			; pixels - 2 * stride
+		sub		r0,	2			; ... and 2 columns to the left: top-left corner of the 5x5 window
+
+		pxor	xmm0,	xmm0	; zero, used to widen bytes to words
+		pxor	xmm3,	xmm3	; weighted sum accumulator (8 words)
+
+		movdqu		xmm1,	[r0]			; row -2
+		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		movdqu		xmm1,	[r0 + r1]		; row -1
+		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		add		r0,	r3					; r0 = pixels - 2 (center row)
+		movdqu		xmm1,	[r0]			; row 0
+		WEIGHT_LINE3_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		movdqu		xmm1,	[r0 + r1]		; row +1
+		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		movdqu		xmm1,	[r0 + r1 * 2]	; row +2
+		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		psrlw		xmm3,		6			; divide the weighted sum by 64
+		packuswb	xmm3,		xmm3		; back to bytes
+		movq		[r0 + 2],		xmm3	; r0 + 2 is the original pixels pointer
+
+              
+        pop r3
+        
+        %assign push_num 0
+		ret
--- /dev/null
+++ b/codec/processing/src/asm/downsample_bilinear.asm
@@ -1,0 +1,1225 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	downsample_bilinear.asm
+;*
+;*  Abstract
+;*		SIMD for pixel domain down sampling
+;*
+;*  History
+;*		10/22/2009	Created
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+%ifdef X86_32
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+
+;***********************************************************************
+; Some constants
+;***********************************************************************
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+ALIGN 16
+shufb_mask_low:		; pshufb control: even source bytes go to the low byte of each word, 80h entries produce 0
+	db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
+shufb_mask_high:	; pshufb control: odd source bytes go to the low byte of each word, 80h entries produce 0
+	db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
+
+
+ALIGN 16
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx32_sse(	unsigned char* pDst, const int iDstStride,
+;					unsigned char* pSrc, const int iSrcStride,
+;					const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_sse:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; five pushes (20 bytes) + return address (4 bytes): the first argument is at [esp+24]
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1			; iSrcHeight >> 1
+	; ebp = number of destination rows; every .yloops pass reads two source rows
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1			; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth restored at ebx
+	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
+	neg ebx				; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 32 bytes
+.xloops:
+	; 1st part horizontal loop: x16 bytes
+	;               mem  hi<-       ->lo
+	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
+	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movq mm0, [esi]			; 1st pSrc line
+	movq mm1, [esi+8]		; 1st pSrc line + 8
+	movq mm2, [esi+ecx]		; 2nd pSrc line
+	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
+
+	; to handle mm0, mm1, mm2, mm3
+	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm4, mm5		; d c D C b a B A
+	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
+
+	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm5, mm6		; h g H G f e F E
+	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
+
+	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
+	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
+	punpcklbw mm6, mm7		; l k L K j i J I
+	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
+
+	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
+	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
+	punpcklbw mm7, mm0 		; p o P O n m N M
+	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
+
+	; to handle mm4, mm5, mm6, mm7
+	movq mm0, mm4		;
+	punpckldq mm0, mm5 	; H G F E D C B A
+	punpckhdq mm4, mm5 	; h g f e d c b a
+
+	movq mm1, mm6
+	punpckldq mm1, mm7 	; P O N M L K J I
+	punpckhdq mm6, mm7 	; p o n m l k j i
+
+	; avg within MB horizon width (16 x 2 lines)
+	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here and wait another horizontal part done then write memory once
+
+	; 2nd part horizontal loop: x16 bytes
+	;               mem  hi<-       ->lo
+	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
+	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movq mm1, [esi+16]		; 1st pSrc line + 16
+	movq mm2, [esi+24]		; 1st pSrc line + 24
+	movq mm3, [esi+ecx+16]	; 2nd pSrc line + 16
+	movq mm4, [esi+ecx+24]	; 2nd pSrc line + 24
+
+	; to handle mm1, mm2, mm3, mm4
+	pshufw mm5, mm1, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm6, mm5, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm5, mm6		; d c D C b a B A
+	pshufw mm5, mm5, 0d8h  	; d c b a D C B A ; 11011000 B: mm5
+
+	pshufw mm6, mm2, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm7, mm6, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm6, mm7		; h g H G f e F E
+	pshufw mm6, mm6, 0d8h  	; h g f e H G F E ; 11011000 B: mm6
+
+	pshufw mm7, mm3, 0d8h	; l L j J k K i I ; 11011000 B
+	pshufw mm1, mm7, 04eh	; k K i I l L j J ; 01001110 B
+	punpcklbw mm7, mm1		; l k L K j i J I
+	pshufw mm7, mm7, 0d8h  	; l k j i L K J I ; 11011000 B: mm7
+
+	pshufw mm1, mm4, 0d8h	; p P n N o O m M ; 11011000 B
+	pshufw mm2, mm1, 04eh	; o O m M p P n N ; 01001110 B
+	punpcklbw mm1, mm2 		; p o P O n m N M
+	pshufw mm1, mm1, 0d8h  	; p o n m P O N M ; 11011000 B: mm1
+
+	; to handle mm5, mm6, mm7, mm1
+	movq mm2, mm5
+	punpckldq mm2, mm6 	; H G F E D C B A
+	punpckhdq mm5, mm6 	; h g f e d c b a
+
+	movq mm3, mm7
+	punpckldq mm3, mm1 	; P O N M L K J I
+	punpckhdq mm7, mm1 	; p o n m l k j i
+
+	; avg within MB horizon width (16 x 2 lines)
+	pavgb mm2, mm5		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+	pavgb mm3, mm7		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm2, mm3		; (temp_row1+temp_row2+1)>>1, done in another 2nd horizontal part
+
+	movq [edi  ], mm0
+	movq [edi+8], mm2
+
+	; next SMB
+	lea esi, [esi+32]
+	lea edi, [edi+16]
+
+	dec eax
+	jg near .xloops			; repeat while eax > 0 (the body always runs at least once)
+	; the rewind below uses iSrcWidth >> 1, so it only matches the distance advanced when iSrcWidth is a multiple of 32
+	; next line
+	lea esi, [esi+2*ecx]	; next end of lines
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops			; repeat while ebp > 0 (the body always runs at least once)
+
+	WELSEMMS				; macro defined outside this file; presumably clears the MMX state - confirm
+	pop ebp
+	pop	edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride,
+;					  const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_sse:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; five pushes (20 bytes) + return address (4 bytes): the first argument is at [esp+24]
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1		; iSrcHeight >> 1
+	; ebp = number of destination rows; every .yloops pass reads two source rows
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1		; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth restored at ebx
+	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
+	neg ebx			; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 16 bytes
+.xloops:
+	; 1st part horizontal loop: x16 bytes
+	;               mem  hi<-       ->lo
+	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
+	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movq mm0, [esi]			; 1st pSrc line
+	movq mm1, [esi+8]		; 1st pSrc line + 8
+	movq mm2, [esi+ecx]		; 2nd pSrc line
+	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
+
+	; to handle mm0, mm1, mm2, mm3
+	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm4, mm5		; d c D C b a B A
+	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
+
+	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm5, mm6		; h g H G f e F E
+	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
+
+	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
+	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
+	punpcklbw mm6, mm7		; l k L K j i J I
+	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
+
+	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
+	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
+	punpcklbw mm7, mm0 		; p o P O n m N M
+	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
+
+	; to handle mm4, mm5, mm6, mm7
+	movq mm0, mm4		;
+	punpckldq mm0, mm5 	; H G F E D C B A
+	punpckhdq mm4, mm5 	; h g f e d c b a
+
+	movq mm1, mm6
+	punpckldq mm1, mm7 	; P O N M L K J I
+	punpckhdq mm6, mm7 	; p o n m l k j i
+
+	; avg within MB horizon width (16 x 2 lines)
+	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, the 8 output pixels of this iteration
+
+	movq [edi  ], mm0
+
+	; next SMB
+	lea esi, [esi+16]
+	lea edi, [edi+8]
+
+	dec eax
+	jg near .xloops			; repeat while eax > 0 (the body always runs at least once)
+	; the rewind below uses iSrcWidth >> 1, so it only matches the distance advanced when iSrcWidth is a multiple of 16
+	; next line
+	lea esi, [esi+2*ecx]	; next end of lines
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops			; repeat while ebp > 0 (the body always runs at least once)
+
+	WELSEMMS				; macro defined outside this file; presumably clears the MMX state - confirm
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride,
+;					  const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx8_sse:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; five pushes (20 bytes) + return address (4 bytes): the first argument is at [esp+24]
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1		; iSrcHeight >> 1
+	; ebp = number of destination rows; every .yloops pass reads two source rows
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1		; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth restored at ebx
+	sar eax, $2		; (iSrcWidth >> 1) / 4		; loop count = num_of_mb
+	neg ebx			; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 8 bytes
+.xloops:
+	; 1st part horizontal loop: x8 bytes
+	;               mem  hi<-       ->lo
+	;1st Line Src:	mm0: d D c C b B a A
+	;2nd Line Src:	mm1: h H g G f F e E
+	;=> target:
+	;: H G F E D C B A
+	;: h g f e d c b a
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movq mm0, [esi]			; 1st pSrc line
+	movq mm1, [esi+ecx]		; 2nd pSrc line
+
+	; to handle mm0, mm1 (results go to mm2 and mm4)
+	pshufw mm2, mm0, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm3, mm2, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm2, mm3		; d c D C b a B A
+	pshufw mm2, mm2, 0d8h  	; d c b a D C B A ; 11011000 B: mm2
+
+	pshufw mm4, mm1, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm5, mm4, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm4, mm5		; h g H G f e F E
+	pshufw mm4, mm4, 0d8h  	; h g f e H G F E ; 11011000 B: mm4
+
+	; to handle mm2, mm4
+	movq mm0, mm2		;
+	punpckldq mm0, mm4 	; H G F E D C B A
+	punpckhdq mm2, mm4 	; h g f e d c b a
+
+	; avg within MB horizon width (16 x 2 lines)
+	pavgb mm0, mm2		; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
+	pshufw mm1, mm0, 04eh	; 01001110 B: swap the two dwords so both rows line up
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1; only the low 4 bytes are stored below
+
+	movd [edi],	mm0
+
+	; next unit
+	lea esi, [esi+8]
+	lea edi, [edi+4]
+
+	dec eax
+	jg near .xloops			; repeat while eax > 0 (the body always runs at least once)
+	; the rewind below uses iSrcWidth >> 1, so it only matches the distance advanced when iSrcWidth is a multiple of 8
+	; next line
+	lea esi, [esi+2*ecx]	; next end of lines
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops			; repeat while ebp > 0 (the body always runs at least once)
+
+	WELSEMMS				; macro defined outside this file; presumably clears the MMX state - confirm
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+
+
+; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx32_ssse3(	unsigned char* pDst, const int iDstStride,
+;					unsigned char* pSrc, const int iSrcStride,
+;					const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_ssse3:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; five pushes (20 bytes) + return address (4 bytes): the first argument is at [esp+24]
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1			; iSrcHeight >> 1
+	; ebp = number of destination rows; every .yloops pass reads two source rows
+	movdqa xmm7, [shufb_mask_low]	; mask low
+	movdqa xmm6, [shufb_mask_high]	; mask high
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1			; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth restored at ebx
+	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
+	neg ebx				; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 32 bytes
+.xloops:
+	; 1st part horizontal loop: x16 bytes
+	;               mem  hi<-       ->lo
+	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
+	;				xmm1: p P o O n N m M l L k K j J i I
+	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
+	;				xmm3: p P o O n N m M l L k K j J i I
+	;=> target:
+	;: P O N M L K J I H G F E D C B A
+	;: p o n m l k j i h g f e d c b a
+	;: P ..                          A
+	;: p ..                          a
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movdqa xmm0, [esi]			; 1st_src_line (movdqa: the address must be 16-byte aligned)
+	movdqa xmm1, [esi+16]		; 1st_src_line + 16
+	movdqa xmm2, [esi+ecx]		; 2nd_src_line
+	movdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16
+
+	; packing & avg
+	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	; another implementation for xmm4 high bits
+;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm4			; horizontal average of each pixel pair, kept in the low byte of each word
+
+	movdqa xmm5, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm1
+;	psrlw xmm5, 8
+	pavgb xmm1, xmm5
+
+	movdqa xmm4, xmm2
+	pshufb xmm2, xmm7
+	pshufb xmm4, xmm6
+;	psubb xmm4, xmm2
+;	psrlw xmm4, 8
+	pavgb xmm2, xmm4
+
+	movdqa xmm5, xmm3
+	pshufb xmm3, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm3
+;	psrlw xmm5, 8
+	pavgb xmm3, xmm5
+
+	packuswb xmm0, xmm1			; 16 horizontal averages of the 1st line
+	packuswb xmm2, xmm3			; 16 horizontal averages of the 2nd line
+	pavgb xmm0, xmm2			; vertical average: the 16 output pixels
+
+	; write pDst
+	movdqa [edi], xmm0			; aligned store: edi must be 16-byte aligned
+
+	; next SMB
+	lea esi, [esi+32]
+	lea edi, [edi+16]
+
+	dec eax
+	jg near .xloops			; repeat while eax > 0 (the body always runs at least once)
+	; the rewind below uses iSrcWidth >> 1, so it only matches the distance advanced when iSrcWidth is a multiple of 32
+	; next line
+	lea esi, [esi+2*ecx]	; next end of lines
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops			; repeat while ebp > 0 (the body always runs at least once)
+	; no MMX register is used in this routine, only xmm0-xmm7
+	pop ebp
+	pop	edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride,
+;					  const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_ssse3:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; five pushes (20 bytes) + return address (4 bytes): the first argument is at [esp+24]
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1		; iSrcHeight >> 1
+	movdqa xmm7, [shufb_mask_low]	; mask low
+	movdqa xmm6, [shufb_mask_high]	; mask high
+	; ebp = number of destination rows; every .yloops pass reads two source rows
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1		; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth restored at ebx
+	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
+	neg ebx			; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 16 bytes
+.xloops:
+	; horizontal loop: x16 bytes by source
+	;               mem  hi<-       ->lo
+	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
+	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movdqa xmm0, [esi]			; 1st_src_line (movdqa: the address must be 16-byte aligned)
+	movdqa xmm1, [esi+ecx]		; 2nd_src_line
+
+	; packing & avg
+	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	; another implementation for xmm2 high bits
+;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm2			; horizontal average of each pixel pair, kept in the low byte of each word
+
+	movdqa xmm3, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm3, xmm6
+;	psubb xmm3, xmm1
+;	psrlw xmm3, 8
+	pavgb xmm1, xmm3
+
+	pavgb xmm0, xmm1			; vertical average of the two lines
+	packuswb xmm0, xmm1			; low 8 bytes = the 8 output pixels; the high half (from xmm1) is not stored
+
+	; write pDst
+	movq [edi], xmm0
+
+	; next SMB
+	lea esi, [esi+16]
+	lea edi, [edi+8]
+
+	dec eax
+	jg near .xloops			; repeat while eax > 0 (the body always runs at least once)
+	; the rewind below uses iSrcWidth >> 1, so it only matches the distance advanced when iSrcWidth is a multiple of 16
+	; next line
+	lea esi, [esi+2*ecx]	; next end of lines
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops			; repeat while ebp > 0 (the body always runs at least once)
+	; no MMX register is used in this routine, only xmm0-xmm3, xmm6 and xmm7
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx32_sse4(	unsigned char* pDst, const int iDstStride,
+;					unsigned char* pSrc, const int iSrcStride,
+;					const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_sse4:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; five pushes (20 bytes) + return address (4 bytes): the first argument is at [esp+24]
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1			; iSrcHeight >> 1
+	; ebp = number of destination rows; every .yloops pass reads two source rows
+	movdqa xmm7, [shufb_mask_low]	; mask low
+	movdqa xmm6, [shufb_mask_high]	; mask high
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1			; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth restored at ebx
+	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
+	neg ebx				; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 32 bytes
+.xloops:
+	; 1st part horizontal loop: x16 bytes
+	;               mem  hi<-       ->lo
+	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
+	;				xmm1: p P o O n N m M l L k K j J i I
+	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
+	;				xmm3: p P o O n N m M l L k K j J i I
+	;=> target:
+	;: P O N M L K J I H G F E D C B A
+	;: p o n m l k j i h g f e d c b a
+	;: P ..                          A
+	;: p ..                          a
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movntdqa xmm0, [esi]			; 1st_src_line (movntdqa: the address must be 16-byte aligned)
+	movntdqa xmm1, [esi+16]		; 1st_src_line + 16
+	movntdqa xmm2, [esi+ecx]		; 2nd_src_line
+	movntdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16
+
+	; packing & avg
+	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm4			; horizontal average of each pixel pair, kept in the low byte of each word
+
+	movdqa xmm5, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm1
+;	psrlw xmm5, 8
+	pavgb xmm1, xmm5
+
+	movdqa xmm4, xmm2
+	pshufb xmm2, xmm7
+	pshufb xmm4, xmm6
+;	psubb xmm4, xmm2
+;	psrlw xmm4, 8
+	pavgb xmm2, xmm4
+
+	movdqa xmm5, xmm3
+	pshufb xmm3, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm3
+;	psrlw xmm5, 8
+	pavgb xmm3, xmm5
+
+	packuswb xmm0, xmm1			; 16 horizontal averages of the 1st line
+	packuswb xmm2, xmm3			; 16 horizontal averages of the 2nd line
+	pavgb xmm0, xmm2			; vertical average: the 16 output pixels
+
+	; write pDst
+	movdqa [edi], xmm0			; aligned store: edi must be 16-byte aligned
+
+	; next SMB
+	lea esi, [esi+32]
+	lea edi, [edi+16]
+
+	dec eax
+	jg near .xloops			; repeat while eax > 0 (the body always runs at least once)
+	; the rewind below uses iSrcWidth >> 1, so it only matches the distance advanced when iSrcWidth is a multiple of 32
+	; next line
+	lea esi, [esi+2*ecx]	; next end of lines
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops			; repeat while ebp > 0 (the body always runs at least once)
+	; no MMX register is used in this routine, only xmm0-xmm7
+	pop ebp
+	pop	edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride,
+;					  const int iSrcWidth, const int iSrcHeight );
+;	2:1 downsample in x and y: each pDst byte is the pavgb-rounded average of a 2x2 pSrc block (16 source bytes per x iteration).
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_sse4:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+
+	mov edi, [esp+24]	; pDst (first argument sits above 5 pushed registers + return address)
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+
+	sar ebp, $1		; iSrcHeight >> 1 = number of destination rows
+	movdqa xmm7, [shufb_mask_low]	; mask low: pshufb control, presumably keeps the even bytes as words (see lane comments below)
+	movdqa xmm6, [shufb_mask_high]	; mask high: pshufb control, presumably keeps the odd bytes as words (see lane comments below)
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1		; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth kept in ebx
+	sar eax, $3		; (iSrcWidth >> 1) / 8		; x loop count, 8 destination bytes per iteration
+	neg ebx			; - (iSrcWidth >> 1), used below to rewind esi/edi
+	; each loop = source bandwidth: 16 bytes
+.xloops:
+	; horizontal loop: x16 bytes by source
+	;               mem  hi<-       ->lo
+	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
+	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movntdqa xmm0, [esi]			; 1st_src_line (movntdqa needs a 16-byte aligned address)
+	movntdqa xmm1, [esi+ecx]		; 2nd_src_line
+
+	; packing & avg
+	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm2			; horizontal average of each byte pair, 1st line
+
+	movdqa xmm3, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm3, xmm6
+;	psubb xmm3, xmm1
+;	psrlw xmm3, 8
+	pavgb xmm1, xmm3			; horizontal average of each byte pair, 2nd line
+
+	pavgb xmm0, xmm1			; vertical average of the two horizontally averaged lines
+	packuswb xmm0, xmm1			; words -> bytes; the low 8 bytes hold the 8 results
+
+	; write pDst
+	movq [edi], xmm0			; store 8 destination pixels
+
+	; next SMB
+	lea esi, [esi+16]
+	lea edi, [edi+8]
+
+	dec eax
+	jg near .xloops
+
+	; next line
+	lea esi, [esi+2*ecx]	; advance two source lines
+	lea esi, [esi+2*ebx]	; rewind by 2 * iDstWidth (what the x loop consumed when iDstWidth is a multiple of 8)
+	lea edi, [edi+edx]		; advance one destination line
+	lea edi, [edi+ebx]		; rewind by iDstWidth (what the x loop wrote when iDstWidth is a multiple of 8)
+
+	dec ebp
+	jg near .yloops
+
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+
+
+
+
+WELS_EXTERN	GeneralBilinearAccurateDownsampler_sse2
+;**************************************************************************************************************
+;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+;							unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+;                           unsigned int uiScaleX, unsigned int uiScaleY );
+;	Bilinear resample of pSrc into pDst; source x/y positions advance by uiScaleX/uiScaleY in 1/32768 steps (15 fractional bits).
+;**************************************************************************************************************
+;	The last destination column and the last destination row are copied from the nearest source pixel, not interpolated.
+ALIGN 16
+GeneralBilinearAccurateDownsampler_sse2:
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+%define		pushsize	16
+%define		localsize	28
+%define		pDstData		esp + pushsize + localsize + 4
+%define		dwDstStride		esp + pushsize + localsize + 8
+%define		dwDstWidth		esp + pushsize + localsize + 12
+%define		dwDstHeight		esp + pushsize + localsize + 16
+%define		pSrcData		esp + pushsize + localsize + 20
+%define		dwSrcStride		esp + pushsize + localsize + 24
+%define		dwSrcWidth		esp + pushsize + localsize + 28
+%define		dwSrcHeight		esp + pushsize + localsize + 32
+%define		scale			esp + 0
+%define		uiScaleX			esp + pushsize + localsize + 36
+%define		uiScaleY			esp + pushsize + localsize + 40
+%define		tmpHeight		esp + 12
+%define		yInverse		esp + 16
+%define		xInverse		esp + 20
+%define		dstStep			esp + 24
+	sub		esp,			localsize
+
+	pxor	xmm0,	xmm0
+	mov		edx,	32767					; NOTE(review): edx is overwritten below before it is read
+	mov		eax,	[uiScaleX]
+	and		eax,	32767
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	32767
+	movd	xmm1,		eax						; uinc = uiScaleX & 32767 (15-bit fraction)
+	movd	xmm2,		ebx						; -uinc (mod 32768)
+	psllq	xmm1,		32
+	por		xmm1,		xmm2					; 0 0  uinc  -uinc   (dword)
+	pshufd	xmm7,		xmm1,	01000100b		; xmm7: uinc -uinc uinc -uinc
+
+	mov		eax,	[uiScaleY]
+	and		eax,	32767
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	32767
+	movd	xmm6,		eax						; vinc = uiScaleY & 32767 (15-bit fraction)
+	movd	xmm2,		ebx						; -vinc (mod 32768)
+	psllq	xmm6,		32
+	por		xmm6,		xmm2					; 0 0 vinc -vinc (dword)
+	pshufd	xmm6,		xmm6,	01010000b		; xmm6: vinc vinc -vinc -vinc
+
+	mov		edx,		40003fffh
+	movd	xmm5,		edx
+	punpcklwd	xmm5,	xmm0					; 16384 16383
+	pshufd	xmm5,		xmm5,	01000100b		; xmm5: 16384 16383 16384 16383
+
+
+DOWNSAMPLE:
+
+	mov		eax,			[dwDstHeight]
+	mov		edi,			[pDstData]
+	mov		edx,			[dwDstStride]
+	mov		ecx,			[dwDstWidth]
+	sub		edx,			ecx
+	mov		[dstStep],	edx				; stride - width
+	dec		eax
+	mov		[tmpHeight],	eax				; dwDstHeight - 1 interpolated rows; the last row is copied at LAST_ROW
+	mov		eax,			16384
+	mov		[yInverse],		eax				; y position starts at 0.5 in 1/32768 units
+
+	pshufd	xmm4,		xmm5,	01010000b	; initial v to 16384 16384 16383 16383
+
+HEIGHT:
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15						; integer source row
+	mul		dword [dwSrcStride]				; eax = row * stride (mul also writes edx)
+	add		esi,	eax					; get current row address
+	mov		ebp,	esi
+	add		ebp,	[dwSrcStride]			; ebp = source row below
+
+	mov		eax,		16384
+	mov		[xInverse],		eax
+	mov		ecx,			[dwDstWidth]
+	dec		ecx								; NOTE(review): if dwDstWidth is 1, ecx is 0 and `loop` below would wrap; callers presumably pass width >= 2
+
+	movdqa	xmm3,		xmm5			; initial u to 16384 16383 16384 16383
+
+WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		15					; integer source column
+
+	movd	xmm1,		[esi+eax]		; xxxxxxba
+	movd	xmm2,		[ebp+eax]		; xxxxxxdc
+	pxor	xmm0,		xmm0
+	punpcklwd	xmm1,	xmm2			; xxxxdcba
+	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
+	punpcklwd	xmm1,	xmm0			; 000d000c000b000a
+
+	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
+	pmaddwd	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
+	movdqa	xmm0,	xmm2
+	pmuludq	xmm2,	xmm1	; weights of dwords 0 and 2 times pixels a and c (64-bit products)
+	psrlq	xmm0,	32
+	psrlq	xmm1,	32
+	pmuludq	xmm0,	xmm1	; weights of dwords 1 and 3 times pixels b and d
+	paddq	xmm2,	xmm0
+	pshufd	xmm1,	xmm2,	00001110b	; bring the high qword down
+	paddq	xmm2,	xmm1	; low qword = sum of the four weighted pixels
+	psrlq	xmm2,	29
+
+	movd	eax,	xmm2
+	inc		eax
+	shr		eax,	1		; together with the psrlq above: (sum >> 30) with rounding
+	mov		[edi],	al
+	inc		edi
+
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax
+
+	paddw	xmm3,		xmm7			; inc u
+	psllw	xmm3,		1
+	psrlw	xmm3,		1				; clear bit 15 of each word, i.e. keep u modulo 32768
+
+	loop	WIDTH
+
+WIDTH_END:
+	mov		eax,		[xInverse]
+	shr		eax,		15
+	mov		cl,			[esi+eax]		; last column: copy the upper-row source pixel unfiltered
+	mov		[edi],		cl
+	inc		edi
+
+	mov		eax,		[uiScaleY]
+	add		[yInverse],	eax
+	add		edi,		[dstStep]
+
+	paddw	xmm4,	xmm6				; inc v
+	psllw	xmm4,	1
+	psrlw	xmm4,	1
+
+	dec		dword [tmpHeight]
+	jg		HEIGHT
+
+
+LAST_ROW:
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15
+	mul		dword [dwSrcStride]
+	add		esi,	eax					; get current row address
+
+	mov		eax,		16384
+	mov		[xInverse],		eax
+	mov		ecx,			[dwDstWidth]
+
+LAST_ROW_WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		15
+
+	mov		al,			[esi+eax]		; nearest source pixel, no interpolation on the last row
+	mov		[edi],	al
+	inc		edi
+
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax
+
+	loop	LAST_ROW_WIDTH
+
+LAST_ROW_END:
+
+	add		esp,			localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		pushsize
+%undef		localsize
+%undef		pSrcData
+%undef		dwSrcWidth
+%undef		dwSrcHeight
+%undef		dwSrcStride
+%undef		pDstData
+%undef		dwDstWidth
+%undef		dwDstHeight
+%undef		dwDstStride
+%undef		scale
+%undef		uiScaleX
+%undef		uiScaleY
+%undef		tmpHeight
+%undef		yInverse
+%undef		xInverse
+%undef		dstStep
+	ret		; NOTE(review): prototype says int, but eax is never set to a deliberate return value
+
+
+
+
+WELS_EXTERN	GeneralBilinearFastDownsampler_sse2
+;**************************************************************************************************************
+;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+;				unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+;               unsigned int uiScaleX, unsigned int uiScaleY );
+;	Bilinear resample using 16-bit lane arithmetic: x positions carry 16 fractional bits (shr 16), y positions 15 (shr 15).
+;**************************************************************************************************************
+;	The last destination column and the last destination row are copied from the nearest source pixel, not interpolated.
+ALIGN 16
+GeneralBilinearFastDownsampler_sse2:
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+%define		pushsize	16
+%define		localsize	28
+%define		pDstData		esp + pushsize + localsize + 4
+%define		dwDstStride		esp + pushsize + localsize + 8
+%define		dwDstWidth		esp + pushsize + localsize + 12
+%define		dwDstHeight		esp + pushsize + localsize + 16
+%define		pSrcData		esp + pushsize + localsize + 20
+%define		dwSrcStride		esp + pushsize + localsize + 24
+%define		dwSrcWidth		esp + pushsize + localsize + 28
+%define		dwSrcHeight		esp + pushsize + localsize + 32
+%define		scale			esp + 0
+%define		uiScaleX			esp + pushsize + localsize + 36
+%define		uiScaleY			esp + pushsize + localsize + 40
+%define		tmpHeight		esp + 12
+%define		yInverse		esp + 16
+%define		xInverse		esp + 20
+%define		dstStep			esp + 24
+	sub		esp,			localsize
+
+	pxor	xmm0,	xmm0					; xmm0 stays zero for the whole function (unpack/pack helper)
+	mov		edx,	65535
+	mov		eax,	[uiScaleX]
+	and		eax,	edx
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	65535
+	movd	xmm1,		eax						; uinc(uiScaleX mod 65536)
+	movd	xmm2,		ebx						; -uinc
+	psllq	xmm1,		32
+	por		xmm1,		xmm2					; 0 uinc 0 -uinc
+	pshuflw	xmm7,		xmm1,	10001000b		; xmm7: uinc -uinc uinc -uinc
+
+	mov		eax,	[uiScaleY]
+	and		eax,	32767
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	32767
+	movd	xmm6,		eax						; vinc = uiScaleY & 32767 (15-bit fraction)
+	movd	xmm2,		ebx						; -vinc (mod 32768)
+	psllq	xmm6,		32
+	por		xmm6,		xmm2					; 0 vinc 0 -vinc
+	pshuflw	xmm6,		xmm6,	10100000b		; xmm6: vinc vinc -vinc -vinc
+
+	mov		edx,		80007fffh				; 32768 32767
+	movd	xmm5,		edx
+	pshuflw	xmm5,		xmm5,		01000100b	; 32768 32767 32768 32767
+	mov		ebx,		16384					; rounding term added before the final >> 15; ebx is not modified afterwards
+
+
+FAST_DOWNSAMPLE:
+
+	mov		eax,			[dwDstHeight]
+	mov		edi,			[pDstData]
+	mov		edx,			[dwDstStride]
+	mov		ecx,			[dwDstWidth]
+	sub		edx,			ecx
+	mov		[dstStep],	edx				; stride - width
+	dec		eax
+	mov		[tmpHeight],	eax				; dwDstHeight - 1 interpolated rows; the last row is copied at FAST_LAST_ROW
+	mov		eax,		16384
+	mov		[yInverse],		eax				; y position starts at 0.5 in 1/32768 units
+
+	pshuflw	xmm4,		xmm5,	01010000b
+	psrlw	xmm4,		1				; initial v to 16384 16384 16383 16383
+
+FAST_HEIGHT:
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15						; integer source row
+	mul		dword [dwSrcStride]				; eax = row * stride (mul also writes edx)
+	add		esi,	eax					; get current row address
+	mov		ebp,	esi
+	add		ebp,	[dwSrcStride]			; ebp = source row below
+
+	mov		eax,		32768
+	mov		[xInverse],		eax				; x position starts at 0.5 in 1/65536 units
+	mov		ecx,			[dwDstWidth]
+	dec		ecx								; NOTE(review): if dwDstWidth is 1, ecx is 0 and `loop` below would wrap; callers presumably pass width >= 2
+
+	movdqa	xmm3,		xmm5			; initial u to 32768 32767 32768 32767
+
+FAST_WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		16					; integer source column
+
+	movd	xmm1,		[esi+eax]		; xxxxxxba
+	movd	xmm2,		[ebp+eax]		; xxxxxxdc
+	punpcklwd	xmm1,	xmm2			; xxxxdcba
+	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
+
+	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
+	pmulhuw	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
+	pmaddwd		xmm2,	xmm1	; dword0 = weighted a + b, dword1 = weighted c + d
+	pshufd	xmm1,	xmm2,	00000001b	; bring dword1 down to dword0
+	paddd	xmm2,	xmm1	; low dword = sum of the four weighted pixels
+	movd	xmm1,	ebx
+	paddd	xmm2,	xmm1	; + 16384 for rounding
+	psrld	xmm2,	15
+
+	packuswb	xmm2,	xmm0	; saturate to an unsigned byte
+	movd	eax,	xmm2
+	mov		[edi],	al
+	inc		edi
+
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax
+
+	paddw	xmm3,		xmm7			; inc u
+
+	loop	FAST_WIDTH
+
+FAST_WIDTH_END:
+	mov		eax,		[xInverse]
+	shr		eax,		16
+	mov		cl,			[esi+eax]		; last column: copy the upper-row source pixel unfiltered
+	mov		[edi],		cl
+	inc		edi
+
+	mov		eax,		[uiScaleY]
+	add		[yInverse],	eax
+	add		edi,		[dstStep]
+
+	paddw	xmm4,	xmm6				; inc v
+	psllw	xmm4,	1
+	psrlw	xmm4,	1					; clear bit 15 of each word, i.e. keep v modulo 32768
+
+	dec		dword [tmpHeight]
+	jg		FAST_HEIGHT
+
+
+FAST_LAST_ROW:
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15
+	mul		dword [dwSrcStride]
+	add		esi,	eax					; get current row address
+
+	mov		eax,		32768
+	mov		[xInverse],		eax
+	mov		ecx,			[dwDstWidth]
+
+FAST_LAST_ROW_WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		16
+
+	mov		al,			[esi+eax]		; nearest source pixel, no interpolation on the last row
+	mov		[edi],	al
+	inc		edi
+
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax
+
+	loop	FAST_LAST_ROW_WIDTH
+
+FAST_LAST_ROW_END:
+
+	add		esp,			localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		pushsize
+%undef		localsize
+%undef		pSrcData
+%undef		dwSrcWidth
+%undef		dwSrcHeight
+%undef		dwSrcStride
+%undef		pDstData
+%undef		dwDstWidth
+%undef		dwDstHeight
+%undef		dwDstStride
+%undef		scale
+%undef		uiScaleX
+%undef		uiScaleY
+%undef		tmpHeight
+%undef		yInverse
+%undef		xInverse
+%undef		dstStep
+	ret		; NOTE(review): prototype says int, but eax is never set to a deliberate return value
+%endif
--- /dev/null
+++ b/codec/processing/src/asm/intra_pred.asm
@@ -1,0 +1,1505 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  intra_pred.asm
+;*
+;*  Abstract
+;*      sse2 function for intra predict operations
+;*
+;*  History
+;*      18/09/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Local Data (Read Only): multiplier tables and byte constants for the intra predictors
+;***********************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata pData
+%else
+SECTION .rodata align=16
+%endif
+; 16x16 luma plane mode tables (8 words each, read with movdqa)
+align 16
+sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0	; per-column multipliers of b for the left 8 columns
+align 16
+sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8	; weights for the H/V sums; also b multipliers for the right 8 columns
+align 16
+sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1	; weights for the subtracted half of the H/V sums
+
+; for chroma plane mode (4 words each, read with movq, so no separate align)
+sse2_plane_inc_c dw 1, 2, 3, 4
+sse2_plane_dec_c dw 4, 3, 2, 1
+align 16
+sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4	; per-column multipliers of b for the 8 chroma columns
+
+align 16
+mmx_01bytes:		times 16	db 1	; sixteen 0x01 bytes: multiplying by it replicates a byte, and-ing with it isolates bit 0
+;align 16
+;sse_0x0004bytes:	times 8		dw 4
+;ALIGN 16
+;sse_f000 db  255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+
+align 16
+mmx_0x02: dw 0x02, 0x00, 0x00, 0x00	; the word 2 followed by three zero words
+
+
+;***********************************************************************
+; macros
+;***********************************************************************
+;Builds sixteen bytes of 0x01 (db 1,1,...,1) in %1 without a memory load
+;%1 will keep the last result; %2 is scratch and ends up all ones
+%macro SSE_DB_1_2REG 2
+      pxor %1, %1			; first argument = 0
+      pcmpeqw %2, %2		; second argument = all ones, i.e. every byte is -1
+      psubb %1, %2			; 0 - (-1) = 1 in every byte
+%endmacro
+
+;args: %1 result xmm, %2 and %3 scratch xmm, %4 row pointer, %5 stride (originally xmm0, xmm1, xmm2, eax, ecx)
+;lower 64 bits of %1 save the result: byte [%4-1] repeated 4 times, then byte [%4+%5-1] repeated 4 times
+%macro SSE2_PRED_H_4X4_TWO_LINE 5
+    movd		%1,	[%4-1]		; left neighbour of the first line in the lowest byte
+	movdqa		%3,	%1
+	punpcklbw	%1,	%3			; every byte doubled
+	movdqa		%3,	%1
+	punpcklbw	%1,	%3			; doubled again: the lowest byte now fills the low dword
+
+	;add			%4,	%5
+	movd		%2,	[%4+%5-1]	; left neighbour of the second line
+	movdqa		%3,	%2
+	punpcklbw	%2,	%3
+	movdqa		%3,	%2
+	punpcklbw	%2,	%3
+	punpckldq	%1,	%2			; low qword = first-line dword, second-line dword
+%endmacro
+
+%macro  SUMW_HORIZON1 2
+	movdqa      %2, %1		; horizontal sum of the 8 words of arg 1 (unsigned saturating adds); arg 2 is scratch
+	psrldq      %2, 8
+	paddusw     %1, %2		; words 0..3 += words 4..7
+	movdqa      %2, %1
+	psrldq      %2, 4
+	paddusw     %1, %2		; words 0..1 += words 2..3
+	movdqa      %2, %1
+	psrldq      %2, 2
+	paddusw     %1, %2		; word 0 now holds the total
+%endmacro
+
+%macro	LOAD_COLUMN 6
+		movd	%1,	[%5]		; gathers the byte at offset +3 of 8 consecutive rows into the high qword of arg 1
+		movd	%2,	[%5+%6]		; arg 5 = row pointer, arg 6 = stride; args 2..4 are scratch
+		punpcklbw %1,	%2		; bytes of rows 0 and 1 interleaved
+		lea		%5,	[%5+2*%6]
+		movd	%3,	[%5]
+		movd	%2,	[%5+%6]
+		punpcklbw %3,	%2		; bytes of rows 2 and 3 interleaved
+		punpcklwd %1,	%3		; highest dword = byte +3 of rows 0..3
+		lea		%5,	[%5+2*%6]
+		movd	%4,	[%5]
+		movd	%2,	[%5+%6]
+		punpcklbw %4,	%2		; rows 4 and 5
+		lea		%5,	[%5+2*%6]
+		movd	%3,	[%5]
+		movd	%2,	[%5+%6]
+		lea		%5,	[%5+2*%6]	; the row pointer ends 8 rows below where it started
+		punpcklbw %3,	%2		; rows 6 and 7
+		punpcklwd %4,	%3		; highest dword = byte +3 of rows 4..7
+		punpckhdq %1,	%4		; high qword = byte +3 of rows 0..7, in row order
+%endmacro
+
+%macro  SUMW_HORIZON 3
+	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
+	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
+	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04 (widened to dwords only if arg 3 is zero)
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
+	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
+	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
+	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567 (only the low 16 bits are valid unless arg 3 was zero)
+%endmacro
+
+
+%macro  COPY_16_TIMES 2
+		movdqa		%2,	[%1-16]			; aligned load of the 16 bytes that end just before the address in arg 1
+		psrldq		%2,	15				; keep only the last of them, i.e. the byte at address - 1
+		pmuludq		%2,	[mmx_01bytes]	; times 0x01010101: the byte now fills the low dword
+		pshufd		%2,	%2, 0			; broadcast: arg 2 = that byte repeated 16 times
+%endmacro
+
+%macro  COPY_16_TIMESS 3
+		movdqa		%2,	[%1+%3-16]		; aligned load of the 16 bytes that end just before (arg 1 + arg 3)
+		psrldq		%2,	15				; keep only the last of them, i.e. the byte at (arg 1 + arg 3) - 1
+		pmuludq		%2,	[mmx_01bytes]	; times 0x01010101: the byte now fills the low dword
+		pshufd		%2,	%2, 0			; broadcast: arg 2 = that byte repeated 16 times
+%endmacro
+
+%macro	LOAD_COLUMN_C 6
+		movd	%1,	[%5]		; gathers the byte at offset +3 of 4 consecutive rows into the high dword of arg 1
+		movd	%2,	[%5+%6]		; written for 64-bit mm registers (punpckhwd below reads the upper half of 8 bytes); arg 4 is unused
+		punpcklbw %1,%2			; bytes of rows 0 and 1 interleaved
+		lea		%5,	[%5+2*%6]
+		movd	%3,	[%5]
+		movd	%2,	[%5+%6]
+		punpcklbw %3,	%2		; bytes of rows 2 and 3 interleaved
+		punpckhwd %1,	%3		; with mm registers: high dword = byte +3 of rows 0..3
+		lea		%5,	[%5+2*%6]	; the row pointer ends 4 rows below where it started
+%endmacro
+
+%macro LOAD_2_LEFT_AND_ADD 0
+        lea         r1, [r1+2*r2]			; move r1 down two rows (r2 is the row stride)
+        movzx		r4, byte [r1-0x01]		; byte just left of r1
+        add			r3, r4					; accumulate into r3; r4 is the temporary
+        movzx		r4, byte [r1+r2-0x01]	; byte just left of r1 on the next row
+        add			r3, r4
+%endmacro
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+WELS_EXTERN WelsI4x4LumaPredH_sse2
+WELS_EXTERN WelsI4x4LumaPredDDR_mmx
+WELS_EXTERN WelsI4x4LumaPredDc_sse2
+WELS_EXTERN WelsI16x16LumaPredPlane_sse2
+
+ALIGN 16
+;***********************************************************************
+;   void __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;   Horizontal 4x4 prediction: row i of pred is 4 copies of the byte at [r1 + i*r2 - 1] (r0/r1/r2 presumably pred/pRef/stride via LOAD_3_PARA)
+;	pred must align to 16
+;***********************************************************************
+WelsI4x4LumaPredH_sse2:
+	push r3
+	%assign push_num 1
+	LOAD_3_PARA
+	%ifndef X86_32
+	movsx r2, r2d
+	%endif
+	movzx		r3,	byte [r1-1]			; left neighbour of row 0
+	movd		xmm0,	r3d
+	pmuludq		xmm0,	[mmx_01bytes]	; times 0x01010101: replicate the byte into a dword
+
+	movzx		r3,	byte [r1+r2-1]		; left neighbour of row 1
+	movd		xmm1,	r3d
+	pmuludq		xmm1,	[mmx_01bytes]
+
+	unpcklps	xmm0,	xmm1			; low qword = row 0 dword, row 1 dword
+
+	lea			r1,	[r1+r2*2]
+	movzx		r3,	byte [r1-1]			; left neighbour of row 2
+	movd		xmm2,	r3d
+	pmuludq		xmm2,	[mmx_01bytes]
+
+	movzx		r3,	byte [r1+r2-1]		; left neighbour of row 3
+	movd		xmm3,	r3d
+	pmuludq		xmm3,	[mmx_01bytes]
+
+	unpcklps	xmm2,	xmm3			; low qword = row 2 dword, row 3 dword
+	unpcklpd	xmm0,	xmm2			; xmm0 = rows 0..3, 4 bytes each
+
+	movdqa		[r0],	xmm0			; aligned 16-byte store of the whole 4x4 block
+	pop r3
+	ret
+
+;***********************************************************************
+; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+WelsI16x16LumaPredPlane_sse2:
+		; Plane prediction: H is built from the row above, V from the column to the left,
+		; then each output pixel is clip((a + 16 + b*(x-7) + c*(y-7)) >> 5), written as 16 rows of 16 bytes.
+		; r0/r1/r2 are presumably pred/pRef/stride as loaded by LOAD_3_PARA (defined outside this file section).
+		; NOTE(review): xmm6/xmm7 are used without being saved; they are callee-saved in the Win64 ABI - confirm.
+		push r3
+		push r4
+		%assign push_num 2
+		LOAD_3_PARA
+		%ifndef X86_32
+		movsx r2, r2d
+		%endif
+		sub		r1,	1
+		sub		r1,	r2				; r1 = top-left neighbour (one row up, one column left)
+
+		;for H
+		pxor	xmm7,	xmm7
+		movq	xmm0,	[r1]			; 8 bytes of the top row starting at the top-left neighbour
+		movdqa	xmm5,	[sse2_plane_dec]
+		punpcklbw xmm0,	xmm7
+		pmullw	xmm0,	xmm5			; weighted by 8..1
+		movq	xmm1,	[r1 + 9]		; top[8..15]
+		movdqa	xmm6,	[sse2_plane_inc]
+		punpcklbw xmm1,	xmm7
+		pmullw	xmm1,	xmm6			; weighted by 1..8
+		psubw	xmm1,	xmm0
+
+		SUMW_HORIZON	xmm1,xmm0,xmm2	; only the low word of the sum is used below, so xmm2 need not be zero
+		movd    r3d,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
+		movsx	r3,	r3w
+		imul	r3,	5
+		add		r3,	32
+		sar		r3,	6			; b = (5 * H + 32) >> 6;
+		SSE2_Copy8Times	xmm1, r3d	; xmm1 = b,b,b,b,b,b,b,b
+
+		movzx	r4,	BYTE [r1+16]	; top[15]
+		sub	r1, 3					; LOAD_COLUMN gathers the byte at offset +3, i.e. the left column
+		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, r1, r2
+
+		add		r1,	3				; r1 = left column, 8 rows below the top-left neighbour
+		movzx	r3,	BYTE [r1+8*r2]	; left[15*stride]
+		add		r4,	r3
+		shl		r4,	4			;	a = (left[15*stride] + top[15]) << 4;
+
+		sub	r1, 3
+		add		r1,	r2				; skip one row, then gather the lower 8 left neighbours
+		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, r1, r2
+		pxor	xmm4,	xmm4
+		punpckhbw xmm0,	xmm4		; upper 8 left-column bytes as words
+		pmullw	xmm0,	xmm5
+		punpckhbw xmm7,	xmm4		; lower 8 left-column bytes as words
+		pmullw	xmm7,	xmm6
+		psubw	xmm7,	xmm0
+
+		SUMW_HORIZON   xmm7,xmm0,xmm2
+		movd    r3d,   xmm7			; V
+		movsx	r3,	r3w
+		imul	r3,	5
+		add		r3,	32
+		sar		r3,	6				; c = (5 * V + 32) >> 6;
+		SSE2_Copy8Times	xmm4, r3d		; xmm4 = c,c,c,c,c,c,c,c
+
+		; r0 still addresses the destination block
+		add		r4,	16
+		imul	r3,	-7
+		add		r3,	r4				; s = a + 16 + (-7)*c
+		SSE2_Copy8Times	xmm0, r3d		; xmm0 = s,s,s,s,s,s,s,s
+
+		xor		r3,	r3				; row counter
+		movdqa	xmm5,	[sse2_plane_inc_minus]
+
+get_i16x16_luma_pred_plane_sse2_1:
+		movdqa	xmm2,	xmm1
+		pmullw	xmm2,	xmm5		; b * (-7..0): left 8 columns
+		paddw	xmm2,	xmm0
+		psraw	xmm2,	5
+		movdqa	xmm3,	xmm1
+		pmullw	xmm3,	xmm6		; b * (1..8): right 8 columns
+		paddw	xmm3,	xmm0
+		psraw	xmm3,	5
+		packuswb xmm2,	xmm3		; saturate to 0..255 and join both halves
+		movdqa	[r0],	xmm2
+		paddw	xmm0,	xmm4		; s += c for the next row
+		add		r0,	16
+		inc		r3
+		cmp		r3,	16
+		jnz get_i16x16_luma_pred_plane_sse2_1
+		pop r4
+		pop r3
+		ret
+
+;***********************************************************************
+; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+; Horizontal 16x16 prediction: each 16-byte output row is filled from the byte left of the matching reference row.
+%macro SSE2_PRED_H_16X16_ONE_LINE 0
+	add r0, 16					; next output row (rows are 16 bytes apart)
+	add r1, r2					; next reference row
+	movzx r3, byte [r1]			; left-neighbour byte (r1 was decremented once by the caller)
+	SSE2_Copy16Times xmm0, r3d	; presumably broadcasts the byte to all 16 lanes (macro defined outside this section)
+	movdqa [r0], xmm0
+%endmacro
+
+WELS_EXTERN WelsI16x16LumaPredH_sse2
+WelsI16x16LumaPredH_sse2:
+	push r3
+	%assign push_num 1
+	LOAD_3_PARA
+	%ifndef X86_32
+	movsx r2, r2d
+	%endif
+	dec r1						; r1 now addresses the column left of the block
+	movzx r3, byte [r1]
+	SSE2_Copy16Times xmm0, r3d
+	movdqa [r0], xmm0			; row 0; the 15 macro expansions below emit rows 1..15
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	SSE2_PRED_H_16X16_ONE_LINE
+	pop r3
+    ret
+
+;***********************************************************************
+; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredV_sse2
+WelsI16x16LumaPredV_sse2:
+    ; Vertical 16x16 prediction: the 16 bytes one row above r1 are copied into all 16 output rows.
+    ; r0/r1/r2 are presumably pred/pRef/stride as loaded by LOAD_3_PARA.
+    ; Both the load and the stores use movdqa, so the row above and pred must be 16-byte aligned.
+    %assign push_num 0
+    LOAD_3_PARA
+	%ifndef X86_32
+	movsx r2, r2d
+	%endif
+    sub     r1, r2				; row directly above the block
+    movdqa  xmm0, [r1]
+
+    movdqa  [r0], xmm0			; rows 0..15 at offsets 0, 16, ..., 240 (hex and decimal offsets are mixed below)
+    movdqa  [r0+10h], xmm0
+    movdqa  [r0+20h], xmm0
+    movdqa  [r0+30h], xmm0
+    movdqa  [r0+40h], xmm0
+    movdqa  [r0+50h], xmm0
+    movdqa  [r0+60h], xmm0
+    movdqa  [r0+70h], xmm0
+    movdqa  [r0+80h], xmm0
+    movdqa  [r0+90h], xmm0
+    movdqa  [r0+160], xmm0
+    movdqa  [r0+176], xmm0
+    movdqa  [r0+192], xmm0
+    movdqa  [r0+208], xmm0
+    movdqa  [r0+224], xmm0
+    movdqa  [r0+240], xmm0
+
+    ret
+
+;***********************************************************************
+; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+WELS_EXTERN WelsIChromaPredPlane_sse2
+WelsIChromaPredPlane_sse2:
+		; 8x8 plane prediction: H is built from the row above, V from the column to the left,
+		; then each output pixel is clip((a + 16 + b*(x-3) + c*(y-3)) >> 5), written as 8 rows of 8 bytes.
+		; r0/r1/r2 are presumably pred/pRef/stride as loaded by LOAD_3_PARA (defined outside this file section).
+		; MMX registers are used for the gathers, hence the WELSEMMS before returning.
+		push r3
+		push r4
+		%assign push_num 2
+		LOAD_3_PARA
+		%ifndef X86_32
+		movsx r2, r2d
+		%endif
+		sub		r1,	1
+		sub		r1,	r2				; r1 = top-left neighbour (one row up, one column left)
+
+		pxor	mm7,	mm7
+		movq	mm0,	[r1]			; top row starting at the top-left neighbour (low 4 bytes are used)
+		movq	mm5,	[sse2_plane_dec_c]
+		punpcklbw mm0,	mm7
+		pmullw	mm0,	mm5				; weighted by 4..1
+		movq	mm1,	[r1 + 5]		; top[4..7] in the low 4 bytes
+		movq	mm6,	[sse2_plane_inc_c]
+		punpcklbw mm1,	mm7
+		pmullw	mm1,	mm6				; weighted by 1..4
+		psubw	mm1,	mm0
+
+		movq2dq xmm1,   mm1
+		pxor    xmm2,   xmm2
+		SUMW_HORIZON	xmm1,xmm0,xmm2
+		movd    r3d,	xmm1			; H
+		movsx	r3,	r3w
+		imul	r3,	17
+		add		r3,	16
+		sar		r3,	5			; b = (17 * H + 16) >> 5;
+		SSE2_Copy8Times	xmm1, r3d	; xmm1 = b,b,b,b,b,b,b,b
+
+		movzx	r3,	BYTE [r1+8]		; top[7]
+		sub	r1, 3					; LOAD_COLUMN_C gathers the byte at offset +3, i.e. the left column
+		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, r1, r2
+
+		add		r1,	3				; r1 = left column, 4 rows below the top-left neighbour
+		movzx	r4,	BYTE [r1+4*r2]	; left[7*stride]
+		add		r4,	r3
+		shl		r4,	4			; a = (left[7*stride] + top[7]) << 4;
+
+		sub	r1, 3
+		add		r1,	r2				; skip one row, then gather the lower 4 left neighbours
+		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, r1, r2
+		pxor	mm4,	mm4
+		punpckhbw mm0,	mm4			; upper 4 left-column bytes as words
+		pmullw	mm0,	mm5
+		punpckhbw mm7,	mm4			; lower 4 left-column bytes as words
+		pmullw	mm7,	mm6
+		psubw	mm7,	mm0
+
+		movq2dq xmm7,   mm7
+		pxor    xmm2,   xmm2
+		SUMW_HORIZON	xmm7,xmm0,xmm2
+		movd    r3d,    xmm7			; V
+		movsx	r3,	r3w
+		imul	r3,	17
+		add		r3,	16
+		sar		r3,	5				; c = (17 * V + 16) >> 5;
+		SSE2_Copy8Times	xmm4, r3d	; xmm4 = c,c,c,c,c,c,c,c
+
+		; r0 still addresses the destination block
+		add		r4,	16
+		imul	r3,	-3
+		add		r3,	r4		; s = a + 16 + (-3)*c
+		SSE2_Copy8Times	xmm0, r3d	; xmm0 = s,s,s,s,s,s,s,s
+
+		xor		r3,	r3				; row counter
+		movdqa	xmm5,	[sse2_plane_mul_b_c]
+
+get_i_chroma_pred_plane_sse2_1:
+		movdqa	xmm2,	xmm1
+		pmullw	xmm2,	xmm5		; b * (-3..4)
+		paddw	xmm2,	xmm0
+		psraw	xmm2,	5
+		packuswb xmm2,	xmm2		; saturate to 0..255
+		movq	[r0],	xmm2		; one 8-byte output row
+		paddw	xmm0,	xmm4		; s += c for the next row
+		add		r0,	8
+		inc		r3
+		cmp		r3,	8
+		jnz get_i_chroma_pred_plane_sse2_1
+		pop r4
+		pop r3
+		WELSEMMS
+		ret
+
+ALIGN 16
+;***********************************************************************
+;	0 |1 |2 |3 |4 |
+;	6 |7 |8 |9 |10|
+;	11|12|13|14|15|
+;	16|17|18|19|20|
+;	21|22|23|24|25|
+;	7 is the start pixel of current 4x4 block
+;	pred[7] = ([6]+[0]*2+[1]+2)/4
+;
+;   void __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;   Byte positions in the comments below are 1-based (mmN[1] is the lowest byte, mmN[8] the highest).
+;***********************************************************************
+WelsI4x4LumaPredDDR_mmx:
+	; The neighbours 21,16,11,6,0,1,2,3,4 are packed into one register and run through a
+	; 3-tap (1,2,1) filter; the four output rows are overlapping 4-byte windows of the result.
+	; r0/r1/r2 are presumably pred/pRef/stride as loaded by LOAD_3_PARA.
+	%assign push_num 0
+	LOAD_3_PARA
+	%ifndef X86_32
+	movsx r2, r2d
+	%endif
+	movq        mm1,[r1+r2-8]		;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
+	movq        mm2,[r1-8]			;get value of 6 mm2[8] = 6
+	sub		r1, r2			;move r1 to the line above the current block (position of 1)
+	punpckhbw   mm2,[r1-8]			;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
+	movd        mm3,[r1]			;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
+	punpckhwd   mm1,mm2				;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
+	psllq       mm3,18h				;mm3[4]=[1]
+	psrlq       mm1,28h				;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+	por         mm3,mm1				;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
+	movq        mm1,mm3				;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+	lea  	    r1,[r1+r2*2-8h]		;set r1 to the address of 12, minus 8
+	movq        mm4,[r1+r2]		;get value of 16, mm4[8]=[16]
+	psllq       mm3,8				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
+	psrlq       mm4,38h				;mm4[1]=[16]
+	por         mm3,mm4				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
+	movq        mm2,mm3				;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
+	movq        mm4,[r1+r2*2]		;mm4[8]=[21]
+	psllq       mm3,8				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
+	psrlq       mm4,38h				;mm4[1]=[21]
+	por         mm3,mm4				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
+	movq        mm4,mm3				;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
+	pavgb       mm3,mm1				;lowest byte: mm3=([11]+[21]+1)/2, likewise for the other bytes
+	pxor        mm1,mm4				;find odd value in the lowest bit of each byte
+	pand        mm1,[mmx_01bytes]	;set the odd bit
+	psubusb     mm3,mm1				;decrease 1 from odd bytes, giving the rounded-down average
+	pavgb       mm2,mm3				;lowest byte: mm2=(([11]+[21])/2+1+[16])/2, i.e. ([21]+2*[16]+[11]+2)/4
+
+	movd        [r0+12],mm2			;row 3 = lowest 4 filtered bytes
+	psrlq       mm2,8
+	movd        [r0+8],mm2			;row 2
+	psrlq       mm2,8
+	movd        [r0+4],mm2			;row 1
+	psrlq       mm2,8
+	movd        [r0],mm2			;row 0
+	WELSEMMS
+	ret
+
+ALIGN 16
+;***********************************************************************
+;	0 |1 |2 |3 |4 |
+;	5 |6 |7 |8 |9 |
+;	10|11|12|13|14|
+;	15|16|17|18|19|
+;	20|21|22|23|24|
+;	6 is the start pixel of current 4x4 block
+;	pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
+;	The same value is written to all 16 bytes of pred (aligned 16-byte store).
+;   void __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;
+;***********************************************************************
+WelsI4x4LumaPredDc_sse2:
+	push r3
+	push r4
+	%assign push_num 2
+	LOAD_3_PARA
+	%ifndef X86_32
+	movsx r2, r2d
+	%endif
+	movzx		r4,	byte [r1-1h]		; left neighbour of row 0 ([5])
+	sub			r1,	r2					; r1 = row above the block
+	movd		xmm0,	[r1]			; the 4 top neighbours
+	pxor		xmm1,	xmm1
+	psadbw		xmm0,	xmm1			; sum of absolute differences against 0 = sum of the 4 bytes
+	xor r3, r3
+	movd		r3d,	xmm0
+	add			r3,	r4
+	movzx		r4,	byte [r1+r2*2-1h]	; left neighbour of row 1 ([10])
+	add			r3,	r4
+
+	lea			r1,	[r1+r2*2-1]			; r1 = left neighbour of row 1
+	movzx		r4,	byte [r1+r2]		; left neighbour of row 2 ([15])
+	add			r3,	r4
+
+	movzx		r4,	byte [r1+r2*2]		; left neighbour of row 3 ([20])
+	add			r3,	r4
+	add			r3,	4
+	sar			r3,	3					; (sum of 8 neighbours + 4) >> 3
+	imul		r3,	0x01010101			; replicate the byte into 4 bytes
+
+	movd		xmm0,	r3d
+	pshufd		xmm0,	xmm0,	0		; and into all 16 bytes
+	movdqa		[r0],	xmm0
+	pop r4
+	pop r3
+	ret
+
+ALIGN 16
+;***********************************************************************
+;	void __cdecl WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;   Horizontal 8x8 prediction: each of the 8 output rows (8 bytes apart) is filled with the pixel left of that row
+;***********************************************************************
+%macro MMX_PRED_H_8X8_ONE_LINE 4
+	movq		%1,		[%3-8]		; 8 bytes ending at the left neighbour of the row at arg 3 (arg 2 is unused)
+	psrlq		%1,		38h			; left neighbour moved to the lowest byte
+
+	;pmuludq		%1,		[mmx_01bytes]		;extend to 4 bytes
+	pmullw		%1,		[mmx_01bytes]	; times 0x0101: the low word holds the byte twice
+	pshufw		%1,		%1,	0		; broadcast the low word to all 4 words
+	movq		[%4],	%1			; store the row at the address given as arg 4
+%endmacro
+
+%macro MMX_PRED_H_8X8_ONE_LINEE 4
+	movq		%1,		[%3+r2-8]	; same as MMX_PRED_H_8X8_ONE_LINE but for the row one stride (r2) below arg 3
+	psrlq		%1,		38h
+
+	;pmuludq		%1,		[mmx_01bytes]		;extend to 4 bytes
+	pmullw		%1,		[mmx_01bytes]
+	pshufw		%1,		%1,	0
+	movq		[%4],	%1
+%endmacro
+
+WELS_EXTERN WelsIChromaPredH_mmx
+WelsIChromaPredH_mmx:
+	; r0/r1/r2 are presumably pred/pRef/stride as loaded by LOAD_3_PARA.
+	; Rows are handled in pairs: the LINEE macro reads the row at r1+r2, then r1 advances by two rows.
+	; Row 0 is open-coded below; rows 1..7 use the two macros.
+	%assign push_num 0
+	LOAD_3_PARA
+	%ifndef X86_32
+	movsx r2, r2d
+	%endif
+	movq		mm0,	[r1-8]
+	psrlq		mm0,	38h
+
+	;pmuludq		mm0,	[mmx_01bytes]		;extend to 4 bytes
+	pmullw		mm0,		[mmx_01bytes]
+	pshufw		mm0,	mm0,	0
+	movq		[r0],	mm0
+
+	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+8
+
+	lea			r1,[r1+r2*2]
+	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r1,r0+16
+
+	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+24
+
+	lea			r1,[r1+r2*2]
+	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r1,r0+32
+
+	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+40
+
+	lea			r1,[r1+r2*2]
+	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r1,r0+48
+
+	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+56
+	WELSEMMS
+	ret
+
+ALIGN 16
+;***********************************************************************
+;	void __cdecl WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;   Vertical 4x4 prediction: the 4 pixels above the block are copied into each of the 4 output rows (one aligned 16-byte store)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredV_sse2
+WelsI4x4LumaPredV_sse2:
+	%assign push_num 0
+	LOAD_3_PARA
+	%ifndef X86_32
+	movsx r2, r2d
+	%endif
+	sub			r1,	r2				; row directly above the block
+	movd		xmm0,	[r1]		; its 4 bytes
+	pshufd		xmm0,	xmm0,	0	; repeated in all 4 dwords = 4 identical rows
+	movdqa		[r0],	xmm0
+	ret
+
+ALIGN 16
+;***********************************************************************
+;	void __cdecl WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;   Vertical 8x8 prediction: the 8 pixels above the block are copied into each of the 8 output rows (64 bytes, aligned stores)
+;***********************************************************************
+WELS_EXTERN WelsIChromaPredV_sse2
+WelsIChromaPredV_sse2:
+	%assign push_num 0
+	LOAD_3_PARA
+	%ifndef X86_32
+	movsx r2, r2d
+	%endif
+	sub		r1,		r2				; row directly above the block
+	movq		xmm0,		[r1]	; its 8 bytes
+	movdqa		xmm1,		xmm0
+	punpcklqdq	xmm0,		xmm1	; the same 8 bytes in both halves = two output rows
+	movdqa		[r0],		xmm0	; rows 0-1
+	movdqa		[r0+16],	xmm0	; rows 2-3
+	movdqa		[r0+32],	xmm0	; rows 4-5
+	movdqa		[r0+48],	xmm0	; rows 6-7
+	ret
+
+	ALIGN 16
+;***********************************************************************
+;	lt|t0|t1|t2|t3|
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	t3 is never used
+;   destination:
+;	|a |b |c |d |
+;	|e |f |a |b |
+;	|g |h |e |f |
+;	|i |j |g |h |
+; 2-tap averages:
+;   a = (1 + lt + l0)>>1
+;   e = (1 + l0 + l1)>>1
+;   g = (1 + l1 + l2)>>1
+;   i = (1 + l2 + l3)>>1
+; 3-tap (1,2,1) averages:
+;   d = (2 + t0 + (t1<<1) + t2)>>2
+;   c = (2 + lt + (t0<<1) + t1)>>2
+;   b = (2 + l0 + (lt<<1) + t0)>>2
+
+;   f = (2 + l1 + (l0<<1) + lt)>>2
+;   h = (2 + l2 + (l1<<1) + l0)>>2
+;   j = (2 + l3 + (l2<<1) + l1)>>2
+;   [b a f e h g j i] + [d c b a] --> mov to memory
+;
+;   void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredHD_mmx
+WelsI4x4LumaPredHD_mmx:
+	%assign push_num 0
+	LOAD_3_PARA
+	%ifndef X86_32
+	movsx r2, r2d
+	%endif
+	sub         r1, r2
+	movd        mm0, [r1-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
+	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
+
+	movd        mm1, [r1+2*r2-4]
+	punpcklbw   mm1, [r1+r2-4]        ; mm1[7] = l0, mm1[6] = l1
+	lea         r1, [r1+2*r2]
+	movd        mm2, [r1+2*r2-4]
+	punpcklbw   mm2, [r1+r2-4]        ; mm2[7] = l2, mm2[6] = l3
+	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
+	psrlq       mm2, 20h
+	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
+
+	movq        mm1, mm0
+	psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
+	movq        mm2, mm0
+	psrlq       mm2, 8h                 ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
+	movq        mm3, mm2
+	movq        mm4, mm1
+	pavgb       mm1, mm0                ; rounded-up average of the two outer taps
+
+	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
+	pand        mm4, [mmx_01bytes]	    ; set the odd bit
+	psubusb     mm1, mm4				; decrease 1 from odd bytes, giving the rounded-down average
+
+	pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
+
+	movq        mm4, mm0
+	pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
+	punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
+
+	psrlq       mm2, 20h
+	psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
+	movq        mm4, mm3
+	psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  g]
+	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
+	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
+
+	movd        [r0], mm2               ; row 0 = a b c d
+	movd        [r0+12], mm3            ; row 3 = i j g h
+	psrlq       mm3, 10h
+	movd        [r0+8], mm3             ; row 2 = g h e f
+	psrlq       mm3, 10h
+	movd        [r0+4], mm3             ; row 1 = e f a b
+	WELSEMMS
+	ret
+
+ALIGN 16
+;***********************************************************************
+;	lt|t0|t1|t2|t3|
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	lt and t0..t3 are never used (only the left column l0..l3 is read)
+;   destination:
+;	|a |b |c |d |
+;	|c |d |e |f |
+;	|e |f |g |g |
+;	|g |g |g |g |
+
+;   a = (1 + l0 + l1)>>1
+;   c = (1 + l1 + l2)>>1
+;   e = (1 + l2 + l3)>>1
+;   g = l3
+
+;   b = (2 + l0 + (l1<<1) + l2)>>2
+;   d = (2 + l1 + (l2<<1) + l3)>>2
+;   f = (2 + l2 + (l3<<1) + l3)>>2
+
+;   [g g f e d c b a] + [g g g g] --> mov to memory
+;   Builds the 4x4 block from the formulas above and writes it as four 4-byte rows at pred+0/4/8/12.
+;   void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredHU_mmx
+WelsI4x4LumaPredHU_mmx:
+	%assign push_num 0
+	LOAD_3_PARA
+	%ifndef X86_32
+	movsx r2, r2d
+	%endif
+	movd        mm0, [r1-4]            ; mm0[3] = l0
+	punpcklbw   mm0, [r1+r2-4]        ; mm0[7] = l1, mm0[6] = l0
+	lea         r1, [r1+2*r2]
+	movd        mm2, [r1-4]            ; mm2[3] = l2
+	movd        mm4, [r1+r2-4]        ; mm4[3] = l3
+	punpcklbw   mm2, mm4
+	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
+
+	psrlq       mm4, 18h
+	psllq       mm4, 38h                ; mm4 = [l3 0  0  0  0  0  0  0 ]
+	psrlq       mm0, 8h
+	pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
+
+	movq        mm1, mm0
+	psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
+	movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
+	pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
+
+	movq        mm2, mm0
+	psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
+	movq        mm5, mm2
+	pavgb       mm2, mm0                ; rounded-up average of the two outer taps
+
+	pxor        mm5, mm0				; bit 0 of each byte is set where the two outer taps differ in parity
+	pand        mm5, [mmx_01bytes]	    ; keep only that bit (mmx_01bytes presumably holds 0x01 in every byte)
+	psubusb     mm2, mm5				; undo pavgb's round-up, leaving a floor average of the outer taps
+
+	pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
+
+	psrlq       mm2, 8h
+	pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
+
+	punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
+	punpckhbw   mm4, mm4                ; mm4 = [g  g  0  0  0  0  0  0]
+	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  0  0  0  0]
+
+	psrlq       mm4, 20h
+	movd        [r0+12], mm4            ; row 3 = g g g g
+
+	movd        [r0], mm1               ; row 0 = a b c d
+	psrlq       mm1, 10h
+	movd        [r0+4], mm1             ; row 1 = c d e f
+	psrlq       mm1, 10h
+	movd        [r0+8], mm1             ; row 2 = e f g g
+	WELSEMMS
+	ret
+
+
+
+ALIGN 16
+;***********************************************************************
+;	lt|t0|t1|t2|t3|
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	l3 is never used
+;   destination:
+;	|a |b |c |d |
+;	|e |f |g |h |
+;	|i |a |b |c |
+;	|j |e |f |g |
+
+;   a = (1 + lt + t0)>>1
+;   b = (1 + t0 + t1)>>1
+;   c = (1 + t1 + t2)>>1
+;   d = (1 + t2 + t3)>>1
+
+;   e = (2 + l0 + (lt<<1) + t0)>>2
+;   f = (2 + lt + (t0<<1) + t1)>>2
+;   g = (2 + t0 + (t1<<1) + t2)>>2
+
+;   h = (2 + t1 + (t2<<1) + t3)>>2
+;   i = (2 + lt + (l0<<1) + l1)>>2
+;   j = (2 + l0 + (l1<<1) + l2)>>2
+;   Builds the 4x4 block from the formulas above and writes it as four 4-byte rows at pred+0/4/8/12.
+;   void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredVR_mmx
+WelsI4x4LumaPredVR_mmx:
+	%assign push_num 0
+	LOAD_3_PARA
+	%ifndef X86_32
+	movsx r2, r2d
+	%endif
+	sub         r1, r2                  ; r1 -> row above the block
+	movq        mm0, [r1-1]            ; 8-byte load; low five bytes are [t3 t2 t1 t0 lt], upper three are shifted out next
+	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt 0  0  0 ]
+
+	movd        mm1, [r1+2*r2-4]
+	punpcklbw   mm1, [r1+r2-4]        ; mm1[7] = l0, mm1[6] = l1
+	lea         r1, [r1+2*r2]
+	movq        mm2, [r1+r2-8]        ; mm2[7] = l2
+	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
+	psrlq       mm2, 28h
+	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
+
+	movq        mm1, mm0
+	psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
+	pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
+
+	movq        mm2, mm0
+	psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
+	movq        mm3, mm2
+	pavgb       mm2, mm0                ; rounded-up average of the two outer taps
+
+	pxor        mm3, mm0				; bit 0 of each byte is set where the two outer taps differ in parity
+	pand        mm3, [mmx_01bytes]	    ; keep only that bit (mmx_01bytes presumably holds 0x01 in every byte)
+	psubusb     mm2, mm3				; undo pavgb's round-up, leaving a floor average of the outer taps
+
+	movq        mm3, mm0
+	psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
+	pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
+	movq        mm2, mm3
+
+	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
+	movd        [r0], mm1               ; row 0 = a b c d
+
+	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
+	movd        [r0+4], mm2             ; row 1 = e f g h
+
+	movq        mm4, mm3
+	psllq       mm4, 20h
+	psrlq       mm4, 38h                ; mm4 = [0  0  0  0  0  0  0  i]
+
+	movq        mm5, mm3
+	psllq       mm5, 28h
+	psrlq       mm5, 38h                ; mm5 = [0  0  0  0  0  0  0  j]
+
+	psllq       mm1, 8h
+	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
+	movd        [r0+8], mm4             ; row 2 = i a b c
+
+	psllq       mm2, 8h
+	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
+	movd        [r0+12], mm5            ; row 3 = j e f g
+	WELSEMMS
+	ret
+
+ALIGN 16
+;***********************************************************************
+;	lt|t0|t1|t2|t3|t4|t5|t6|t7
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	lt,l0,l1,l2,l3 are never used (only the top row t0..t7 is read)
+;   destination:
+;	|a |b |c |d |
+;	|b |c |d |e |
+;	|c |d |e |f |
+;	|d |e |f |g |
+
+;   a = (2 + t0 + t2 + (t1<<1))>>2
+;   b = (2 + t1 + t3 + (t2<<1))>>2
+;   c = (2 + t2 + t4 + (t3<<1))>>2
+;   d = (2 + t3 + t5 + (t4<<1))>>2
+
+;   e = (2 + t4 + t6 + (t5<<1))>>2
+;   f = (2 + t5 + t7 + (t6<<1))>>2
+;   g = (2 + t6 + t7 + (t7<<1))>>2
+
+;   [g f e d c b a] --> mov to memory
+;   Builds the 4x4 block from the formulas above and writes it as four 4-byte rows at pred+0/4/8/12.
+;   void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredDDL_mmx
+WelsI4x4LumaPredDDL_mmx:
+	%assign push_num 0
+	LOAD_3_PARA
+	%ifndef X86_32
+	movsx r2, r2d
+	%endif
+	sub         r1, r2                  ; r1 -> row above the block
+	movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+	movq        mm1, mm0
+	movq        mm2, mm0
+
+	movq        mm3, mm0
+	psrlq       mm3, 38h
+	psllq       mm3, 38h                ; mm3 = [t7 0  0  0  0  0  0  0 ]
+
+	psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
+	psrlq       mm2, 8h
+	pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
+
+	movq        mm3, mm1
+	pavgb       mm1, mm2                ; rounded-up average of each byte's left and right neighbours
+	pxor        mm3, mm2				; bit 0 of each byte is set where the two neighbours differ in parity
+	pand        mm3, [mmx_01bytes]	    ; keep only that bit (mmx_01bytes presumably holds 0x01 in every byte)
+	psubusb     mm1, mm3				; undo pavgb's round-up, leaving a floor average of the neighbours
+
+	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
+
+	psrlq       mm0, 8h
+	movd        [r0], mm0               ; row 0 = a b c d
+	psrlq       mm0, 8h
+	movd        [r0+4], mm0             ; row 1 = b c d e
+	psrlq       mm0, 8h
+	movd        [r0+8], mm0             ; row 2 = c d e f
+	psrlq       mm0, 8h
+	movd        [r0+12], mm0            ; row 3 = d e f g
+	WELSEMMS
+	ret
+
+
+ALIGN 16
+;***********************************************************************
+;	lt|t0|t1|t2|t3|t4|t5|t6|t7
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	lt,l0,l1,l2,l3 are never used (only the top row is read)
+;   destination:
+;	|a |b |c |d |
+;	|e |f |g |h |
+;	|b |c |d |i |
+;	|f |g |h |j |
+
+;   a = (1 + t0 + t1)>>1
+;   b = (1 + t1 + t2)>>1
+;   c = (1 + t2 + t3)>>1
+;   d = (1 + t3 + t4)>>1
+;   i = (1 + t4 + t5)>>1
+
+;   e = (2 + t0 + (t1<<1) + t2)>>2
+;   f = (2 + t1 + (t2<<1) + t3)>>2
+;   g = (2 + t2 + (t3<<1) + t4)>>2
+;   h = (2 + t3 + (t4<<1) + t5)>>2
+;   j = (2 + t4 + (t5<<1) + t6)>>2
+
+;   [i d c b a] + [j h g f e] --> mov to memory
+;   Builds the 4x4 block from the formulas above and writes it as four 4-byte rows at pred+0/4/8/12.
+;   void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredVL_mmx
+WelsI4x4LumaPredVL_mmx:
+	%assign push_num 0
+	LOAD_3_PARA
+	%ifndef X86_32
+	movsx r2, r2d
+	%endif
+	sub         r1, r2                  ; r1 -> row above the block
+	movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+	movq        mm1, mm0
+	movq        mm2, mm0
+
+	psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
+	psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
+
+	movq        mm3, mm1
+	pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
+
+	movq        mm4, mm2
+	pavgb       mm2, mm0                ; rounded-up average of the two outer taps (bytes two apart)
+	pxor        mm4, mm0				; bit 0 of each byte is set where the two outer taps differ in parity
+	pand        mm4, [mmx_01bytes]	    ; keep only that bit (mmx_01bytes presumably holds 0x01 in every byte)
+	psubusb     mm2, mm4				; undo pavgb's round-up, leaving a floor average of the outer taps
+
+	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
+
+	movd        [r0], mm3               ; row 0 = a b c d
+	psrlq       mm3, 8h
+	movd        [r0+8], mm3             ; row 2 = b c d i
+
+	movd        [r0+4], mm2             ; row 1 = e f g h
+	psrlq       mm2, 8h
+	movd        [r0+12], mm2            ; row 3 = f g h j
+	WELSEMMS
+	ret
+
+ALIGN 16
+;***********************************************************************
+;   DC prediction for an 8x8 block: fills the 64 bytes of pred with one DC value per 4x4 quadrant, from the 8 top and 8 left neighbours of pRef
+;   void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsIChromaPredDc_sse2
+WelsIChromaPredDc_sse2:
+	push r3
+	push r4
+	%assign push_num 2
+	LOAD_3_PARA
+	%ifndef X86_32
+	movsx r2, r2d
+	%endif
+	sub         r1, r2                   ; r1 -> row above the block
+	movq        mm0, [r1]                ; mm0 = the 8 top neighbours
+
+	movzx		r3, byte [r1+r2-0x01] ; l1
+	lea         	r1, [r1+2*r2]
+	movzx		r4, byte [r1-0x01]     ; l2
+	add		r3, r4
+	movzx		r4, byte [r1+r2-0x01] ; l3
+	add		r3, r4
+	lea         	r1, [r1+2*r2]
+	movzx		r4, byte [r1-0x01]     ; l4
+	add		r3, r4
+	movd        	mm1, r3d                 ; mm1 = l1+l2+l3+l4
+
+	movzx		r3, byte [r1+r2-0x01] ; l5
+	lea         	r1, [r1+2*r2]
+	movzx		r4, byte [r1-0x01]     ; l6
+	add		r3, r4
+	movzx		r4, byte [r1+r2-0x01] ; l7
+	add		r3, r4
+	lea         	r1, [r1+2*r2]
+	movzx		r4, byte [r1-0x01]     ; l8
+	add		r3, r4
+	movd        	mm2, r3d                 ; mm2 = l5+l6+l7+l8
+
+	movq        mm3, mm0
+	psrlq       mm0, 0x20                ; mm0 = top pixels 4..7
+	psllq       mm3, 0x20
+	psrlq       mm3, 0x20                ; mm3 = top pixels 0..3
+	pxor		mm4, mm4
+	psadbw		mm0, mm4                 ; sum of top pixels 4..7
+	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+
+	paddq       mm3, mm1
+	movq        mm1, mm2
+	paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
+
+	movq        mm4, [mmx_0x02]          ; rounding term (presumably the constant 2 - defined outside this view)
+
+	paddq       mm0, mm4
+	psrlq       mm0, 0x02                ; top-right DC = (sum2 + rounding) >> 2
+
+	paddq       mm2, mm4
+	psrlq       mm2, 0x02                ; bottom-left DC = (sum3 + rounding) >> 2
+
+	paddq       mm3, mm4
+	paddq       mm3, mm4
+	psrlq       mm3, 0x03                ; top-left DC = (sum1 + 2*rounding) >> 3
+
+	paddq       mm1, mm4
+	paddq       mm1, mm4
+	psrlq       mm1, 0x03                ; bottom-right DC = (sum4 + 2*rounding) >> 3
+
+	pmuludq     mm0, [mmx_01bytes]       ; replicate the DC byte across the low dword (mmx_01bytes presumably 0x01 per byte)
+	pmuludq     mm3, [mmx_01bytes]
+	psllq       mm0, 0x20
+	pxor        mm0, mm3                 ; mm0 = m_up
+
+	pmuludq     mm2, [mmx_01bytes]
+	pmuludq     mm1, [mmx_01bytes]
+	psllq       mm1, 0x20
+	pxor        mm1, mm2                 ; mm1 = m_down
+
+	movq        [r0], mm0                ; rows 0..3 <- m_up
+	movq        [r0+0x08], mm0
+	movq        [r0+0x10], mm0
+	movq        [r0+0x18], mm0
+
+	movq        [r0+0x20], mm1           ; rows 4..7 <- m_down
+	movq        [r0+0x28], mm1
+	movq        [r0+0x30], mm1
+	movq        [r0+0x38], mm1
+
+	pop r4
+	pop r3
+	WELSEMMS
+	ret
+
+
+
+ALIGN 16
+;***********************************************************************
+;   DC prediction for a 16x16 block: fills the 256 bytes of pred with one byte derived from the 16 top and the left neighbours of pRef
+;   void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredDc_sse2
+WelsI16x16LumaPredDc_sse2:
+	push r3
+	push r4
+	%assign push_num 2
+	LOAD_3_PARA
+	%ifndef X86_32
+	movsx r2, r2d
+	%endif
+	sub         r1, r2                 ; r1 -> row above the block
+	movdqa      xmm0, [r1]             ; read one row (aligned load: the top row must be 16-byte aligned)
+	pxor		xmm1, xmm1
+	psadbw		xmm0, xmm1             ; two partial sums, one per 8 top pixels
+	movdqa      xmm1, xmm0
+	psrldq      xmm1, 0x08
+	pslldq      xmm0, 0x08
+	psrldq      xmm0, 0x08
+	paddw       xmm0, xmm1             ; low word = sum of the 16 top pixels
+
+	movzx		r3, byte [r1+r2-0x01]   ; left neighbour of row 0
+	movzx		r4, byte [r1+2*r2-0x01] ; left neighbour of row 1
+	add		r3, r4
+	lea         r1, [r1+r2]
+	LOAD_2_LEFT_AND_ADD                ; macro defined outside this view; presumably adds two more left neighbours to r3 each time
+	LOAD_2_LEFT_AND_ADD
+	LOAD_2_LEFT_AND_ADD
+	LOAD_2_LEFT_AND_ADD
+	LOAD_2_LEFT_AND_ADD
+	LOAD_2_LEFT_AND_ADD
+	LOAD_2_LEFT_AND_ADD
+	add         r3, 0x10               ; rounding term for the >>5 below
+	movd        xmm1, r3d
+	paddw       xmm0, xmm1             ; top sum + left sum + 16
+	psrld       xmm0, 0x05             ; DC value
+	pmuludq     xmm0, [mmx_01bytes]    ; replicate the DC byte across the low dword (mmx_01bytes presumably 0x01 per byte)
+	pshufd      xmm0, xmm0, 0          ; ... and across all 16 bytes
+
+	movdqa      [r0], xmm0             ; 16 aligned stores of 16 bytes each
+	movdqa      [r0+0x10], xmm0
+	movdqa      [r0+0x20], xmm0
+	movdqa      [r0+0x30], xmm0
+	movdqa      [r0+0x40], xmm0
+	movdqa      [r0+0x50], xmm0
+	movdqa      [r0+0x60], xmm0
+	movdqa      [r0+0x70], xmm0
+	movdqa      [r0+0x80], xmm0
+	movdqa      [r0+0x90], xmm0
+	movdqa      [r0+0xa0], xmm0
+	movdqa      [r0+0xb0], xmm0
+	movdqa      [r0+0xc0], xmm0
+	movdqa      [r0+0xd0], xmm0
+	movdqa      [r0+0xe0], xmm0
+	movdqa      [r0+0xf0], xmm0
+
+	pop r4
+	pop r3
+	ret
+
+;***********************************************************************
+; Picks the cheapest of the DC / H / V 4x4 predictions (cost = SATD>>1 + per-mode term), writes it to pRed and its mode to *pBestMode (DC=2, H=1, V=0); eax returns the winning cost
+;int32_t WelsSmpleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
+;                             uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
+; The three trailing int32_t arguments are added to the DC, H and V costs respectively; pRed is written with movdqa (16 bytes, aligned)
+;***********************************************************************
+%ifdef X86_ASM
+WELS_EXTERN WelsSmpleSatdThree4x4_sse2
+align 16
+WelsSmpleSatdThree4x4_sse2:
+	push      ebx
+	push      esi
+	push      edi
+	mov       eax,  [esp+24];p_enc
+	mov       ebx,  [esp+28];linesize_enc
+
+	; load source 4x4 samples and Hadamard transform
+    movd      xmm0, [eax]
+    movd      xmm1, [eax+ebx]
+    lea       eax , [eax+2*ebx]
+    movd      xmm2, [eax]
+    movd      xmm3, [eax+ebx]
+    punpckldq xmm0, xmm2
+    punpckldq xmm1, xmm3
+
+    pxor      xmm6, xmm6
+    punpcklbw xmm0, xmm6
+    punpcklbw xmm1, xmm6
+
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1
+    psubw     xmm2, xmm1
+    SSE2_XSawp  qdq, xmm0, xmm2, xmm3
+
+    movdqa    xmm4, xmm0
+    paddw     xmm0, xmm3
+    psubw     xmm4, xmm3
+
+    movdqa    xmm2, xmm0
+    punpcklwd xmm0, xmm4
+    punpckhwd xmm4, xmm2
+
+	SSE2_XSawp  dq,  xmm0, xmm4, xmm3
+	SSE2_XSawp  qdq, xmm0, xmm3, xmm5
+
+    movdqa    xmm7, xmm0
+    paddw     xmm0, xmm5
+    psubw     xmm7, xmm5
+
+	SSE2_XSawp  qdq,  xmm0, xmm7, xmm1
+
+    ; Hadamard transform results are saved in xmm0 and xmm2
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1
+    psubw     xmm2, xmm1
+
+	; load top boundary samples: [a b c d]
+    mov       eax,  [esp+16];p_dec
+	sub		  eax,	[esp+20];linesize_dec
+	movzx     ecx,  byte [eax]
+	movzx     edx,  byte [eax+1]
+	movzx     esi,  byte [eax+2]
+	movzx     edi,  byte [eax+3]
+
+	; get the transform results of top boundary samples: [a b c d]
+	add       edx, ecx ; edx = a + b
+	add       edi, esi ; edi = c + d
+	add       ecx, ecx ; ecx = a + a
+	add       esi, esi ; esi = c + c
+	sub       ecx, edx ; ecx = a + a - a - b = a - b
+	sub       esi, edi ; esi = c + c - c - d = c - d
+	add       edi, edx ; edi = (a + b) + (c + d)
+	add       edx, edx
+	sub       edx, edi ; edx = (a + b) - (c + d)
+	add       esi, ecx ; esi = (a - b) + (c - d)
+	add       ecx, ecx
+	sub       ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]
+
+	movdqa    xmm6, xmm0 ; keep the source transform for the H and DC passes
+	movdqa    xmm7, xmm2
+	movd      xmm5, edi ; store the edi for DC mode
+	pxor      xmm3, xmm3
+	pxor      xmm4, xmm4
+	pinsrw    xmm3, edi, 0
+	pinsrw    xmm3, esi, 4
+	psllw     xmm3, 2
+	pinsrw    xmm4, edx, 0
+	pinsrw    xmm4, ecx, 4
+	psllw     xmm4, 2
+
+	; get the satd of V (prediction built from the top boundary)
+	psubw     xmm0, xmm3
+	psubw     xmm2, xmm4
+
+	WELS_AbsW  xmm0, xmm1
+	WELS_AbsW  xmm2, xmm1
+    paddusw        xmm0, xmm2
+    SUMW_HORIZON1  xmm0, xmm1 ; satd of V is stored in xmm0
+
+	; load left boundary samples: [a b c d]'
+    mov       eax,  [esp+16]
+	mov       ebx,  [esp+20]
+	movzx     ecx,  byte [eax-1]
+	movzx     edx,  byte [eax+ebx-1]
+	lea       eax , [eax+2*ebx]
+	movzx     esi,  byte [eax-1]
+	movzx     edi,  byte [eax+ebx-1]
+
+	; get the transform results of left boundary samples: [a b c d]'
+	add       edx, ecx ; edx = a + b
+	add       edi, esi ; edi = c + d
+	add       ecx, ecx ; ecx = a + a
+	add       esi, esi ; esi = c + c
+	sub       ecx, edx ; ecx = a + a - a - b = a - b
+	sub       esi, edi ; esi = c + c - c - d = c - d
+	add       edi, edx ; edi = (a + b) + (c + d)
+	add       edx, edx
+	sub       edx, edi ; edx = (a + b) - (c + d)
+	add       esi, ecx ; esi = (a - b) + (c - d)
+	add       ecx, ecx
+	sub       ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]'
+
+	; store the transform results in xmm3
+    movd      xmm3, edi
+	pinsrw    xmm3, edx, 1
+	pinsrw    xmm3, ecx, 2
+	pinsrw    xmm3, esi, 3
+	psllw     xmm3, 2
+
+	; get the satd of H (prediction built from the left boundary)
+	movdqa    xmm2, xmm6
+	movdqa    xmm4, xmm7
+	psubw     xmm2, xmm3
+	WELS_AbsW  xmm2, xmm1
+	WELS_AbsW  xmm4, xmm1
+    paddusw        xmm2, xmm4
+    SUMW_HORIZON1  xmm2, xmm1 ; satd of H is stored in xmm2
+
+	; DC result is stored in xmm1
+	add       edi, 4    ; edi = sum of the left samples + rounding
+	movd      xmm1, edi
+	paddw     xmm1, xmm5 ; + sum of the top samples
+	psrlw     xmm1, 3    ; DC = (top sum + left sum + 4) >> 3
+	movdqa    xmm5, xmm1 ; keep the DC value for the DC-mode output
+	psllw     xmm1, 4    ; DC * 16 = the only non-zero coefficient of a flat block
+
+    ; get the satd of DC
+    psubw          xmm6, xmm1
+    WELS_AbsW  xmm6, xmm1
+	WELS_AbsW  xmm7, xmm1
+    paddusw        xmm6, xmm7
+    SUMW_HORIZON1  xmm6, xmm1 ; satd of DC is stored in xmm6
+
+    ; comparing order: DC H V
+    mov       edx, [esp+32] ; edx = pRed
+    movd      eax, xmm6
+    movd      edi, xmm2
+    movd      esi, xmm0
+    and       eax, 0xffff
+    shr       eax, 1
+    and       edi, 0xffff
+    shr       edi, 1
+    and       esi, 0xffff
+    shr       esi, 1
+    add       eax, [esp+40] ; DC cost += 7th argument
+    add       edi, [esp+44] ; H cost  += 8th argument
+    add       esi, [esp+48] ; V cost  += 9th argument
+    cmp       ax, di        ; NOTE(review): signed 16-bit compare - assumes both costs stay below 0x8000; confirm
+    jg near   not_dc
+    cmp       ax, si
+    jg near   not_dc_h
+
+    ; for DC mode
+    movd      ebx, xmm5
+    imul      ebx, 0x01010101 ; replicate the DC byte across the dword
+    movd	  xmm5, ebx
+	pshufd    xmm5, xmm5, 0
+	movdqa    [edx], xmm5
+	mov       ebx, [esp+36] ; ebx = pBestMode
+	mov       dword [ebx], 0x02
+	pop       edi
+    pop       esi
+    pop       ebx
+    ret
+
+not_dc:
+    cmp       di, si
+    jg near   not_dc_h
+
+    ; for H mode
+    SSE_DB_1_2REG  xmm6, xmm7
+    mov       eax,  [esp+16]
+	mov       ebx,  [esp+20]
+    movzx     ecx,  byte [eax-1]
+	movd      xmm0, ecx
+    pmuludq   xmm0, xmm6
+
+	movzx     ecx,  byte [eax+ebx-1]
+	movd      xmm1, ecx
+    pmuludq   xmm1, xmm6
+%if 1
+    punpckldq xmm0, xmm1
+%else
+	unpcklps  xmm0,	xmm1
+%endif
+	lea       eax,	[eax+ebx*2]
+	movzx	  ecx,	byte [eax-1]
+	movd	  xmm2,	ecx
+    pmuludq   xmm2, xmm6
+
+	movzx	  ecx,	byte [eax+ebx-1]
+	movd	  xmm3,	ecx
+    pmuludq   xmm3, xmm6
+%if 1
+    punpckldq  xmm2, xmm3
+    punpcklqdq xmm0, xmm2
+%else
+	unpcklps  xmm2,	xmm3
+	unpcklpd  xmm0,	xmm2
+%endif
+	movdqa	  [edx],xmm0
+
+	mov       eax, edi ; return the H cost
+    mov       ebx, [esp+36]
+	mov       dword [ebx], 0x01
+
+    pop       edi
+    pop       esi
+    pop       ebx
+    ret
+not_dc_h:
+    ; for V mode
+    mov       eax,  [esp+16]
+    sub		  eax,	[esp+20]
+	movd	  xmm0,	[eax]
+	pshufd	  xmm0,	xmm0, 0
+	movdqa	  [edx],xmm0
+
+	mov       eax, esi ; return the V cost
+    mov       ebx, [esp+36]
+	mov       dword [ebx], 0x00
+
+    pop       edi
+    pop       esi
+    pop       ebx
+    ret
+%endif
+
--- /dev/null
+++ b/codec/processing/src/asm/sad.asm
@@ -1,0 +1,220 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  sad.asm
+;*
+;*  Abstract
+;*      WelsSampleSad8x8_sse21
+;*
+;*  History
+;*      8/5/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+%macro CACHE_SPLIT_CHECK 3 ; address (register, clobbered), width, cacheline; leaves flags for the caller's branch
+and    %1,  0x1f|(%3>>1)	; keep the offset within the cache line (mask is 0x3f for cacheline 64, 0x1f for 32)
+cmp    %1,  (32-%2)|(%3>>1)	; compare with the last offset at which a width-byte access still fits (56 for width 8, cacheline 64)
+%endmacro
+
+%macro SSE2_GetSad8x4 0	; xmm6 += SAD of four 8-byte rows at r0 (stride r1) vs r2 (stride r3); r0 and r2 end two rows further on
+	movq   xmm2,   [r2]		; rows 0 and 1 of the r2 block -> low qwords
+	movq   xmm3,   [r2+r3]
+	movq   xmm0,   [r0]		; rows 0 and 1 of the r0 block -> low qwords
+	movq   xmm1,   [r0+r1]
+	lea    r2,     [r2+2*r3]
+	lea    r0,     [r0+2*r1]
+	movhps xmm2,   [r2]		; rows 2 and 3 -> high qwords
+	movhps xmm3,   [r2+r3]
+	movhps xmm0,   [r0]
+	movhps xmm1,   [r0+r1]
+
+	psadbw xmm0,   xmm2		; rows 0 and 2
+	paddw  xmm6,   xmm0
+	psadbw xmm1,   xmm3		; rows 1 and 3
+	paddw  xmm6,   xmm1
+%endmacro
+
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+WELS_EXTERN WelsSampleSad8x8_sse21
+WelsSampleSad8x8_sse21:
+    ; Returns (in retrd) the sum of absolute differences of two 8x8 byte blocks.
+	;   arg1 / arg2 : pointer to / stride of the first block
+    ;   arg3 / arg4 : pointer to / stride of the second block
+	; If an 8-byte load at arg3 would cross a 64-byte boundary
+	; ((arg3 & 63) > 56), each row of the second block is rebuilt from two
+	; 8-byte loads at (row address - misalignment) and +8, merged by shift/or;
+	; otherwise the plain path at .pixel_sad_8x8_nsplit is taken.
+	; Touches xmm0-xmm2 and xmm5-xmm7 (split path) or xmm0-xmm3 and xmm6 (plain path).
+	
+	%assign  push_num 0
+	mov		r2,  arg3
+	push	r2				; keep the pointer: the check below clobbers r2
+	CACHE_SPLIT_CHECK r2, 8, 64
+	jle    near   .pixel_sad_8x8_nsplit	; offset <= 56: an 8-byte row load stays inside one cache line
+	pop		r2
+%ifdef X86_32	
+	push	r3
+	push	r4
+	push	r5
+%endif
+	%assign  push_num 3
+	mov		r0,  arg1
+	mov		r1,  arg2	
+	SIGN_EXTENTION r1, r1d
+    pxor   xmm7,   xmm7		; xmm7 = SAD accumulator
+    
+    ;ecx r2, edx r4, edi r5
+
+    mov    r5,    r2
+    and    r5,    0x07		; r5 = misalignment of the second block in bytes (1..7 on this path)
+    sub    r2,    r5		; r2 = pointer rounded down to a multiple of 8
+    mov    r4,    8
+    sub    r4,    r5		; r4 = 8 - misalignment
+
+    shl    r5,    3		; byte counts -> bit counts
+    shl    r4,    3
+    movd   xmm5,   r5d		; xmm5 = right-shift count for the lower load
+    movd   xmm6,   r4d		; xmm6 = left-shift count for the upper load
+	mov    r5,    8
+	add    r5,    r2		; r5 = r2 + 8 (second load of each row)
+    mov    r3,    arg4
+	SIGN_EXTENTION r3, r3d
+    movq   xmm0,   [r0]		; rows 0 and 1 of the first block
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5		; drop the bytes that precede the row start
+	psllq  xmm2,   xmm6		; move the remaining bytes of the row into place
+	por    xmm1,   xmm2		; xmm1 = rows 0 and 1 of the second block
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	lea    r5,    [r5+2*r3]
+
+    movq   xmm0,   [r0]		; rows 2 and 3
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	lea    r5,    [r5+2*r3]
+
+    movq   xmm0,   [r0]		; rows 4 and 5
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	lea    r5,    [r5+2*r3]
+
+    movq   xmm0,   [r0]		; rows 6 and 7
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+    movhlps    xmm0, xmm7	; fold the two psadbw partial sums together
+	paddw      xmm0, xmm7
+	movd       retrd,  xmm0
+%ifdef X86_32
+	pop	 r5
+	pop	 r4
+	pop	 r3
+%endif
+	jmp        .return
+	
+.pixel_sad_8x8_nsplit:
+    ; Plain path: no cache-line split, so the second block is loaded directly.
+	; The pointer saved before CACHE_SPLIT_CHECK is restored first, then the
+	; four arguments are loaded through LOAD_4_PARA and the SAD is taken as
+	; two 8x4 halves accumulated in xmm6.
+	
+	pop r2
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENTION r1, r1d
+	SIGN_EXTENTION r3, r3d	
+	pxor   xmm6,   xmm6
+	SSE2_GetSad8x4
+    lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
+    movhlps    xmm0, xmm6	; fold the two psadbw partial sums together
+	paddw      xmm0, xmm6
+	movd       retrd,  xmm0
+	LOAD_4_PARA_POP
+.return:
+	ret
\ No newline at end of file
--- /dev/null
+++ b/codec/processing/src/asm/vaa.asm
@@ -1,0 +1,1414 @@
+;*!
+;* \copy
+;*     Copyright (c)  2010-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	vaa.asm
+;*
+;*	Abstract
+;*      sse2 for pVaa routines
+;*
+;*  History
+;*      04/14/2010	Created
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+%ifdef X86_32
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;%macro SUM_SSE2	4	; dst, pSrc, zero, pack1_8x2
+;	movdqa %1, %2
+;	punpcklbw %1, %3
+;	punpckhbw %2, %3
+;	paddw %1, %2
+;	pmaddwd %1, %4
+;	pshufd %2, %1, 04Eh	; 01001110 B
+;	paddd %1, %2
+;	pshufd %2, %1, 0B1h	; 10110001 B
+;	paddd %1, %2
+;%endmacro	; END OF SUM_SSE2
+
+; Horizontal add of the 8 words in dst: the total lands in word 0 (tmp is clobbered). Original note: benchmarked faster than the phaddw (SSSE3) equivalent.
+%macro SUM_WORD_8x2_SSE2	2	; dst(pSrc), tmp
+	; @sum_8x2 begin
+	pshufd %2, %1, 04Eh	; 01001110 B - swap the two qwords
+	paddw %1, %2		; word i += word i+4
+	pshuflw %2, %1, 04Eh	; 01001110 B - swap the two dwords of the low qword
+	paddw %1, %2		; words 0..1 += words 2..3
+	pshuflw %2, %1, 0B1h	; 10110001 B - swap adjacent words of the low qword
+	paddw %1, %2		; word 0 = sum of all eight input words
+	; end of @sum_8x2
+%endmacro	; END of SUM_WORD_8x2_SSE2
+
+%macro SUM_SQR_SSE2	3	; dst, pSrc, zero - every dword of dst = sum of squares of the 16 bytes in pSrc (pSrc is clobbered)
+	movdqa %1, %2
+	punpcklbw %1, %3	; low 8 bytes -> words (zero must be an all-zero register)
+	punpckhbw %2, %3	; high 8 bytes -> words
+	pmaddwd %1, %1		; square and add adjacent pairs -> 4 dwords
+	pmaddwd %2, %2
+	paddd %1, %2
+	pshufd %2, %1, 04Eh	; 01001110 B - swap the two qwords
+	paddd %1, %2
+	pshufd %2, %1, 0B1h	; 10110001 B - swap adjacent dwords
+	paddd %1, %2		; all four dwords now hold the total
+%endmacro	; END OF SUM_SQR_SSE2
+
+%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4 - dst words 0..3 = (sum >> 4) of four 4x4 byte blocks read at esi; xmm7 is used as the zero register
+	movdqa %1, [esi    ]	; line 0
+	movdqa %2, [esi+ecx]	; line 1
+	movdqa %3, %1
+	punpcklbw %1, xmm7	; line 0, columns 0..7 as words
+	punpckhbw %3, xmm7	; line 0, columns 8..15 as words
+	movdqa %4, %2
+	punpcklbw %4, xmm7
+	punpckhbw %2, xmm7
+	paddw %1, %4		; lines 0+1, columns 0..7
+	paddw %2, %3		; lines 0+1, columns 8..15
+	movdqa %3, [esi+ebx]	; line 2
+	movdqa %4, [esi+edx]	; line 3
+	movdqa %5, %3
+	punpcklbw %3, xmm7
+	punpckhbw %5, xmm7
+	movdqa %6, %4
+	punpcklbw %6, xmm7
+	punpckhbw %4, xmm7
+	paddw %3, %6		; lines 2+3, columns 0..7
+	paddw %4, %5		; lines 2+3, columns 8..15
+	paddw %1, %3	; block 0, 1
+	paddw %2, %4	; block 2, 3
+	pshufd %3, %1, 0B1h	; swap adjacent dwords
+	pshufd %4, %2, 0B1h
+	paddw %1, %3		; each word = sum of two columns of the same block
+	paddw %2, %4
+	movdqa %3, %1
+	movdqa %4, %2
+	pshuflw %5, %1, 0B1h	; swap adjacent words (low qword)
+	pshufhw %6, %3, 0B1h	; swap adjacent words (high qword)
+	paddw %1, %5		; low words = block 0 total
+	paddw %3, %6		; high words = block 1 total
+	pshuflw %5, %2, 0B1h
+	pshufhw %6, %4, 0B1h
+	paddw %2, %5		; low words = block 2 total
+	paddw %4, %6		; high words = block 3 total
+	punpcklwd %1, %2	; block0, block2 interleaved
+	punpckhwd %3, %4	; block1, block3 interleaved
+	punpcklwd %1, %3	; words = block0, block1, block2, block3 (pattern repeated in the upper half)
+	psraw %1, $4		; 16-pixel sums -> averages
+%endmacro
+
+%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4 - same result in dst words 0..3 as the SSE2 variant, using phaddw; xmm7 is used as the zero register
+	movdqa %1, [esi    ]	; line 0
+	movdqa %2, [esi+ecx]	; line 1
+	movdqa %3, %1
+	punpcklbw %1, xmm7	; line 0, columns 0..7 as words
+	punpckhbw %3, xmm7	; line 0, columns 8..15 as words
+	movdqa %4, %2
+	punpcklbw %4, xmm7
+	punpckhbw %2, xmm7
+	paddw %1, %4		; lines 0+1, columns 0..7
+	paddw %2, %3		; lines 0+1, columns 8..15
+	movdqa %3, [esi+ebx]	; line 2
+	movdqa %4, [esi+edx]	; line 3
+	movdqa %5, %3
+	punpcklbw %3, xmm7
+	punpckhbw %5, xmm7
+	movdqa %6, %4
+	punpcklbw %6, xmm7
+	punpckhbw %4, xmm7
+	paddw %3, %6		; lines 2+3, columns 0..7
+	paddw %4, %5		; lines 2+3, columns 8..15
+	paddw %1, %3	; block 0, 1
+	paddw %2, %4	; block 2, 3
+	phaddw %1, %2	; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
+	phaddw %1, xmm7	; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
+	psraw %1, $4		; 16-pixel sums -> averages
+%endmacro
+
+%macro WELS_SAD_16x2_SSE2  0	; xmm6 += SAD of two 16-byte rows at esi vs edi (row stride ebx); both pointers advance two rows
+	movdqa	xmm1,	[esi]		; first row pair
+	movdqa	xmm2,	[edi]
+	psadbw	xmm1,	xmm2
+	paddd	xmm6,	xmm1
+	movdqa	xmm3,	[esi+ebx]	; second row pair
+	movdqa	xmm4,	[edi+ebx]
+	psadbw	xmm3,	xmm4
+	paddd	xmm6,	xmm3
+	lea		edi,	[edi+ebx*2]
+	lea		esi,	[esi+ebx*2]
+%endmacro
+
+%macro	WELS_SAD_SUM_SQSUM_16x1_SSE2 0	; one 16-byte row: xmm6 += SAD([esi],[edi]), xmm5 += byte sum of [esi], xmm4 += squares of [esi]; xmm0 is used as zero
+	movdqa	xmm1,	[esi]
+	movdqa	xmm2,	[edi]
+	movdqa	xmm3,	xmm1
+	psadbw	xmm3,	xmm2
+	paddd	xmm6,	xmm3		; sad
+
+	movdqa	xmm3,	xmm1
+	psadbw	xmm3,	xmm0		; SAD against zero = plain byte sum
+	paddd	xmm5,	xmm3		; sum
+
+	movdqa		xmm2,	xmm1
+	punpcklbw	xmm1,	xmm0	; bytes -> words
+	punpckhbw	xmm2,	xmm0
+	pmaddwd		xmm1,	xmm1	; square and add adjacent pairs
+	pmaddwd		xmm2,	xmm2
+	paddd		xmm4,	xmm1
+	paddd		xmm4,	xmm2	; sqsum (spread over four dwords)
+
+	add		esi,	ebx		; next row
+	add		edi,	ebx
+%endmacro
+
+%macro	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 0	; one 16-byte row at [esi] vs [edi]; accumulators: xmm7 sad, xmm6 sum, xmm5 sqsum, xmm4 sqdiff; xmm0 is used as zero
+	movdqa	xmm1,	[esi]
+	movdqa	xmm2,	[edi]
+	movdqa	xmm3,	xmm1
+	psadbw	xmm3,	xmm2
+	paddd	xmm7,	xmm3	; sad
+
+	movdqa	xmm3,	xmm1
+	pmaxub	xmm3,	xmm2
+	pminub	xmm2,	xmm1
+	psubb	xmm3,	xmm2	; diff = max - min = per-byte absolute difference
+
+	movdqa	xmm2,	xmm1
+	psadbw	xmm2,	xmm0
+	paddd	xmm6,	xmm2	; sum
+
+	movdqa		xmm2,	xmm1
+	punpcklbw	xmm1,	xmm0
+	punpckhbw	xmm2,	xmm0
+	pmaddwd		xmm1,	xmm1
+	pmaddwd		xmm2,	xmm2
+	paddd		xmm5,	xmm1
+	paddd		xmm5,	xmm2	; sqsum
+
+	movdqa		xmm1,	xmm3
+	punpcklbw	xmm1,	xmm0
+	punpckhbw	xmm3,	xmm0
+	pmaddwd		xmm1,	xmm1
+	pmaddwd		xmm3,	xmm3
+	paddd		xmm4,	xmm1
+	paddd		xmm4,	xmm3	; sqdiff
+
+	add		esi,	ebx	; next row
+	add		edi,	ebx
+%endmacro
+
+%macro	WELS_SAD_SD_MAD_16x1_SSE2	4	; sad, sum_cur, sum_ref, mad accumulators; one 16-byte row at [esi] vs [edi]; xmm0 is used as zero, xmm1-xmm3 are scratch
+%define sad_reg			%1
+%define	sum_cur_reg		%2
+%define sum_ref_reg		%3
+%define	mad_reg			%4
+	movdqa	xmm1,		[esi]
+	movdqa	xmm2,		[edi]
+	movdqa	xmm3,		xmm1
+	psadbw	xmm3,		xmm0
+	paddd	sum_cur_reg,			xmm3	; sum_cur
+	movdqa	xmm3,		xmm2
+	psadbw	xmm3,		xmm0
+	paddd	sum_ref_reg,			xmm3	; sum_ref
+
+	movdqa	xmm3,		xmm1
+	pmaxub	xmm3,		xmm2
+	pminub	xmm2,		xmm1
+	psubb	xmm3,		xmm2	; abs diff
+	pmaxub	mad_reg,	xmm3	; max abs diff
+
+	psadbw	xmm3,		xmm0	; sum of the absolute differences
+	paddd	sad_reg,	xmm3	; sad
+
+	add			esi,		ebx	; next row
+	add			edi,		ebx
+%endmacro
+
+
+%macro	WELS_MAX_REG_SSE2	1	; xmm1, xmm2, xmm3 can be used (only xmm1 is clobbered here); folds byte maxima so byte 0 = max of bytes 0-7 and byte 8 = max of bytes 8-15
+%define max_reg  %1
+	movdqa	xmm1,		max_reg
+	psrldq	xmm1,		4	; compare each byte with the one 4 positions above it
+	pmaxub	max_reg,	xmm1
+	movdqa	xmm1,		max_reg
+	psrldq	xmm1,		2	; then with the one 2 positions above
+	pmaxub	max_reg,	xmm1
+	movdqa	xmm1,		max_reg
+	psrldq	xmm1,		1	; then with its neighbour
+	pmaxub	max_reg,	xmm1	; only bytes 0 and 8 now hold complete 8-byte maxima
+%endmacro
+
+%macro	WELS_SAD_BGD_SQDIFF_16x1_SSE2	4	; one 16-byte row; args: interleaved SAD/sqsum accumulator, interleaved [esi]/[edi] byte-sum accumulator, max-abs-diff accumulator, squared-diff accumulator; expects the caller to have zeroed xmm0
+%define sad_reg		%1
+%define	sum_reg		%2
+%define mad_reg		%3
+%define sqdiff_reg	%4
+	movdqa		xmm1,		[esi]
+	movdqa		xmm2,		xmm1
+	movdqa		xmm3,		xmm1
+	punpcklbw	xmm2,		xmm0
+	punpckhbw	xmm3,		xmm0
+	pmaddwd		xmm2,		xmm2
+	pmaddwd		xmm3,		xmm3
+	paddd		xmm2,		xmm3
+	movdqa		xmm3,		xmm2
+	psllq		xmm2,		32	; even dwords moved up into the odd dword slots
+	psrlq		xmm3,		32
+	psllq		xmm3,		32	; odd dwords kept in place, even dwords cleared
+	paddd		xmm2,		xmm3	; squares of [esi] now only in dwords 1 and 3
+	paddd		sad_reg,	xmm2		; sqsum
+	; byte sums: [esi] row lands in dwords 0/2, [edi] row is shifted into dwords 1/3
+	movdqa	xmm2,		[edi]
+	movdqa	xmm3,		xmm1
+	psadbw	xmm3,		xmm0
+	paddd	sum_reg,			xmm3	; sum_cur
+	movdqa	xmm3,		xmm2
+	psadbw	xmm3,		xmm0
+	pslldq	xmm3,		4
+	paddd	sum_reg,			xmm3	; sum_ref
+	; per-byte absolute difference: max(a,b) - min(a,b)
+	movdqa	xmm3,		xmm1
+	pmaxub	xmm3,		xmm2
+	pminub	xmm2,		xmm1
+	psubb	xmm3,		xmm2	; abs diff
+	pmaxub	mad_reg,	xmm3	; max abs diff
+	; keep a copy of the abs diff in xmm1 for the squared-diff step; the SAD goes into dwords 0/2
+	movdqa	xmm1,		xmm3
+	psadbw	xmm3,		xmm0
+	paddd	sad_reg,	xmm3	; sad
+	; squares of the absolute differences: widen to words, pmaddwd with itself
+	movdqa		xmm3,	xmm1
+	punpcklbw	xmm1,	xmm0
+	punpckhbw	xmm3,	xmm0
+	pmaddwd		xmm1,	xmm1
+	pmaddwd		xmm3,	xmm3
+	paddd		sqdiff_reg,	xmm1
+	paddd		sqdiff_reg,	xmm3	; sqdiff
+	; advance both pointers by one row (ebx bytes)
+	add		esi,	ebx
+	add		edi,	ebx
+%endmacro
+
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+;SECTION .rodata align=16
+
+;ALIGN 16
+;pack1_8x2:
+;	dw 1, 1, 1, 1, 1, 1, 1, 1
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN rc_sad_frame_sse2
+;***********************************************************************
+;	uint32_t rc_sad_frame_sse2(	uint8_t *ref_orig, uint8_t *cur_orig, const int mb_width, const int iPicHeight, const int iPicStride );
+;***********************************************************************
+ALIGN 16
+rc_sad_frame_sse2:
+	push esi
+	push edi
+	push ebp
+	push ebx
+	push edx
+	; Returns in eax the SAD over iPicHeight rows of mb_width 16-byte blocks; five pushes plus the return address put the first argument at esp+24.
+	mov esi, [esp+24]	; ref_orig
+	mov edi, [esp+28]	; cur_orig
+	mov ebx, [esp+32]	; mb_width, used as the number of 16-byte blocks per row
+	mov ecx, [esp+36]	; iPicHeight, used as the row count
+	mov edx, [esp+40]	; iPicStride, bytes from one row to the next
+	pxor xmm0, xmm0	; frame total accumulates in the low dword of xmm0
+.hloop:
+	mov eax, ebx	; blocks remaining in this row
+	mov ebp, $0	; byte offset inside the row
+.wloop:
+	movdqa xmm1, [esi+ebp]
+	movdqa xmm2, [edi+ebp]
+	psadbw xmm1, xmm2
+	pshufd xmm2, xmm1, 0f6h	; 11110110 B ; movhlps for float
+	paddd xmm1, xmm2	; low dword = SAD of all 16 bytes
+	paddd xmm0, xmm1
+	add ebp, 010h
+	dec eax
+	jnz near .wloop
+	lea esi, [esi+edx]	; next row in both buffers
+	lea edi, [edi+edx]
+	dec ecx
+	jnz near .hloop
+	; only the low dword of xmm0 carries the complete sum
+	movd eax, xmm0
+	pop edx
+	pop ebx
+	pop ebp
+	pop edi
+	pop esi
+	ret
+
+
+WELS_EXTERN SampleVariance16x16_sse2
+;***********************************************************************
+;   void SampleVariance16x16_sse2(	uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
+;***********************************************************************
+ALIGN 16
+SampleVariance16x16_sse2:
+	push esi
+	push edi
+	push ebx
+	; 16 bytes of locals: four dword accumulators, cleared by the movdqu below
+	sub esp, 16
+	%define SUM			[esp]
+	%define SUM_CUR		[esp+4]
+	%define SQR			[esp+8]
+	%define SQR_CUR		[esp+12]
+	%define PUSH_SIZE	28	; 12 + 16
+	; arguments sit above the locals, the three saved registers and the return address
+	mov edi, [esp+PUSH_SIZE+4]	; y_ref
+	mov edx, [esp+PUSH_SIZE+8]	; y_ref_stride
+	mov esi, [esp+PUSH_SIZE+12]	; y_src
+	mov eax, [esp+PUSH_SIZE+16]	; y_src_stride
+	mov ecx, 010h				; height = 16
+	; xmm7 stays zero for the whole loop and also clears the four accumulators
+	pxor xmm7, xmm7
+	movdqu SUM, xmm7
+	; each iteration handles one 16-byte row of both blocks
+.hloops:
+	movdqa xmm0, [edi]		; y_ref
+	movdqa xmm1, [esi]		; y_src
+	movdqa xmm2, xmm0		; store first for future process
+	movdqa xmm3, xmm1
+	; sum += diff;
+	movdqa xmm4, xmm0
+	psadbw xmm4, xmm1		; 2 parts, [0,..,15], [64,..,79]
+	; to be continued for sum
+	pshufd xmm5, xmm4, 0C6h	; 11000110 B
+	paddw xmm4, xmm5
+	movd ebx, xmm4
+	add SUM, ebx
+	; absolute difference per byte is max(ref,src) - min(ref,src)
+	; sqr += diff * diff;
+	pmaxub xmm0, xmm1
+	pminub xmm1, xmm2
+	psubb xmm0, xmm1				; diff
+	SUM_SQR_SSE2 xmm1, xmm0, xmm7	; dst, pSrc, zero
+	movd ebx, xmm1
+	add SQR, ebx
+	; NOTE(review): SUM_WORD_8x2_SSE2 is defined outside this excerpt; presumably it folds the eight words into the low word - confirm
+	; sum_cur += y_src[x];
+	movdqa xmm0, xmm3		; cur_orig
+	movdqa xmm1, xmm0
+	punpcklbw xmm0, xmm7
+	punpckhbw xmm1, xmm7
+	paddw xmm0, xmm1		; 8x2
+	SUM_WORD_8x2_SSE2 xmm0, xmm1
+	movd ebx, xmm0
+	and ebx, 0ffffh
+	add SUM_CUR, ebx
+	; NOTE(review): SUM_SQR_SSE2 is defined outside this excerpt; presumably it leaves the sum of squared bytes in the low dword - confirm
+	; sqr_cur += y_src[x] * y_src[x];
+	SUM_SQR_SSE2 xmm0, xmm3, xmm7	; dst, pSrc, zero
+	movd ebx, xmm0
+	add SQR_CUR, ebx
+	; step each pointer by its own stride
+	lea edi, [edi+edx]
+	lea esi, [esi+eax]
+	dec ecx
+	jnz near .hloops
+	; first result = (SQR >> 8) - (SUM >> 8)^2, taking only the low 16 bits of SUM
+	mov ebx, 0
+	mov bx, word SUM
+	sar ebx, 8
+	imul ebx, ebx
+	mov ecx, SQR
+	sar ecx, 8
+	sub ecx, ebx
+	mov edi, [esp+PUSH_SIZE+20]	; pMotionTexture
+	mov [edi], cx				; to store uiMotionIndex
+	mov ebx, 0
+	mov bx, word SUM_CUR
+	sar ebx, 8
+	imul ebx, ebx
+	mov ecx, SQR_CUR
+	sar ecx, 8
+	sub ecx, ebx
+	mov [edi+2], cx				; to store uiTextureIndex
+	; the stack-slot aliases are only meaningful inside this function
+	%undef SUM
+	%undef SUM_CUR
+	%undef SQR
+	%undef SQR_CUR
+	%undef PUSH_SIZE
+	; release the locals and restore the callee-saved registers
+	add esp, 16
+	pop ebx
+	pop edi
+	pop esi
+
+	ret
+
+; , 6/7/2010
+
+
+WELS_EXTERN abs_difference_mbrow_sse2
+;*************************************************************************************************************
+;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride,
+;								 int32_t gom_pixel_num, int32_t *pSum)
+;*************************************************************************************************************
+ALIGN 16
+abs_difference_mbrow_sse2:	; adds to *pSum the SAD of a strip 16 rows high and gom_pixel_num bytes wide (processed 16 bytes at a time)
+%define		ref_orig			esp + pushsize + 4
+%define		cur_orig			esp + pushsize + 8
+%define		iPicStride			esp + pushsize + 12
+%define		gom_pixel_num		esp + pushsize + 16
+%define		pSum				esp + pushsize + 20
+%define		pushsize	12
+	push	esi
+	push	edi
+	push	ebx
+	mov		esi,	[ref_orig]
+	mov		edi,	[cur_orig]
+	mov		ebx,	[iPicStride]
+	mov		eax,	[gom_pixel_num]
+	mov		ecx,	16					;MB_WIDTH_LUMA
+	pxor	xmm0,	xmm0			; SAD accumulator, one partial sum per qword
+mb_width_loop_p:
+	mov		edx,	esi
+	add		edx,	eax			; end address
+gom_row_loop_p:
+	movdqa	xmm1,	[esi]
+	movdqa	xmm2,	[edi]
+	psadbw	xmm1,	xmm2
+	paddd	xmm0,	xmm1
+	add		esi,	16
+	add		edi,	16
+	cmp		esi,	edx
+	jb		gom_row_loop_p		; unsigned compare: addresses are not signed, a buffer may straddle 0x80000000
+	; rewind to the start of the row, then step down one row
+	sub		esi,	eax
+	sub		edi,	eax
+	add		esi,	ebx
+	add		edi,	ebx
+	loop	mb_width_loop_p
+	; fold the two qword partial sums and add the result to *pSum
+	movdqa	xmm1,	xmm0
+	psrldq	xmm1,	8
+	paddd	xmm1,	xmm0
+	movd	eax,	xmm1
+	mov		edx,	[pSum]	; pSum
+	add		[edx],	eax
+
+%undef		ref_orig
+%undef		cur_orig
+%undef		iPicStride
+%undef		gom_pixel_num
+%undef		pSum
+%undef		pushsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	ret
+
+
+
+
+WELS_EXTERN sum_sqrsum_mbrow_sse2
+;*************************************************************************************************************
+;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride,
+;							 int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum)
+;*************************************************************************************************************
+ALIGN 16
+sum_sqrsum_mbrow_sse2:	; adds to *pSum the byte sum and to *pSqrSum the sum of squared bytes of a strip 16 rows high and gom_pixel_num bytes wide
+%define		cur_orig			esp + pushsize + 4
+%define		iPicStride			esp + pushsize + 8
+%define		gom_pixel_num		esp + pushsize + 12
+%define		pSum				esp + pushsize + 16
+%define		pSqrSum				esp + pushsize + 20
+%define		pushsize			8
+	push		esi
+	push		ebx
+	mov			esi,	[cur_orig]
+	mov			eax,	[gom_pixel_num]
+	mov			ebx,	[iPicStride]
+	mov			ecx,	16					;MB_WIDTH_LUMA
+	pxor		xmm0,	xmm0				; zero
+	pxor		xmm1,	xmm1				; sum
+	pxor		xmm2,	xmm2				; sqr sum
+mb_width_loop_i:
+	mov			edx,	esi
+	add			edx,	eax			; end address
+gom_row_loop_i:
+	movdqa		xmm3,	[esi]
+	movdqa		xmm4,	xmm3
+	psadbw		xmm4,	xmm0			; SAD against zero = byte sum per 8-byte half
+	paddd		xmm1,	xmm4
+	movdqa		xmm4,	xmm3
+	punpcklbw	xmm4,	xmm0			; widen bytes to words
+	punpckhbw	xmm3,	xmm0
+	pmaddwd		xmm4,	xmm4			; square and pair-add into dwords
+	pmaddwd		xmm3,	xmm3
+	paddd		xmm2,	xmm3
+	paddd		xmm2,	xmm4
+	add			esi,	16
+	cmp			esi,	edx
+	jb			gom_row_loop_i		; unsigned compare: addresses are not signed, a buffer may straddle 0x80000000
+	; rewind to the start of the row, then step down one row
+	sub			esi,	eax
+	add			esi,	ebx
+	loop		mb_width_loop_i
+	; fold the two qword partial sums and add the result to *pSum
+	movdqa		xmm3,	xmm1
+	psrldq		xmm3,	8
+	paddd		xmm1,	xmm3
+	movd		eax,	xmm1
+	mov			edx,	[pSum]
+	add			[edx],	eax
+	; fold the four dword partial sums and add the result to *pSqrSum
+	movdqa		xmm3,	xmm2
+	psrldq		xmm3,	8
+	paddd		xmm2,	xmm3
+	movdqa		xmm3,	xmm2
+	psrldq		xmm3,	4
+	paddd		xmm2,	xmm3
+	movd		eax,	xmm2
+	mov			edx,	[pSqrSum]
+	add			[edx],	eax
+
+
+%undef		cur_orig
+%undef		iPicStride
+%undef		gom_pixel_num
+%undef		pSum
+%undef		pSqrSum
+%undef		pushsize
+	pop			ebx
+	pop			esi
+	ret
+
+
+
+WELS_EXTERN VAACalcSad_sse2
+;*************************************************************************************************************
+;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;								int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
+;*************************************************************************************************************
+; For every 16x16 block, stores four 8x8 SADs between cur_data and ref_data to psad8x8
+; (top-left, top-right, bottom-left, bottom-right) and writes the sum of all of them to *psadframe.
+ALIGN 16
+VAACalcSad_sse2:
+%define		cur_data			esp + pushsize + 4
+%define		ref_data			esp + pushsize + 8
+%define		iPicWidth			esp + pushsize + 12
+%define		iPicHeight			esp + pushsize + 16
+%define		iPicStride			esp + pushsize + 20
+%define		psadframe			esp + pushsize + 24
+%define		psad8x8				esp + pushsize + 28
+%define		pushsize	12
+	push	esi
+	push	edi
+	push	ebx
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		edx,	[psad8x8]
+	mov		eax,	ebx
+	; the width/height argument slots on the stack are rewritten in place as block counts
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4								; iPicStride*16
+	pxor	xmm0,	xmm0
+	pxor	xmm7,	xmm7		; iFrameSad
+height_loop:
+	mov		ecx,	dword [iPicWidth]
+	push	esi
+	push	edi
+width_loop:
+	pxor	xmm6,	xmm6		;
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx],		xmm6
+	psrldq	xmm6,		8
+	movd	[edx+4],	xmm6
+	; rows 8-15 of the block: same again for the two lower 8x8 SADs
+	pxor	xmm6,	xmm6
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx+8],	xmm6
+	psrldq	xmm6,		8
+	movd	[edx+12],	xmm6
+	; next output slot; pointers go back up 16 rows and right 16 bytes
+	add		edx,	16
+	sub		esi,	eax
+	sub		edi,	eax
+	add		esi,	16
+	add		edi,	16
+
+	dec		ecx
+	jnz		width_loop
+	; restore the row-start pointers saved at height_loop and move down 16 rows
+	pop		edi
+	pop		esi
+	add		esi,	eax
+	add		edi,	eax
+	; the stack is balanced again here, so the argument offsets are valid
+	dec	dword [iPicHeight]
+	jnz		height_loop
+	; fold the two qword halves of the frame accumulator and store the total
+	mov		edx,	[psadframe]
+	movdqa	xmm5,	xmm7
+	psrldq	xmm7,	8
+	paddd	xmm7,	xmm5
+	movd	[edx],	xmm7
+
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		pushsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	ret
+
+
+WELS_EXTERN VAACalcSadVar_sse2
+;*************************************************************************************************************
+;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;		int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
+;*************************************************************************************************************
+; For every 16x16 block: four 8x8 SADs to psad8x8, the byte sum of the cur_data block to psum16x16 and the
+; sum of its squared bytes to psqsum16x16; the total of all SADs is written to *psadframe.
+ALIGN 16
+VAACalcSadVar_sse2:
+%define		localsize		8
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		psum16x16			esp + pushsize + localsize + 32
+%define		psqsum16x16			esp + pushsize + localsize + 36
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		pushsize		16
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		edx,	[psad8x8]
+	mov		eax,	ebx
+	; the width/height argument slots on the stack are rewritten in place as block counts
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4							; iPicStride*16
+	pxor	xmm0,	xmm0
+	pxor	xmm7,	xmm7		; iFrameSad
+var_height_loop:
+	mov		ecx,	dword [iPicWidth]
+	mov		[tmp_esi],	esi
+	mov		[tmp_edi],	edi
+var_width_loop:
+	pxor	xmm6,	xmm6		; hiQuad_loQuad pSad8x8
+	pxor	xmm5,	xmm5		; pSum16x16
+	pxor	xmm4,	xmm4		; sqsum_16x16
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx],		xmm6
+	psrldq	xmm6,		8
+	movd	[edx+4],	xmm6
+	; rows 8-15: only the SAD accumulator is reset, sum and sqsum keep running over the whole block
+	pxor	xmm6,	xmm6
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx+8],	xmm6
+	psrldq	xmm6,		8
+	movd	[edx+12],	xmm6
+	; fold the two qword halves of the byte sum; the output pointer is advanced in its argument slot
+	mov		ebp,	[psum16x16]
+	movdqa	xmm1,	xmm5
+	psrldq	xmm1,	8
+	paddd	xmm5,	xmm1
+	movd	[ebp],	xmm5
+	add		dword [psum16x16], 4
+	; fold the four dword partial sums of the squares
+	movdqa	xmm5,	xmm4
+	psrldq	xmm5,	8
+	paddd	xmm4,	xmm5
+	movdqa	xmm3,	xmm4
+	psrldq	xmm3,	4
+	paddd	xmm4,	xmm3
+	; store it and advance the output pointer in its argument slot
+	mov		ebp,	[psqsum16x16]
+	movd	[ebp],	xmm4
+	add		dword [psqsum16x16], 4
+	; next SAD output slot; pointers go back up 16 rows and right 16 bytes
+	add		edx,	16
+	sub		esi,	eax
+	sub		edi,	eax
+	add		esi,	16
+	add		edi,	16
+
+	dec		ecx
+	jnz		var_width_loop
+	; back to the start of this block row, then down 16 rows
+	mov		esi,	[tmp_esi]
+	mov		edi,	[tmp_edi]
+	add		esi,	eax
+	add		edi,	eax
+
+	dec	dword [iPicHeight]
+	jnz		var_height_loop
+	; fold the two qword halves of the frame accumulator and store the total
+	mov		edx,	[psadframe]
+	movdqa	xmm5,	xmm7
+	psrldq	xmm7,	8
+	paddd	xmm7,	xmm5
+	movd	[edx],	xmm7
+
+	add		esp,	localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		psum16x16
+%undef		psqsum16x16
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		pushsize
+%undef		localsize
+	ret
+
+
+
+WELS_EXTERN VAACalcSadSsd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;	int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
+;*************************************************************************************************************
+; For every 16x16 block: four 8x8 SADs to psad8x8, byte sum and squared-byte sum of the cur_data block to
+; psum16x16 / psqsum16x16, squared differences to psqdiff16x16; the total of all SADs goes to *psadframe.
+ALIGN 16
+VAACalcSadSsd_sse2:
+%define		localsize		12
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		psum16x16			esp + pushsize + localsize + 32
+%define		psqsum16x16			esp + pushsize + localsize + 36
+%define		psqdiff16x16		esp + pushsize + localsize + 40
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		tmp_sadframe		esp + 8
+%define		pushsize		16
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize
+	; Load the pointer and stride arguments; the width and height arguments are
+	; converted to block counts in place on the stack below.
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		edx,	[psad8x8]
+	mov		eax,	ebx
+
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4							; iPicStride*16
+	; xmm0 stays zero for the row macro; all of xmm4-xmm7 are needed per block,
+	; so the frame SAD is accumulated in the tmp_sadframe stack slot instead.
+	pxor	xmm0,	xmm0
+	movd	[tmp_sadframe],	xmm0
+sqdiff_height_loop:
+	mov		ecx,	dword [iPicWidth]
+	mov		[tmp_esi],	esi
+	mov		[tmp_edi],	edi
+sqdiff_width_loop:
+	pxor	xmm7,	xmm7		; hiQuad_loQuad pSad8x8
+	pxor	xmm6,	xmm6		; pSum16x16
+	pxor	xmm5,	xmm5		; sqsum_16x16  four dword
+	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	movdqa	xmm1,		xmm7
+	movd	[edx],		xmm7
+	psrldq	xmm7,		8
+	paddd	xmm1,		xmm7
+	movd	[edx+4],	xmm7
+	movd	ebp,		xmm1
+	add		[tmp_sadframe],	ebp
+	; rows 8-15: only the SAD accumulator is reset, the other three keep running over the whole block
+	pxor	xmm7,	xmm7
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	movdqa	xmm1,		xmm7
+	movd	[edx+8],	xmm7
+	psrldq	xmm7,		8
+	paddd	xmm1,		xmm7
+	movd	[edx+12],	xmm7
+	movd	ebp,		xmm1
+	add		[tmp_sadframe],	ebp
+	; byte sum: fold the two qword halves; output pointers are advanced in their argument slots
+	mov		ebp,	[psum16x16]
+	movdqa	xmm1,	xmm6
+	psrldq	xmm1,	8
+	paddd	xmm6,	xmm1
+	movd	[ebp],	xmm6
+	add		dword [psum16x16], 4
+	; squared-byte sum: fold four dwords down to one
+	mov		ebp,	[psqsum16x16]
+	pshufd	xmm6,	xmm5,	14 ;00001110
+	paddd	xmm6,	xmm5
+	pshufd	xmm5,	xmm6,	1  ;00000001
+	paddd	xmm5,	xmm6
+	movd	[ebp],	xmm5
+	add		dword [psqsum16x16], 4
+	; squared-difference sum: same four-dword fold
+	mov		ebp,	[psqdiff16x16]
+	pshufd	xmm5,	xmm4,	14	; 00001110
+	paddd	xmm5,	xmm4
+	pshufd	xmm4,	xmm5,	1	; 00000001
+	paddd	xmm4,	xmm5
+	movd	[ebp],	xmm4
+	add		dword	[psqdiff16x16],	4
+	; next SAD output slot; pointers go back up 16 rows and right 16 bytes
+	add		edx,	16
+	sub		esi,	eax
+	sub		edi,	eax
+	add		esi,	16
+	add		edi,	16
+
+	dec		ecx
+	jnz		sqdiff_width_loop
+	; back to the start of this block row, then down 16 rows
+	mov		esi,	[tmp_esi]
+	mov		edi,	[tmp_edi]
+	add		esi,	eax
+	add		edi,	eax
+
+	dec	dword [iPicHeight]
+	jnz		sqdiff_height_loop
+	; publish the accumulated frame SAD
+	mov		ebx,	[tmp_sadframe]
+	mov		eax,	[psadframe]
+	mov		[eax],	ebx
+
+	add		esp,	localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		psum16x16
+%undef		psqsum16x16
+%undef		psqdiff16x16
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		tmp_sadframe
+%undef		pushsize
+%undef		localsize
+	ret
+
+
+
+
+
+WELS_EXTERN VAACalcSadBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;				int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+; For every 16x16 block: four 8x8 SADs to psad8x8, four 8x8 (cur sum - ref sum) values to p_sd8x8 and four
+; 8x8 maximum absolute differences (one byte each) to p_mad8x8; the total of all SADs goes to *psadframe.
+ALIGN 16
+VAACalcSadBgd_sse2:
+%define		localsize		12
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		p_sd8x8				esp + pushsize + localsize + 32
+%define		p_mad8x8			esp + pushsize + localsize + 36
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		tmp_ecx				esp + 8
+%define		pushsize		16
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		eax,	ebx
+	; the width/height argument slots on the stack are rewritten in place as block counts
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4							; iPicStride*16
+	xor		ebp,	ebp							; frame SAD accumulator
+	pxor	xmm0,	xmm0
+bgd_height_loop:
+	mov		ecx,	dword [iPicWidth]
+	mov		[tmp_esi],	esi
+	mov		[tmp_edi],	edi
+bgd_width_loop:
+	pxor	xmm7,	xmm7		; pSad8x8
+	pxor	xmm6,	xmm6		; sum_cur_8x8
+	pxor	xmm5,	xmm5		; sum_ref_8x8
+	pxor	xmm4,	xmm4		; pMad8x8
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+
+	; reduce the 16 per-column maxima to one byte per 8x8 block (bytes 0 and 8 of xmm4)
+	mov			edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm4
+
+	;movdqa		xmm1,	xmm4
+	;punpcklbw	xmm1,	xmm0
+	;punpcklwd	xmm1,	xmm0
+	;movd		[edx],	xmm1
+	;punpckhbw	xmm4,	xmm0
+	;punpcklwd	xmm4,	xmm0
+	;movd		[edx+4],	xmm4
+	;add			edx,		8
+	;mov			[p_mad8x8],	edx
+	mov			[tmp_ecx],	ecx			; ecx is borrowed as scratch below; the loop counter is reloaded before dec
+	movhlps		xmm1,	xmm4
+	movd		ecx,	xmm4
+	mov			[edx],	cl
+	movd		ecx,	xmm1
+	mov			[edx+1],cl
+	add			edx,	2
+	mov			[p_mad8x8],	edx
+
+	; move the upper-half results into the odd dwords so rows 8-15 can accumulate in the even ones
+	pslldq		xmm7,	4
+	pslldq		xmm6,	4
+	pslldq		xmm5,	4
+
+
+	pxor	xmm4,	xmm4		; pMad8x8
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+
+	mov			edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm4
+
+	;movdqa		xmm1,	xmm4
+	;punpcklbw	xmm1,	xmm0
+	;punpcklwd	xmm1,	xmm0
+	;movd		[edx],	xmm1
+	;punpckhbw	xmm4,	xmm0
+	;punpcklwd	xmm4,	xmm0
+	;movd		[edx+4],	xmm4
+	;add			edx,		8
+	;mov			[p_mad8x8],	edx
+	movhlps		xmm1,	xmm4
+	movd		ecx,	xmm4
+	mov			[edx],	cl
+	movd		ecx,	xmm1
+	mov			[edx+1],cl
+	add			edx,	2
+	mov			[p_mad8x8],	edx
+
+	; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
+
+	mov		edx,	[psad8x8]
+	pshufd	xmm1,	xmm7,	10001101b		; D3 D2 D1 D0
+	movdqa	[edx],	xmm1					; aligned store: psad8x8 must be 16-byte aligned
+	add		edx,	16
+	mov		[psad8x8],	edx					; sad8x8
+
+	paddd	xmm1,	xmm7					; D1+3 D3+2 D0+1 D2+0
+	pshufd	xmm2,	xmm1,	00000011b
+	paddd	xmm1,	xmm2
+	movd	edx,	xmm1
+	add		ebp,	edx						; sad frame
+
+	mov		edx,	[p_sd8x8]
+	psubd	xmm6,	xmm5					; sum_cur - sum_ref per 8x8 block
+	pshufd	xmm1,	xmm6,	10001101b
+	movdqa	[edx],	xmm1					; aligned store: p_sd8x8 must be 16-byte aligned
+	add		edx,	16
+	mov		[p_sd8x8],	edx
+
+	; pointers go back up 16 rows and right 16 bytes
+	sub		esi,	eax
+	sub		edi,	eax
+	add		esi,	16
+	add		edi,	16
+
+	mov		ecx,	[tmp_ecx]
+	dec		ecx
+	jnz		bgd_width_loop
+
+	mov		esi,	[tmp_esi]
+	mov		edi,	[tmp_edi]
+	add		esi,	eax
+	add		edi,	eax
+
+	dec		dword [iPicHeight]
+	jnz		bgd_height_loop
+
+	mov		edx,	[psadframe]
+	mov		[edx],	ebp
+
+	add		esp,	localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		p_sd8x8
+%undef		p_mad8x8
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		tmp_ecx
+%undef		pushsize
+%undef		localsize
+	ret
+
+
+
+WELS_EXTERN VAACalcSadSsdBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;		 int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
+;			int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+; Per 16x16 block: 8x8 SADs, 8x8 (cur sum - ref sum), 8x8 max abs diffs, plus 16x16 sum, squared sum and squared diff; total SAD goes to *psadframe.
+ALIGN 16
+VAACalcSadSsdBgd_sse2:
+%define		localsize		16
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		psum16x16			esp + pushsize + localsize + 32
+%define		psqsum16x16			esp + pushsize + localsize + 36
+%define		psqdiff16x16		esp + pushsize + localsize + 40
+%define		p_sd8x8				esp + pushsize + localsize + 44
+%define		p_mad8x8			esp + pushsize + localsize + 48
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		tmp_sadframe		esp + 8
+%define		tmp_ecx				esp + 12
+%define		pushsize		16
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		eax,	ebx
+	; the width/height argument slots on the stack are rewritten in place as block counts
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4							; iPicStride*16
+	pxor	xmm0,	xmm0
+	movd	[tmp_sadframe],	xmm0			; frame SAD accumulates in a stack slot
+sqdiff_bgd_height_loop:
+	mov		ecx,	dword [iPicWidth]
+	mov		[tmp_esi],	esi
+	mov		[tmp_edi],	edi
+sqdiff_bgd_width_loop:
+	pxor	xmm7,	xmm7		; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
+	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+	pxor	xmm5,	xmm5		; pMad8x8
+	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	; upper half of the block: the two 8x8 SADs sit in dwords 0 and 2 of xmm7
+	mov		edx,		[psad8x8]
+	movdqa	xmm2,		xmm7
+	pshufd	xmm1,		xmm2,		00001110b
+	movd	[edx],		xmm2
+	movd	[edx+4],	xmm1
+	add		edx,		8
+	mov		[psad8x8],	edx			; sad8x8
+
+	paddd	xmm1,				xmm2
+	movd	edx,				xmm1
+	add		[tmp_sadframe],		edx			; iFrameSad
+	; upper-half cur sum is stored now; the lower half is added to the same slot later
+	mov		edx,		[psum16x16]
+	movdqa	xmm1,		xmm6
+	pshufd	xmm2,		xmm1,		00001110b
+	paddd	xmm1,		xmm2
+	movd	[edx],		xmm1				; sum
+
+	mov		edx,		[p_sd8x8]
+	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
+	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
+	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
+	movq	[edx],		xmm1
+	add		edx,		8
+	mov		[p_sd8x8],	edx
+
+	mov			edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm5
+	;movdqa		xmm1,	xmm5
+	;punpcklbw	xmm1,	xmm0
+	;punpcklwd	xmm1,	xmm0
+	;movd		[edx],	xmm1
+	;punpckhbw	xmm5,	xmm0
+	;punpcklwd	xmm5,	xmm0
+	;movd		[edx+4],	xmm5
+	;add			edx,		8
+	;mov			[p_mad8x8],	edx
+	mov			[tmp_ecx],	ecx			; ecx is borrowed as scratch below; the loop counter is reloaded before dec
+	movhlps		xmm1,	xmm5
+	movd		ecx,	xmm5
+	mov			[edx],	cl
+	movd		ecx,	xmm1
+	mov			[edx+1],cl
+	add			edx,	2
+	mov			[p_mad8x8],	edx
+
+	psrlq	xmm7,	32
+	psllq	xmm7,	32			; clear sad
+	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+	pxor	xmm5,	xmm5		; pMad8x8
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	; lower half of the block: same outputs again for rows 8-15
+	mov		edx,		[psad8x8]
+	movdqa	xmm2,		xmm7
+	pshufd	xmm1,		xmm2,		00001110b
+	movd	[edx],		xmm2
+	movd	[edx+4],	xmm1
+	add		edx,		8
+	mov		[psad8x8],	edx			; sad8x8
+
+	paddd	xmm1,				xmm2
+	movd	edx,				xmm1
+	add		[tmp_sadframe],		edx			; iFrameSad
+
+	mov		edx,			[psum16x16]
+	movdqa	xmm1,			xmm6
+	pshufd	xmm2,			xmm1,		00001110b
+	paddd	xmm1,			xmm2
+	movd	ebp,			xmm1				; sum
+	add		[edx],			ebp
+	add		edx,			4
+	mov		[psum16x16],	edx
+	; the squared sums in dwords 1 and 3 of xmm7 cover all 16 rows
+	mov		edx,			[psqsum16x16]
+	psrlq	xmm7,			32
+	pshufd	xmm2,			xmm7,		00001110b
+	paddd	xmm2,			xmm7
+	movd	[edx],			xmm2				; sqsum
+	add		edx,			4
+	mov		[psqsum16x16],	edx
+
+	mov		edx,		[p_sd8x8]
+	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
+	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
+	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
+	movq	[edx],		xmm1
+	add		edx,		8
+	mov		[p_sd8x8],	edx
+
+	mov		edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm5
+	;movdqa		xmm1,	xmm5
+	;punpcklbw	xmm1,	xmm0
+	;punpcklwd	xmm1,	xmm0
+	;movd		[edx],	xmm1
+	;punpckhbw	xmm5,	xmm0
+	;punpcklwd	xmm5,	xmm0
+	;movd		[edx+4],	xmm5
+	;add			edx,		8
+	;mov			[p_mad8x8],	edx
+	movhlps		xmm1,	xmm5
+	movd		ecx,	xmm5
+	mov			[edx],	cl
+	movd		ecx,	xmm1
+	mov			[edx+1],cl
+	add			edx,	2
+	mov			[p_mad8x8],	edx
+	; squared differences: fold the four dword partial sums for the whole block
+	mov		edx,		[psqdiff16x16]
+	pshufd	xmm1,		xmm4,		00001110b
+	paddd	xmm4,		xmm1
+	pshufd	xmm1,		xmm4,		00000001b
+	paddd	xmm4,		xmm1
+	movd	[edx],		xmm4
+	add		edx,		4
+	mov		[psqdiff16x16],	edx
+	; pointers go back up 16 rows and right 16 bytes
+	sub		esi,	eax
+	sub		edi,	eax
+	add		esi,	16
+	add		edi,	16
+
+	mov		ecx,	[tmp_ecx]
+	dec		ecx
+	jnz		sqdiff_bgd_width_loop
+
+	mov		esi,	[tmp_esi]
+	mov		edi,	[tmp_edi]
+	add		esi,	eax
+	add		edi,	eax
+
+	dec	dword [iPicHeight]
+	jnz		sqdiff_bgd_height_loop
+
+	mov		edx,	[psadframe]
+	mov		ebp,	[tmp_sadframe]
+	mov		[edx],	ebp
+
+	add		esp,	localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		psum16x16
+%undef		psqsum16x16
+%undef		psqdiff16x16
+%undef		p_sd8x8
+%undef		p_mad8x8
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		tmp_sadframe
+%undef		tmp_ecx
+%undef		pushsize
+%undef		localsize
+	ret
+%endif
--- /dev/null
+++ b/codec/processing/src/backgounddetection/BackgroundDetection.cpp
@@ -1,0 +1,389 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "BackgroundDetection.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+// Tuning constants for background detection. An "OU" is the square luma block the
+// detector classifies; with LOG2_BGD_OU_SIZE == 4 it is 16x16 luma samples.
+#define LOG2_BGD_OU_SIZE    (4)
+// log2 of the OU edge in a chroma plane (half the luma edge).
+#define LOG2_BGD_OU_SIZE_UV (LOG2_BGD_OU_SIZE-1)
+// OU edge length in luma samples (16).
+#define BGD_OU_SIZE         (1<<LOG2_BGD_OU_SIZE)
+// OU edge length in chroma samples (8).
+#define BGD_OU_SIZE_UV      (BGD_OU_SIZE>>1)
+// SAD threshold used by the coarse division (2 per luma sample of the OU, i.e. 512).
+#define BGD_THD_SAD         (2*BGD_OU_SIZE*BGD_OU_SIZE)
+// Threshold on the absolute sum of differences along one chroma OU edge (32).
+#define	BGD_THD_ASD_UV      (4*BGD_OU_SIZE_UV)
+// log2 of the macroblock edge; the code shifts by (LOG2_BGD_OU_SIZE - LOG2_MB_SIZE) to turn an OU index into an MB index.
+#define LOG2_MB_SIZE        (4)
+// Number of macroblocks along one OU edge (1 with the values above).
+#define OU_SIZE_IN_MB       (BGD_OU_SIZE >> 4)
+// Scale factor: BGD_OU_SIZE * Q_FACTOR is the "small SAD / small SD spread" threshold used throughout this file.
+#define Q_FACTOR            (8)
+// NOTE(review): not referenced anywhere in this file; presumably used by other code or left over - confirm.
+#define BGD_DELTA_QP_THD    (3)
+
+// Bit positions of the neighbour-foreground mask passed to ForegroundDilation23Chroma.
+#define OU_LEFT		(0x01)
+#define OU_RIGHT	(0x02)
+#define OU_TOP		(0x04)
+#define OU_BOTTOM	(0x08)
+
+// Creates the background-detection strategy with a cleared parameter block.
+// iCpuFlag is accepted but not read by this constructor.
+CBackgroundDetection::CBackgroundDetection (int32_t iCpuFlag) {
+  // No frame has been seen yet, so no frame size has been provisioned for.
+  m_iLargestFrameSize = 0;
+  // Clears every field of the parameter block (plane pointers, OU array, Set() inputs).
+  WelsMemset (&m_BgdParam, 0, sizeof (m_BgdParam));
+  m_eMethod = METHOD_BACKGROUND_DETECTION;
+}
+
+// Releases the OU array that Process() allocated, if any.
+CBackgroundDetection::~CBackgroundDetection() {
+  FreeOUArrayMemory();
+}
+
+// Runs background detection for one frame.
+//   iType      - not read by this method.
+//   pSrcPixMap - current picture; supplies plane pointers, picture size and strides.
+//   pRefPixMap - reference picture; only its plane pointers are read, so the reference
+//                planes are walked with the strides of pSrcPixMap.
+// Returns RET_SUCCESS after detection has run. Returns RET_INVALIDPARAM when a pixmap is
+// NULL, when Set() has not yet supplied the VAA results / output flag buffer, or when the
+// OU array could not be allocated.
+EResult CBackgroundDetection::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  EResult eReturn = RET_INVALIDPARAM;
+
+  if (pSrcPixMap == NULL || pRefPixMap == NULL)
+    return eReturn;
+
+  // Both pointers are only populated by Set() and are dereferenced unconditionally by the
+  // detection passes, so refuse to run without them instead of crashing.
+  if (m_BgdParam.pCalcRes == NULL || m_BgdParam.pBackgroundMbFlag == NULL)
+    return eReturn;
+
+  m_BgdParam.pCur[0] = (uint8_t*)pSrcPixMap->pPixel[0];
+  m_BgdParam.pCur[1] = (uint8_t*)pSrcPixMap->pPixel[1];
+  m_BgdParam.pCur[2] = (uint8_t*)pSrcPixMap->pPixel[2];
+  m_BgdParam.pRef[0] = (uint8_t*)pRefPixMap->pPixel[0];
+  m_BgdParam.pRef[1] = (uint8_t*)pRefPixMap->pPixel[1];
+  m_BgdParam.pRef[2] = (uint8_t*)pRefPixMap->pPixel[2];
+  m_BgdParam.iBgdWidth = pSrcPixMap->sRect.iRectWidth;
+  m_BgdParam.iBgdHeight = pSrcPixMap->sRect.iRectHeight;
+  m_BgdParam.iStride[0] = pSrcPixMap->iStride[0];
+  m_BgdParam.iStride[1] = pSrcPixMap->iStride[1];
+  m_BgdParam.iStride[2] = pSrcPixMap->iStride[2];
+
+  // The OU array is kept across frames and only re-allocated when a frame with a larger
+  // luma area than any previous one arrives (or when there is no array yet).
+  int32_t iCurFrameSize = m_BgdParam.iBgdWidth * m_BgdParam.iBgdHeight;
+  if (m_BgdParam.pOU_array == NULL || iCurFrameSize > m_iLargestFrameSize) {
+    FreeOUArrayMemory();
+    m_BgdParam.pOU_array = AllocateOUArrayMemory (m_BgdParam.iBgdWidth, m_BgdParam.iBgdHeight);
+    m_iLargestFrameSize = iCurFrameSize;
+  }
+
+  if (m_BgdParam.pOU_array == NULL)
+    return eReturn;
+
+  BackgroundDetection (&m_BgdParam);
+
+  return RET_SUCCESS;
+}
+
+// Stores the inputs and outputs that Process() works with.
+//   iType  - not read by this method.
+//   pParam - treated as an SBGDInterface*; its pCalcRes and pBackgroundMbFlag pointers are
+//            copied (the pointed-to buffers are not).
+// Returns RET_INVALIDPARAM when pParam is NULL, RET_SUCCESS otherwise.
+EResult CBackgroundDetection::Set (int32_t iType, void* pParam) {
+  SBGDInterface* pBgdInterface = (SBGDInterface*)pParam;
+  if (pBgdInterface == NULL)
+    return RET_INVALIDPARAM;
+
+  m_BgdParam.pBackgroundMbFlag = (int8_t*)pBgdInterface->pBackgroundMbFlag;
+  m_BgdParam.pCalcRes          = pBgdInterface->pCalcRes;
+  return RET_SUCCESS;
+}
+
+// Allocates one SBackgroundOU per OU of an iWidth x iHeight picture, rounding both
+// dimensions up to a whole number of OUs. Returns whatever WelsMalloc returns.
+inline SBackgroundOU* CBackgroundDetection::AllocateOUArrayMemory (int32_t iWidth, int32_t iHeight) {
+  const int32_t kiRoundUp = BGD_OU_SIZE - 1;
+  const int32_t kiOUCols  = (kiRoundUp + iWidth) >> LOG2_BGD_OU_SIZE;
+  const int32_t kiOURows  = (kiRoundUp + iHeight) >> LOG2_BGD_OU_SIZE;
+
+  return (SBackgroundOU*)WelsMalloc (kiOUCols * kiOURows * sizeof (SBackgroundOU));
+}
+
+// Releases the OU array through _SafeFree.
+// NOTE(review): _SafeFree is defined elsewhere; presumably it tolerates NULL and resets the
+// pointer - confirm, since the destructor calls this even when nothing was allocated.
+inline void CBackgroundDetection::FreeOUArrayMemory() {
+  _SafeFree (m_BgdParam.pOU_array);
+}
+
+// Fills the statistics of one OU from the four 8x8 sub-block results of macroblock iMbIndex.
+//   sVaaCalcInfo - source of the per-MB pSad8x8, pSumOfDiff8x8 and pMad8x8 tables.
+//   iMbIndex     - row of those tables to read.
+//   iMbWidth     - not read by this method.
+//   pBgdOU       - receives iSAD, iSD, iMAD, iMinSubMad and iMaxDiffSubSd.
+void CBackgroundDetection::GetOUParameters (SVAACalcResult* sVaaCalcInfo, int32_t iMbIndex, int32_t iMbWidth,
+    SBackgroundOU* pBgdOU) {
+  const int32_t* pSubSad = sVaaCalcInfo->pSad8x8[iMbIndex];
+  const int32_t* pSubSd  = sVaaCalcInfo->pSumOfDiff8x8[iMbIndex];
+  const uint8_t* pSubMad = sVaaCalcInfo->pMad8x8[iMbIndex];
+
+  // Running totals and extrema over the four sub-blocks, seeded from sub-block 0.
+  int32_t iSumSad = pSubSad[0];
+  int32_t iSumSd  = pSubSd[0];
+  int32_t iMaxSd  = pSubSd[0];
+  int32_t iMinSd  = pSubSd[0];
+  int32_t iMaxMad = pSubMad[0];
+  int32_t iMinMad = pSubMad[0];
+
+  for (int32_t k = 1; k < 4; k++) {
+    const int32_t kiSd  = pSubSd[k];
+    const int32_t kiMad = pSubMad[k];
+
+    iSumSad += pSubSad[k];
+    iSumSd  += kiSd;
+    iMaxSd  = WELS_MAX (iMaxSd, kiSd);
+    iMinSd  = WELS_MIN (iMinSd, kiSd);
+    iMaxMad = WELS_MAX (iMaxMad, kiMad);
+    iMinMad = WELS_MIN (iMinMad, kiMad);
+  }
+
+  pBgdOU->iSAD = iSumSad;
+  // The signed sub-block sums are added first; only the total is made absolute.
+  pBgdOU->iSD  = WELS_ABS (iSumSd);
+
+  // largest and smallest max-absolute-difference among the sub-blocks
+  pBgdOU->iMAD       = iMaxMad;
+  pBgdOU->iMinSubMad = iMinMad;
+
+  // spread between the largest and smallest signed sub-block SD
+  pBgdOU->iMaxDiffSubSd = iMaxSd - iMinSd;
+}
+
+// First pass: computes the statistics of every whole OU of the picture (partial OUs at the
+// right/bottom edge are not visited) and gives each a coarse background (1) / foreground (0)
+// flag based on its MAD, SAD, SD and sub-block SD spread.
+void CBackgroundDetection::ForegroundBackgroundDivision (vBGDParam* pBgdParam) {
+  const int32_t kiOUCols = pBgdParam->iBgdWidth  >> LOG2_BGD_OU_SIZE;
+  const int32_t kiOURows = pBgdParam->iBgdHeight >> LOG2_BGD_OU_SIZE;
+  const int32_t kiMbCols = (15 + pBgdParam->iBgdWidth) >> 4;
+
+  SBackgroundOU* pOU = pBgdParam->pOU_array;
+
+  for (int32_t iRow = 0; iRow < kiOURows; iRow++) {
+    for (int32_t iCol = 0; iCol < kiOUCols; iCol++, pOU++) {
+      const int32_t kiMbIdx = (iRow * kiMbCols + iCol) << (LOG2_BGD_OU_SIZE - LOG2_MB_SIZE);
+      GetOUParameters (pBgdParam->pCalcRes, kiMbIdx, kiMbCols, pOU);
+
+      // Foreground unless every test below passes.
+      pOU->iBackgroundFlag = 0;
+
+      // A single large sample difference rules the OU out immediately.
+      if (pOU->iMAD > 63)
+        continue;
+
+      // The sub-block SDs must be close to each other (relative to the SAD, or absolutely) ...
+      if (pOU->iMaxDiffSubSd > (pOU->iSAD >> 3) && pOU->iMaxDiffSubSd > (BGD_OU_SIZE * Q_FACTOR))
+        continue;
+      // ... and the SAD must stay below twice the SAD threshold.
+      if (pOU->iSAD >= (BGD_THD_SAD << 1))
+        continue;
+
+      if (pOU->iSAD <= BGD_OU_SIZE * Q_FACTOR) {
+        pOU->iBackgroundFlag = 1;
+      } else if (pOU->iSAD < BGD_THD_SAD) {
+        pOU->iBackgroundFlag = (pOU->iSD < ((pOU->iSAD * 3) >> 2));
+      } else {
+        pOU->iBackgroundFlag = ((pOU->iSD << 1) < pOU->iSAD);
+      }
+    }
+  }
+}
+// Returns the absolute value of the summed (current - reference) differences over
+// BGD_OU_SIZE_UV samples, stepping both pointers by iStride between samples.
+inline int32_t CBackgroundDetection::CalculateAsdChromaEdge (uint8_t* pOriRef, uint8_t* pOriCur, int32_t iStride) {
+  int32_t iSignedSum = 0;
+  int32_t iOffset    = 0;
+
+  for (int32_t iSample = 0; iSample < BGD_OU_SIZE_UV; iSample++, iOffset += iStride) {
+    iSignedSum += pOriCur[iOffset] - pOriRef[iOffset];
+  }
+
+  return WELS_ABS (iSignedSum);
+}
+
+// Luma test that decides whether an OU should be treated as foreground, by comparing its
+// MAD with the MADs of its foreground and background neighbours.
+//   pBackgroundOU - OU under test.
+//   pOUNeighbours - four neighbour OUs; all four are read.
+// Returns non-zero when the OU should become foreground, 0 otherwise.
+inline bool_t CBackgroundDetection::ForegroundDilation23Luma (SBackgroundOU* pBackgroundOU,
+    SBackgroundOU* pOUNeighbours[]) {
+  // Only OUs whose largest sub-block MAD is more than twice the smallest are candidates.
+  if (pBackgroundOU->iMAD <= (pBackgroundOU->iMinSubMad << 1))
+    return 0;
+
+  // (flag - 1) is all-ones for flag == 0 and zero for flag == 1, so each mask keeps the MAD
+  // of a neighbour only when that neighbour is foreground (resp. background).
+  int32_t iMaxNbrForegroundMad = (pOUNeighbours[0]->iBackgroundFlag - 1) & pOUNeighbours[0]->iMAD;
+  int32_t iMaxNbrBackgroundMad = ((!pOUNeighbours[0]->iBackgroundFlag) - 1) & pOUNeighbours[0]->iMAD;
+
+  for (int32_t k = 1; k < 4; k++) {
+    const SBackgroundOU* pNbr = pOUNeighbours[k];
+    const int32_t kiForegroundMad = (pNbr->iBackgroundFlag - 1) & pNbr->iMAD;
+    const int32_t kiBackgroundMad = ((!pNbr->iBackgroundFlag) - 1) & pNbr->iMAD;
+
+    iMaxNbrForegroundMad = WELS_MAX (iMaxNbrForegroundMad, kiForegroundMad);
+    iMaxNbrBackgroundMad = WELS_MAX (iMaxNbrBackgroundMad, kiBackgroundMad);
+  }
+
+  if (iMaxNbrForegroundMad > (pBackgroundOU->iMinSubMad << 2))
+    return 1;
+
+  return (pBackgroundOU->iMAD > (iMaxNbrBackgroundMad << 1)
+          && pBackgroundOU->iMAD <= ((iMaxNbrForegroundMad * 3) >> 1));
+}
+
+// Chroma test: for every side of the OU that touches a foreground neighbour, compares the
+// current and reference chroma samples along that edge.
+//   iNeighbourForegroundFlags - OU_LEFT/OU_RIGHT/OU_TOP/OU_BOTTOM bits of foreground neighbours.
+//   iStartSamplePos           - offset of the OU's first sample inside a chroma plane.
+//   iPicStrideUV              - chroma stride, used for both the current and reference planes.
+//   pBgdParam                 - supplies the pCur / pRef plane pointers.
+// Returns 1 as soon as one edge exceeds BGD_THD_ASD_UV, 0 otherwise.
+inline bool_t CBackgroundDetection::ForegroundDilation23Chroma (int8_t iNeighbourForegroundFlags,
+    int32_t iStartSamplePos, int32_t iPicStrideUV, vBGDParam* pBgdParam) {
+  static const int8_t kaOUPos[4] = {OU_LEFT, OU_RIGHT, OU_TOP, OU_BOTTOM};
+  // Plane 2 is examined before plane 1; the original author's rationale is that the V
+  // component is the more likely one to trigger (skin tones weigh more on it).
+  static const int32_t kaPlaneOrder[2] = {2, 1};
+
+  const int32_t kiLastLine = BGD_OU_SIZE_UV - 1;
+  // First sample of the left / right / top / bottom edge, relative to the OU origin ...
+  const int32_t kaEdgeStart[4] = {0, kiLastLine, 0, iPicStrideUV * kiLastLine};
+  // ... and the step between consecutive samples along that edge.
+  const int32_t kaEdgeStep[4]  = {iPicStrideUV, iPicStrideUV, 1, 1};
+
+  for (int32_t iPass = 0; iPass < 2; iPass++) {
+    const int32_t kiPlane = kaPlaneOrder[iPass];
+
+    for (int32_t iEdge = 0; iEdge < 4; iEdge++) {
+      if (! (iNeighbourForegroundFlags & kaOUPos[iEdge]))
+        continue;
+
+      uint8_t* pRefEdge = pBgdParam->pRef[kiPlane] + iStartSamplePos + kaEdgeStart[iEdge];
+      uint8_t* pCurEdge = pBgdParam->pCur[kiPlane] + iStartSamplePos + kaEdgeStart[iEdge];
+      if (CalculateAsdChromaEdge (pRefEdge, pCurEdge, kaEdgeStep[iEdge]) > BGD_THD_ASD_UV)
+        return 1;
+    }
+  }
+
+  return 0;
+}
+
+// Re-examines an OU currently flagged as background and may turn it into foreground,
+// depending on how many of its four neighbours are background.
+//   pBackgroundOU         - OU under test; only its iBackgroundFlag is written.
+//   pOUNeighbours         - left, right, top, bottom neighbours.
+//   pBgdParam             - supplies the chroma stride and planes for the chroma test.
+//   iChromaSampleStartPos - offset of this OU inside a chroma plane.
+inline void CBackgroundDetection::ForegroundDilation (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[],
+    vBGDParam* pBgdParam, int32_t iChromaSampleStartPos) {
+  const int32_t kiBackgroundNeighbours = pOUNeighbours[0]->iBackgroundFlag + pOUNeighbours[1]->iBackgroundFlag +
+                                         pOUNeighbours[2]->iBackgroundFlag + pOUNeighbours[3]->iBackgroundFlag;
+
+  // OUs with a small SAD keep their background flag untouched.
+  if (pBackgroundOU->iSAD <= BGD_OU_SIZE * Q_FACTOR)
+    return;
+
+  // At most one background neighbour: always foreground.
+  if (kiBackgroundNeighbours == 0 || kiBackgroundNeighbours == 1) {
+    pBackgroundOU->iBackgroundFlag = 0;
+    return;
+  }
+
+  // Any count other than two or three leaves the flag as it is.
+  if (kiBackgroundNeighbours != 2 && kiBackgroundNeighbours != 3)
+    return;
+
+  // Two or three background neighbours: the luma test decides first ...
+  if (ForegroundDilation23Luma (pBackgroundOU, pOUNeighbours)) {
+    pBackgroundOU->iBackgroundFlag = 0;
+    return;
+  }
+
+  // ... and if luma still says background, the chroma edges facing foreground neighbours
+  // get the final word.
+  const int8_t kiNeighbourForegroundFlags = !pOUNeighbours[0]->iBackgroundFlag
+      | ((!pOUNeighbours[1]->iBackgroundFlag) << 1)
+      | ((!pOUNeighbours[2]->iBackgroundFlag) << 2)
+      | ((!pOUNeighbours[3]->iBackgroundFlag) << 3);
+  const int32_t kiPicStrideUV = pBgdParam->iStride[1];
+
+  pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Chroma (kiNeighbourForegroundFlags, iChromaSampleStartPos,
+                                   kiPicStrideUV, pBgdParam);
+}
+// Re-examines an OU currently flagged as foreground and may turn it into background when
+// its sub-block SDs are close together and its SAD is in line with its background neighbours.
+//   pBackgroundOU - OU under test; only its iBackgroundFlag is written.
+//   pOUNeighbours - left, right, top, bottom neighbours.
+inline void CBackgroundDetection::BackgroundErosion (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[]) {
+  if (pBackgroundOU->iMaxDiffSubSd > (BGD_OU_SIZE * Q_FACTOR))
+    return;
+
+  const SBackgroundOU* pLeft   = pOUNeighbours[0];
+  const SBackgroundOU* pRight  = pOUNeighbours[1];
+  const SBackgroundOU* pTop    = pOUNeighbours[2];
+  const SBackgroundOU* pBottom = pOUNeighbours[3];
+
+  const int32_t kiBackgroundNeighbours = pLeft->iBackgroundFlag + pRight->iBackgroundFlag +
+                                         pTop->iBackgroundFlag + pBottom->iBackgroundFlag;
+  // -flag is all-ones for flag == 1 and zero for flag == 0, so only the SADs of background
+  // neighbours contribute to the sum.
+  const int32_t kiBackgroundSadSum = (pLeft->iSAD & (-pLeft->iBackgroundFlag)) + (pTop->iSAD & (-pTop->iBackgroundFlag))
+                                     + (pRight->iSAD & (-pRight->iBackgroundFlag))
+                                     + (pBottom->iSAD & (-pBottom->iBackgroundFlag));
+
+  // The OU's SAD must not exceed 1.5 times the average SAD of its background neighbours.
+  if (pBackgroundOU->iSAD * kiBackgroundNeighbours > ((3 * kiBackgroundSadSum) >> 1))
+    return;
+
+  if (kiBackgroundNeighbours == 4) {
+    pBackgroundOU->iBackgroundFlag = 1;
+    return;
+  }
+
+  // Otherwise an opposite pair (left+right or top+bottom) must both be background before
+  // the luma test is allowed to decide.
+  if ((pLeft->iBackgroundFlag & pRight->iBackgroundFlag) || (pTop->iBackgroundFlag & pBottom->iBackgroundFlag)) {
+    pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Luma (pBackgroundOU, pOUNeighbours);
+  }
+}
+
+// Writes iBackgroundMbFlag (narrowed to int8_t) to the single entry pBackgroundMbFlag points at.
+// iPicWidthInMb is not read by this method.
+inline void CBackgroundDetection::SetBackgroundMbFlag (int8_t* pBackgroundMbFlag, int32_t iPicWidthInMb,
+    int32_t iBackgroundMbFlag) {
+  *pBackgroundMbFlag = iBackgroundMbFlag;
+}
+
+// Re-checks an OU from the row above once the row below it has been processed: when its SAD
+// is not small and at most one of its four neighbours is background, both the OU and its
+// entry in the MB flag buffer are reset to foreground.
+//   pCurOU            - OU to re-check; pCurOU +/- 1 and pCurOU +/- iPicWidthInOU are read,
+//                       so the caller must guarantee those neighbours exist.
+//   pBackgroundMbFlag - MB flag entry that belongs to pCurOU.
+inline void CBackgroundDetection::UpperOUForegroundCheck (SBackgroundOU* pCurOU, int8_t* pBackgroundMbFlag,
+    int32_t iPicWidthInOU, int32_t iPicWidthInMb) {
+  if (pCurOU->iSAD <= BGD_OU_SIZE * Q_FACTOR)
+    return;
+
+  const int32_t kiBackgroundNeighbours = (pCurOU - 1)->iBackgroundFlag + (pCurOU + 1)->iBackgroundFlag
+                                         + (pCurOU - iPicWidthInOU)->iBackgroundFlag
+                                         + (pCurOU + iPicWidthInOU)->iBackgroundFlag;
+  if (kiBackgroundNeighbours > 1)
+    return;
+
+  SetBackgroundMbFlag (pBackgroundMbFlag, iPicWidthInMb, 0);
+  pCurOU->iBackgroundFlag = 0;
+}
+
+// Second pass: walks the OU array in raster order, refines every coarse flag with
+// ForegroundDilation (for background OUs) or BackgroundErosion (for foreground OUs), and
+// writes the resulting flag into the per-MB output buffer.
+// The four neighbour pointers are maintained incrementally; at picture borders a missing
+// neighbour is replaced by the OU itself (or, for "left", by the previously visited OU).
+void CBackgroundDetection::ForegroundDilationAndBackgroundErosion (vBGDParam* pBgdParam) {
+  int32_t iPicStrideUV		= pBgdParam->iStride[1];
+  int32_t iPicWidthInOU	= pBgdParam->iBgdWidth  >> LOG2_BGD_OU_SIZE;
+  int32_t iPicHeightInOU	= pBgdParam->iBgdHeight >> LOG2_BGD_OU_SIZE;
+  // distance, in chroma samples, between the first samples of two vertically adjacent OUs
+  int32_t iOUStrideUV		= iPicStrideUV << (LOG2_BGD_OU_SIZE - 1);
+  int32_t iPicWidthInMb	= (15 + pBgdParam->iBgdWidth) >> 4;
+
+  SBackgroundOU* pBackgroundOU = pBgdParam->pOU_array;
+  int8_t*	pVaaBackgroundMbFlag   = (int8_t*)pBgdParam->pBackgroundMbFlag;
+  SBackgroundOU*	pOUNeighbours[4];//0: left; 1: right; 2: top; 3: bottom
+
+  pBackgroundOU	= pBgdParam->pOU_array;
+  // the first row has no row above it, so "top" starts out as the OU itself
+  pOUNeighbours[2]	= pBackgroundOU;//top OU
+  for (int32_t j = 0; j < iPicHeightInOU; j ++) {
+    int8_t* pRowSkipFlag = pVaaBackgroundMbFlag;
+    // the first OU of a row has no left neighbour, so "left" starts out as the OU itself
+    pOUNeighbours[0]	= pBackgroundOU;//left OU
+    // ((j == last row) - 1) is 0 on the last row and all-ones otherwise, so "bottom" is the
+    // OU itself on the last row and the OU one row down everywhere else
+    pOUNeighbours[3]	= pBackgroundOU + (iPicWidthInOU & ((j == iPicHeightInOU - 1) - 1)); //bottom OU
+    for (int32_t i = 0; i < iPicWidthInOU; i++) {
+      // the comparison yields 1 except in the last column, where "right" is the OU itself
+      pOUNeighbours[1] = pBackgroundOU + (i < iPicWidthInOU - 1); //right OU
+
+      if (pBackgroundOU->iBackgroundFlag)
+        ForegroundDilation (pBackgroundOU, pOUNeighbours, pBgdParam, j * iOUStrideUV + (i << LOG2_BGD_OU_SIZE_UV));
+      else
+        BackgroundErosion (pBackgroundOU, pOUNeighbours);
+
+      // check the up OU
+      // UpperOUForegroundCheck reads the OUs left, right, above and below the upper OU, hence
+      // the restriction to j > 1 (a row above the upper row exists) and to interior columns
+      if (j > 1 && i > 0 && i < iPicWidthInOU - 1 && pOUNeighbours[2]->iBackgroundFlag == 1) {
+        UpperOUForegroundCheck (pOUNeighbours[2], pRowSkipFlag - OU_SIZE_IN_MB * iPicWidthInMb, iPicWidthInOU, iPicWidthInMb);
+      }
+
+      SetBackgroundMbFlag (pRowSkipFlag, iPicWidthInMb, pBackgroundOU->iBackgroundFlag);
+
+      // preparation for the next OU
+      pRowSkipFlag += OU_SIZE_IN_MB;
+      pOUNeighbours[0] = pBackgroundOU;
+      pOUNeighbours[2]++;
+      pOUNeighbours[3]++;
+      pBackgroundOU++;
+    }
+    // pBackgroundOU now points at the first OU of the next row, so this is the first OU of
+    // the row just finished, i.e. the "top" neighbour for the next row
+    pOUNeighbours[2]	= pBackgroundOU - iPicWidthInOU;
+    pVaaBackgroundMbFlag += OU_SIZE_IN_MB * iPicWidthInMb;
+  }
+}
+
+// Runs the two detection passes in order on the frame described by pBgdParam. The second
+// pass consumes the per-OU statistics and coarse flags produced by the first.
+void CBackgroundDetection::BackgroundDetection (vBGDParam* pBgdParam) {
+  // 1st step: foreground/background coarse division
+  ForegroundBackgroundDivision (pBgdParam);
+
+  // 2nd step: foreground dilation and background erosion
+  ForegroundDilationAndBackgroundErosion (pBgdParam);
+}
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/backgounddetection/BackgroundDetection.h
@@ -1,0 +1,106 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	       :  BackgroundDetection.h
+ *
+ * \brief	     :  background detection class of wels video processor class
+ *
+ * \date        :  2011/03/17
+ *
+ * \description :  1. rewrite the package code of background detection class
+ *
+ */
+
+#ifndef WELSVP_BACKGROUNDDETECTION_H
+#define WELSVP_BACKGROUNDDETECTION_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+// Statistics and classification result of one OU (the square block the background detector
+// works on). The statistics are derived from the four 8x8 sub-block results of the OU.
+typedef struct {
+  int32_t	iBackgroundFlag;  // 1: background, 0: foreground
+  int32_t	iSAD;             // sum of the four sub-block SADs
+  int32_t	iSD;              // absolute value of the summed signed sub-block SDs
+  int32_t	iMAD;             // largest sub-block MAD
+  int32_t	iMinSubMad;       // smallest sub-block MAD
+  int32_t	iMaxDiffSubSd;    // largest minus smallest signed sub-block SD
+} SBackgroundOU;
+
+// Background-detection strategy of the video processor. Set() supplies the VAA calculation
+// results and the output flag buffer; Process() then classifies the current picture against
+// a reference picture and writes one background flag per macroblock.
+class CBackgroundDetection : public IStrategy {
+ public:
+  CBackgroundDetection (int32_t iCpuFlag);
+  ~CBackgroundDetection();
+
+  // pSrc is the current picture, pRef the reference picture.
+  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef);
+  // pParam is interpreted as an SBGDInterface*.
+  EResult Set (int32_t iType, void* pParam);
+
+ private:
+  // Everything one detection run needs; filled by Process() and Set().
+  struct vBGDParam {
+    uint8_t*   pCur[3];               // planes of the current picture
+    uint8_t*   pRef[3];               // planes of the reference picture
+    int32_t	   iBgdWidth;             // picture width taken from the source rectangle
+    int32_t	   iBgdHeight;            // picture height taken from the source rectangle
+    int32_t    iStride[3];            // strides of the current picture, also applied to pRef
+    SBackgroundOU*  	pOU_array;      // per-OU working array, owned by this object
+    int8_t*  	pBackgroundMbFlag;    // output: one flag per macroblock, supplied via Set()
+    SVAACalcResult*  pCalcRes;        // input: per-MB 8x8 statistics, supplied via Set()
+  } m_BgdParam;
+
+  // Largest width*height seen so far; pOU_array is re-allocated only when it is exceeded.
+  int32_t     m_iLargestFrameSize;
+
+ private:
+  inline SBackgroundOU* AllocateOUArrayMemory (int32_t iWidth, int32_t iHeight);
+  inline void     FreeOUArrayMemory();
+  inline int32_t  CalculateAsdChromaEdge (uint8_t* pOriRef, uint8_t* pOriCur, int32_t iStride);
+  inline bool_t   ForegroundDilation23Luma (SBackgroundOU* pBackgroundOU,
+      SBackgroundOU* pOUNeighbours[]); //Foreground_Dilation_2_3_Luma
+  inline bool_t   ForegroundDilation23Chroma (int8_t iNeighbourForegroundFlags, int32_t iStartSamplePos,
+      int32_t iPicStrideUV, vBGDParam* pBgdParam);//Foreground_Dilation_2_3_Chroma
+  inline void     ForegroundDilation (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[], vBGDParam* pBgdParam,
+                                      int32_t	iChromaSampleStartPos);
+  inline void     BackgroundErosion (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[]);
+  inline void     SetBackgroundMbFlag (int8_t* pBackgroundMbFlag, int32_t iPicWidthInMb, int32_t iBackgroundMbFlag);
+  inline void     UpperOUForegroundCheck (SBackgroundOU* pCurOU, int8_t* pBackgroundMbFlag, int32_t iPicWidthInOU,
+                                          int32_t iPicWidthInMb);
+
+  // Derives the statistics of one OU from the VAA results of macroblock iMbIndex.
+  void    GetOUParameters (SVAACalcResult* sVaaCalcInfo, int32_t iMbIndex, int32_t iMbWidth,
+                           SBackgroundOU* pBackgroundOU);
+  // Pass 1: coarse per-OU classification.
+  void    ForegroundBackgroundDivision (vBGDParam* pBgdParam);
+  // Pass 2: neighbourhood-based refinement and output of the per-MB flags.
+  void    ForegroundDilationAndBackgroundErosion (vBGDParam* pBgdParam);
+  // Runs pass 1 followed by pass 2.
+  void    BackgroundDetection (vBGDParam* pBgdParam);
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/common/WelsFrameWork.cpp
@@ -1,0 +1,301 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "WelsFrameWork.h"
+#include "cpu.h"
+#include "../denoise/denoise.h"
+#include "../downsample/downsample.h"
+#include "../scenechangedetection/SceneChangeDetection.h"
+#include "../vaacalc/vaacalculation.h"
+#include "../backgounddetection/BackgroundDetection.h"
+#include "../adaptivequantization/AdaptiveQuantization.h"
+#include "../complexityanalysis/ComplexityAnalysis.h"
+#include "../imagerotate/imagerotate.h"
+
+
+/* interface API implement */
+
+// Exported factory. Bit 0x8000 of iVersion selects the IWelsVP flavour of the interface;
+// otherwise any bit in 0x7fff selects the IWelsVPc flavour. iVersion == 0 is rejected with
+// RET_INVALIDPARAM.
+EResult WELSAPI CreateVpInterface (void** ppCtx, int iVersion) {
+  if (iVersion & 0x8000)
+    return nsWelsVP::CreateSpecificVpInterface ((IWelsVP**)ppCtx);
+
+  if ((iVersion & 0x7fff) == 0)
+    return RET_INVALIDPARAM;
+
+  return nsWelsVP::CreateSpecificVpInterface ((IWelsVPc**)ppCtx);
+}
+
+// Exported counterpart of CreateVpInterface; iVersion selects the interface flavour with
+// the same bit tests, and iVersion == 0 is rejected with RET_INVALIDPARAM.
+EResult WELSAPI DestroyVpInterface (void* pCtx, int iVersion) {
+  if (iVersion & 0x8000)
+    return nsWelsVP::DestroySpecificVpInterface ((IWelsVP*)pCtx);
+
+  if ((iVersion & 0x7fff) == 0)
+    return RET_INVALIDPARAM;
+
+  return nsWelsVP::DestroySpecificVpInterface ((IWelsVPc*)pCtx);
+}
+
+WELSVP_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////
+
+// Creates a CVpFrameWork and hands it out through *ppCtx as an IWelsVP.
+//   ppCtx - receives the new object; must not be NULL.
+// Returns RET_SUCCESS when *ppCtx was set, RET_INVALIDPARAM when ppCtx is NULL, and
+// RET_FAILED when no object was obtained.
+EResult CreateSpecificVpInterface (IWelsVP** ppCtx) {
+  // ppCtx comes straight from the caller of the exported factory; reject NULL here rather
+  // than writing through it after the framework has already been constructed.
+  if (ppCtx == NULL)
+    return RET_INVALIDPARAM;
+
+  EResult  eReturn = RET_FAILED;
+
+  CVpFrameWork* pFr = new CVpFrameWork (1, eReturn);
+  if (pFr) {
+    *ppCtx  = (IWelsVP*)pFr;
+    eReturn = RET_SUCCESS;
+  }
+
+  return eReturn;
+}
+
+// Destroys an object obtained from CreateSpecificVpInterface (IWelsVP**). Always returns
+// RET_SUCCESS.
+// NOTE(review): _SafeDelete is defined elsewhere; presumably it skips NULL and deletes
+// through the IWelsVP pointer - confirm that IWelsVP has a virtual destructor.
+EResult DestroySpecificVpInterface (IWelsVP* pCtx) {
+  _SafeDelete (pCtx);
+
+  return RET_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Builds the framework: detects CPU features (only when X86_ASM is defined), creates one
+// strategy object per method slot and initialises the mutex that serialises strategy calls.
+//   uiThreadsNum - not read by this constructor.
+//   eReturn      - always set to RET_SUCCESS.
+CVpFrameWork::CVpFrameWork (uint32_t uiThreadsNum, EResult& eReturn) {
+  int32_t iCoreNum = 1;
+#ifndef X86_ASM
+  uint32_t uiCPUFlag = 0;
+#else
+  uint32_t uiCPUFlag = WelsCPUFeatureDetect (&iCoreNum);
+#endif
+
+  // Slot i holds the strategy for method value i + 1; CreateStrategy returns NULL for
+  // methods it does not implement. The slot is written directly: m_pStgChain has not been
+  // initialised at this point, so its previous content must not be read.
+  for (int32_t i = 0; i < MAX_STRATEGY_NUM; i++) {
+    m_pStgChain[i] = CreateStrategy (WelsStaticCast (EMethods, i + 1), uiCPUFlag);
+  }
+
+  WelsMutexInit (&m_mutes);
+
+  eReturn = RET_SUCCESS;
+}
+
+// Uninitialises and deletes every strategy that was created, then destroys the mutex.
+CVpFrameWork::~CVpFrameWork() {
+  for (int32_t iSlot = 0; iSlot < MAX_STRATEGY_NUM; iSlot++) {
+    if (!m_pStgChain[iSlot])
+      continue;
+
+    // Uninit() takes the mutex itself, so the mutex must still be alive here.
+    Uninit (m_pStgChain[iSlot]->m_eMethod);
+    _SafeDelete (m_pStgChain[iSlot]);
+  }
+
+  WelsMutexDestroy (&m_mutes);
+}
+
+// (Re-)initialises the strategy selected by iType with pCfg. The strategy is uninitialised
+// first. Returns the strategy's result, or RET_SUCCESS when the slot holds no strategy.
+EResult CVpFrameWork::Init (int32_t iType, void* pCfg) {
+  const int32_t kiSlot = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
+  EResult eResult      = RET_SUCCESS;
+
+  // Done before taking the mutex because Uninit() locks it on its own.
+  Uninit (iType);
+
+  WelsMutexLock (&m_mutes);
+  IStrategy* pSelected = m_pStgChain[kiSlot];
+  if (pSelected)
+    eResult = pSelected->Init (0, pCfg);
+  WelsMutexUnlock (&m_mutes);
+
+  return eResult;
+}
+
+// Uninitialises the strategy selected by iType under the framework mutex. Returns the
+// strategy's result, or RET_SUCCESS when the slot holds no strategy.
+EResult CVpFrameWork::Uninit (int32_t iType) {
+  const int32_t kiSlot = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
+  EResult eResult      = RET_SUCCESS;
+
+  WelsMutexLock (&m_mutes);
+  IStrategy* pSelected = m_pStgChain[kiSlot];
+  if (pSelected)
+    eResult = pSelected->Uninit (0);
+  WelsMutexUnlock (&m_mutes);
+
+  return eResult;
+}
+
+// Placeholder: performs no work and always reports RET_SUCCESS. iType is not read.
+EResult CVpFrameWork::Flush (int32_t iType) {
+  return RET_SUCCESS;
+}
+
+// Validates the pixmaps and forwards them to the strategy selected by iType.
+//   pSrcPixMap / pDstPixMap - either may be NULL; the strategy always receives pointers to
+//                             local copies, zero-filled when the argument was NULL.
+// Returns RET_INVALIDPARAM when CheckValid rejects the copies, RET_NOTSUPPORTED when the
+// slot holds no strategy, and the strategy's own result otherwise.
+EResult CVpFrameWork::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDstPixMap) {
+  const EMethods keMethod = WelsVpGetValidMethod (iType);
+  const int32_t  kiSlot   = WelsStaticCast (int32_t, keMethod) - 1;
+
+  SPixMap sSrcCopy;
+  SPixMap sDstCopy;
+  memset (&sSrcCopy, 0, sizeof (sSrcCopy)); // confirmed_safe_unsafe_usage
+  memset (&sDstCopy, 0, sizeof (sDstCopy)); // confirmed_safe_unsafe_usage
+  if (pSrcPixMap)
+    sSrcCopy = *pSrcPixMap;
+  if (pDstPixMap)
+    sDstCopy = *pDstPixMap;
+
+  if (!CheckValid (keMethod, sSrcCopy, sDstCopy))
+    return RET_INVALIDPARAM;
+
+  EResult eResult = RET_NOTSUPPORTED;
+
+  WelsMutexLock (&m_mutes);
+  IStrategy* pSelected = m_pStgChain[kiSlot];
+  if (pSelected)
+    eResult = pSelected->Process (0, &sSrcCopy, &sDstCopy);
+  WelsMutexUnlock (&m_mutes);
+
+  return eResult;
+}
+
+// Forwards a Get request to the strategy selected by iType under the framework mutex.
+// Returns RET_INVALIDPARAM for a NULL pParam, the strategy's result otherwise, or
+// RET_SUCCESS when the slot holds no strategy.
+EResult CVpFrameWork::Get (int32_t iType, void* pParam) {
+  const int32_t kiSlot = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
+
+  if (pParam == NULL)
+    return RET_INVALIDPARAM;
+
+  EResult eResult = RET_SUCCESS;
+
+  WelsMutexLock (&m_mutes);
+  IStrategy* pSelected = m_pStgChain[kiSlot];
+  if (pSelected)
+    eResult = pSelected->Get (0, pParam);
+  WelsMutexUnlock (&m_mutes);
+
+  return eResult;
+}
+
+// Forwards a Set request to the strategy selected by iType under the framework mutex.
+// Returns RET_INVALIDPARAM for a NULL pParam, the strategy's result otherwise, or
+// RET_SUCCESS when the slot holds no strategy.
+EResult CVpFrameWork::Set (int32_t iType, void* pParam) {
+  const int32_t kiSlot = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
+
+  if (pParam == NULL)
+    return RET_INVALIDPARAM;
+
+  EResult eResult = RET_SUCCESS;
+
+  WelsMutexLock (&m_mutes);
+  IStrategy* pSelected = m_pStgChain[kiSlot];
+  if (pSelected)
+    eResult = pSelected->Set (0, pParam);
+  WelsMutexUnlock (&m_mutes);
+
+  return eResult;
+}
+
+// Placeholder: performs no work and always reports RET_SUCCESS. No argument is read.
+EResult CVpFrameWork::SpecialFeature (int32_t iType, void* pIn, void* pOut) {
+  return RET_SUCCESS;
+}
+
+// Sanity-checks the method and the two pixmaps before a strategy is invoked. A pixmap whose
+// first plane pointer is NULL is treated as absent and skipped.
+// Returns TRUE when every check passes, FALSE otherwise.
+bool_t  CVpFrameWork::CheckValid (EMethods eMethod, SPixMap& pSrcPixMap, SPixMap& pDstPixMap) {
+  if (eMethod == METHOD_NULL)
+    return FALSE;
+
+  // Except for colour-space conversion, the source must be I420 or YV12 and, when a
+  // destination is present as well, the destination must use the same format.
+  if (eMethod != METHOD_COLORSPACE_CONVERT && pSrcPixMap.pPixel[0]) {
+    if (pSrcPixMap.eFormat != VIDEO_FORMAT_I420 && pSrcPixMap.eFormat != VIDEO_FORMAT_YV12)
+      return FALSE;
+    if (pDstPixMap.pPixel[0] && pDstPixMap.eFormat != pSrcPixMap.eFormat)
+      return FALSE;
+  }
+
+  // The same rectangle checks apply to the source first, then to the destination.
+  SPixMap* apPixMaps[2] = {&pSrcPixMap, &pDstPixMap};
+  for (int32_t iMap = 0; iMap < 2; iMap++) {
+    const SPixMap& kMap = *apPixMaps[iMap];
+    if (!kMap.pPixel[0])
+      continue;
+
+    // dimensions must be positive and within the supported maximum
+    if (kMap.sRect.iRectWidth <= 0 || kMap.sRect.iRectWidth > MAX_WIDTH)
+      return FALSE;
+    if (kMap.sRect.iRectHeight <= 0 || kMap.sRect.iRectHeight > MAX_HEIGHT)
+      return FALSE;
+
+    // the origin must lie inside the rectangle and a row must fit in the first-plane stride
+    if (kMap.sRect.iRectTop >= kMap.sRect.iRectHeight || kMap.sRect.iRectLeft >= kMap.sRect.iRectWidth)
+      return FALSE;
+    if (kMap.sRect.iRectWidth > kMap.iStride[0])
+      return FALSE;
+  }
+
+  return TRUE;
+}
+
+// Creates the strategy object that implements m_eMethod, passing iCpuFlag to its
+// constructor. Returns NULL for METHOD_COLORSPACE_CONVERT (not supported yet) and for any
+// method value not listed below.
+IStrategy* CVpFrameWork::CreateStrategy (EMethods m_eMethod, int32_t iCpuFlag) {
+  switch (m_eMethod) {
+  case METHOD_DENOISE:
+    return WelsDynamicCast (IStrategy*, new CDenoiser (iCpuFlag));
+  case METHOD_SCENE_CHANGE_DETECTION:
+    return WelsDynamicCast (IStrategy*, new CSceneChangeDetection (iCpuFlag));
+  case METHOD_DOWNSAMPLE:
+    return WelsDynamicCast (IStrategy*, new CDownsampling (iCpuFlag));
+  case METHOD_VAA_STATISTICS:
+    return WelsDynamicCast (IStrategy*, new CVAACalculation (iCpuFlag));
+  case METHOD_BACKGROUND_DETECTION:
+    return WelsDynamicCast (IStrategy*, new CBackgroundDetection (iCpuFlag));
+  case METHOD_ADAPTIVE_QUANT:
+    return WelsDynamicCast (IStrategy*, new CAdaptiveQuantization (iCpuFlag));
+  case METHOD_COMPLEXITY_ANALYSIS:
+    return WelsDynamicCast (IStrategy*, new CComplexityAnalysis (iCpuFlag));
+  case METHOD_IMAGE_ROTATE:
+    return WelsDynamicCast (IStrategy*, new CImageRotating (iCpuFlag));
+  case METHOD_COLORSPACE_CONVERT:
+    //not support yet
+  default:
+    return NULL;
+  }
+}
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/common/WelsFrameWork.h
@@ -1,0 +1,130 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  WelsFrameWork.h
+ *
+ * \brief	    :  framework of wels video processor class
+ *
+ * \date        :  2011/01/04
+ *
+ * \description :
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_WELSFRAMEWORK_H
+#define WELSVP_WELSFRAMEWORK_H
+
+#include "../../interface/IWelsVP.h"
+#include "util.h"
+#include "thread.h"
+
+WELSVP_NAMESPACE_BEGIN
+// Factory pair for the C++ interface (IWelsVP); the definitions are not in this header.
+EResult CreateSpecificVpInterface (IWelsVP** ppCtx);
+EResult DestroySpecificVpInterface (IWelsVP* pCtx);
+// Factory pair for the C function-table interface (IWelsVPc); defined in WelsFrameWorkEx.cpp.
+EResult CreateSpecificVpInterface (IWelsVPc** ppCtx);
+EResult DestroySpecificVpInterface (IWelsVPc* pCtx);
+// Number of entries in CVpFrameWork::m_pStgChain.
+#define MAX_STRATEGY_NUM (METHOD_MASK - 1)
+
+// Base class for the processing strategies. Every IWelsVP entry point except
+// Process() gets a default body that ignores its arguments and returns
+// RET_SUCCESS; Process() is pure virtual and has to be supplied by the
+// concrete strategy.
+class IStrategy : public IWelsVP {
+ public:
+  // Starts out as "no method", I420 format, index 0, not initialized.
+  IStrategy()
+    : m_eMethod (METHOD_NULL),
+      m_eFormat (VIDEO_FORMAT_I420),
+      m_iIndex (0),
+      m_bInit (FALSE) {
+  }
+
+  virtual ~IStrategy() {}
+
+ public:
+  // Optional hooks: no-ops that report success unless overridden.
+  virtual EResult Init (int32_t iType, void* pCfg) { return RET_SUCCESS; }
+  virtual EResult Uninit (int32_t iType) { return RET_SUCCESS; }
+  virtual EResult Flush (int32_t iType) { return RET_SUCCESS; }
+
+  virtual EResult Get (int32_t iType, void* pParam) { return RET_SUCCESS; }
+  virtual EResult Set (int32_t iType, void* pParam) { return RET_SUCCESS; }
+  virtual EResult SpecialFeature (int32_t iType, void* pIn, void* pOut) { return RET_SUCCESS; }
+
+  // Mandatory hook: the processing step itself.
+  virtual EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst) = 0;
+
+ public:
+  // Public state; the value after each member is the default set by the
+  // constructor above.
+  EMethods     m_eMethod;  // METHOD_NULL
+  EVideoFormat m_eFormat;  // VIDEO_FORMAT_I420
+  int32_t      m_iIndex;   // 0
+  bool_t       m_bInit;    // FALSE
+
+};
+// IWelsVP implementation that keeps an array of IStrategy pointers plus a mutex and can create a strategy per EMethods value.
+class CVpFrameWork : public IWelsVP {
+ public:
+  CVpFrameWork (uint32_t uiThreadsNum, EResult& ret);  // ret is an output reference; presumably the construction status - confirm in WelsFrameWork.cpp
+  ~CVpFrameWork();
+
+ public:
+  EResult Init (int32_t iType, void* pCfg);
+
+  EResult Uninit (int32_t iType);
+
+  EResult Flush (int32_t iType);
+
+  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst);
+
+  EResult Get (int32_t iType, void* pParam);
+
+  EResult Set (int32_t iType, void* pParam);
+
+  EResult SpecialFeature (int32_t iType, void* pIn, void* pOut);
+
+ private:
+  bool_t  CheckValid (EMethods eMethod, SPixMap& sSrc, SPixMap& sDst);  // sanity check of the source/destination pixmaps
+  IStrategy* CreateStrategy (EMethods eMethod, int32_t iCpuFlag);  // factory: new strategy for eMethod, or NULL
+
+ private:
+  IStrategy* m_pStgChain[MAX_STRATEGY_NUM];  // strategy slots; presumably one per method - confirm in WelsFrameWork.cpp
+
+  WELS_MUTEX m_mutes;  // mutex member (spelled "m_mutes" in the sources); its use is not visible in this header
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/common/WelsFrameWorkEx.cpp
@@ -1,0 +1,96 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "WelsFrameWork.h"
+
+///////////////////////////////////////////////////////////////////////
+
+WELSVP_NAMESPACE_BEGIN
+// C-callable trampolines: a NULL context yields RET_INVALIDPARAM, otherwise the call is forwarded to the IWelsVP behind pCtx.
+EResult Init (void* pCtx, int32_t iType, void* pCfg) {
+  return (NULL == pCtx) ? RET_INVALIDPARAM : WelsStaticCast (IWelsVP*, pCtx)->Init (iType, pCfg);
+}
+EResult Uninit (void* pCtx, int32_t iType) {
+  return (NULL == pCtx) ? RET_INVALIDPARAM : WelsStaticCast (IWelsVP*, pCtx)->Uninit (iType);
+}
+EResult Flush (void* pCtx, int32_t iType) {
+  return (NULL == pCtx) ? RET_INVALIDPARAM : WelsStaticCast (IWelsVP*, pCtx)->Flush (iType);
+}
+EResult Process (void* pCtx, int32_t iType, SPixMap* pSrc, SPixMap* pDst) {
+  return (NULL == pCtx) ? RET_INVALIDPARAM : WelsStaticCast (IWelsVP*, pCtx)->Process (iType, pSrc, pDst);
+}
+EResult Get (void* pCtx, int32_t iType, void* pParam) {
+  return (NULL == pCtx) ? RET_INVALIDPARAM : WelsStaticCast (IWelsVP*, pCtx)->Get (iType, pParam);
+}
+EResult Set (void* pCtx, int32_t iType, void* pParam) {
+  return (NULL == pCtx) ? RET_INVALIDPARAM : WelsStaticCast (IWelsVP*, pCtx)->Set (iType, pParam);
+}
+EResult SpecialFeature (void* pCtx, int32_t iType, void* pIn, void* pOut) {
+  return (NULL == pCtx) ? RET_INVALIDPARAM : WelsStaticCast (IWelsVP*, pCtx)->SpecialFeature (iType, pIn, pOut);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Wraps a freshly created IWelsVP in a C function table (IWelsVPc) that is returned through *pCtx.
+// Returns RET_INVALIDPARAM for a NULL pCtx, the inner factory's error if that fails, else RET_SUCCESS.
+EResult CreateSpecificVpInterface (IWelsVPc** pCtx) {
+  IWelsVP* pWelsVP = NULL;
+  EResult  ret     = pCtx ? CreateSpecificVpInterface (&pWelsVP) : RET_INVALIDPARAM;
+  if (ret != RET_SUCCESS)
+    return ret;
+  IWelsVPc* pVPc = new IWelsVPc;
+  if (NULL == pVPc) {  // only reachable where operator new reports failure by returning NULL
+    DestroySpecificVpInterface (pWelsVP);  // fix: the inner object used to leak on this path
+    return RET_OUTOFMEMORY;
+  }
+  pVPc->Init    = Init;
+  pVPc->Uninit  = Uninit;
+  pVPc->Flush   = Flush;
+  pVPc->Process = Process;
+  pVPc->Get     = Get;
+  pVPc->Set     = Set;
+  pVPc->SpecialFeature = SpecialFeature;
+  pVPc->pCtx       = WelsStaticCast (void*, pWelsVP);
+  *pCtx            = pVPc;
+  return RET_SUCCESS;
+}
+
+EResult DestroySpecificVpInterface (IWelsVPc* pCtx) {
+  if (NULL == pCtx)  // nothing to release; still reported as success
+    return RET_SUCCESS;
+  // Tear down the wrapped C++ object first, then the C function table itself.
+  DestroySpecificVpInterface (WelsStaticCast (IWelsVP*, pCtx->pCtx));
+  _SafeDelete (pCtx);
+  return RET_SUCCESS;
+}
+
+WELSVP_NAMESPACE_END
binary files /dev/null b/codec/processing/src/common/WelsVP.aps differ
--- /dev/null
+++ b/codec/processing/src/common/WelsVP.def
@@ -1,0 +1,36 @@
+;*!
+;* \copy
+;*     Copyright (c)  2011-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+
+LIBRARY		    welsvp.dll
+EXPORTS
+                CreateVpInterface    PRIVATE
+                DestroyVpInterface   PRIVATE
\ No newline at end of file
--- /dev/null
+++ b/codec/processing/src/common/WelsVP.rc
@@ -1,0 +1,115 @@
+// Microsoft Visual C++ generated resource script.
+//
+#include "resource.h"
+
+#define APSTUDIO_READONLY_SYMBOLS
+/////////////////////////////////////////////////////////////////////////////
+//
+// Generated from the TEXTINCLUDE 2 resource.
+//
+#include "windows.h"
+
+/////////////////////////////////////////////////////////////////////////////
+#undef APSTUDIO_READONLY_SYMBOLS
+
+/////////////////////////////////////////////////////////////////////////////
+// Chinese (P.R.C.) resources
+
+#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_CHS)
+#ifdef _WIN32
+LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED
+#pragma code_page(936)
+#endif //_WIN32
+
+#ifdef APSTUDIO_INVOKED
+/////////////////////////////////////////////////////////////////////////////
+//
+// TEXTINCLUDE
+//
+
+1 TEXTINCLUDE
+BEGIN
+    "resource.h\0"
+END
+
+2 TEXTINCLUDE
+BEGIN
+    "#include ""windows.h""\r\n"
+    "\0"
+END
+
+3 TEXTINCLUDE
+BEGIN
+    "\r\n"
+    "\0"
+END
+
+#endif    // APSTUDIO_INVOKED
+
+#endif    // Chinese (P.R.C.) resources
+/////////////////////////////////////////////////////////////////////////////
+
+
+/////////////////////////////////////////////////////////////////////////////
+// English (U.S.) resources
+
+#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU)
+#ifdef _WIN32
+LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US
+#pragma code_page(1252)
+#endif //_WIN32
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Version
+//
+
+VS_VERSION_INFO VERSIONINFO
+ FILEVERSION 0,0,0,0
+ PRODUCTVERSION 0,0,0,0
+ FILEFLAGSMASK 0x3fL
+#ifdef _DEBUG
+ FILEFLAGS 0x1L
+#else
+ FILEFLAGS 0x0L
+#endif
+ FILEOS 0x40004L
+ FILETYPE 0x2L
+ FILESUBTYPE 0x0L
+BEGIN
+    BLOCK "StringFileInfo"
+    BEGIN
+        BLOCK "040904b0"
+        BEGIN
+            VALUE "Comments", "Cisco OpenH264  video preprocessing"
+            VALUE "CompanyName", "Cisco Systems"
+            VALUE "FileDescription", "Cisco OpenH264  video preprocessing"
+            VALUE "FileVersion", "0, 0, 0, 0"
+            VALUE "InternalName", "welsvp.dll"
+            VALUE "LegalCopyright", "© 2011-2015 Cisco and/or its affiliates. All rights reserved."
+            VALUE "OriginalFilename", "welsvp.dll"
+            VALUE "ProductName", "Cisco OpenH264 video preprocessing"
+            VALUE "ProductVersion", "0, 0, 0, 0"
+        END
+    END
+    BLOCK "VarFileInfo"
+    BEGIN
+        VALUE "Translation", 0x409, 1200
+    END
+END
+
+#endif    // English (U.S.) resources
+/////////////////////////////////////////////////////////////////////////////
+
+
+
+#ifndef APSTUDIO_INVOKED
+/////////////////////////////////////////////////////////////////////////////
+//
+// Generated from the TEXTINCLUDE 3 resource.
+//
+
+
+/////////////////////////////////////////////////////////////////////////////
+#endif    // not APSTUDIO_INVOKED
+
--- /dev/null
+++ b/codec/processing/src/common/cpu.cpp
@@ -1,0 +1,196 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	cpu.cpp
+ *
+ * \brief	CPU compatibility detection
+ *
+ * \date	04/29/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include "util.h"
+#include "cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+#define    CPU_Vender_AMD    "AuthenticAMD"
+#define    CPU_Vender_INTEL  "GenuineIntel"
+#define    CPU_Vender_CYRIX  "CyrixInstead"
+
+
+#if defined(X86_ASM)
+// Returns a bitmask of WELS_CPU_* flags derived from CPUID (0 when CPUID or MMX is unavailable); optionally reports a logical-processor count.
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
+  uint32_t uiCPU = 0;
+  uint32_t uiFeatureA = 0, uiFeatureB = 0, uiFeatureC = 0, uiFeatureD = 0;
+  int32_t  CacheLineSize = 0;
+  int8_t   chVenderName[16] = { 0 };
+
+  if (!WelsCPUIdVerify()) {
+    /* cpuid is not supported in cpu */
+    return 0;
+  }
+  // Leaf 0: the three result words go to byte offsets 0, 8 and 4 of chVenderName, which is later compared as a C string.
+  WelsCPUId (0, &uiFeatureA, (uint32_t*)&chVenderName[0], (uint32_t*)&chVenderName[8], (uint32_t*)&chVenderName[4]);
+  if (uiFeatureA == 0) {
+    /* uiFeatureA is the maximum input value for basic cpuid information; 0 means leaf 1 cannot be queried */
+    return 0;
+  }
+
+  WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+  if ((uiFeatureD & 0x00800000) == 0) {
+    /* Basic MMX technology is not supported by the cpu, which means nothing for us, so return here */
+    return 0;
+  }
+
+  uiCPU = WELS_CPU_MMX;
+  if (uiFeatureD & 0x02000000) {
+    /* SSE technology is identical to AMD MMX extensions */
+    uiCPU |= WELS_CPU_MMXEXT | WELS_CPU_SSE;
+  }
+  if (uiFeatureD & 0x04000000) {
+    /* SSE2 support here */
+    uiCPU |= WELS_CPU_SSE2;
+  }
+  if (uiFeatureD & 0x00000001) {
+    /* x87 FPU on-chip checking */
+    uiCPU |= WELS_CPU_FPU;
+  }
+  if (uiFeatureD & 0x00008000) {
+    /* CMOV instruction checking */
+    uiCPU |= WELS_CPU_CMOV;
+  }
+  if (!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL)) {	// confirmed_safe_unsafe_usage
+    if (uiFeatureD & 0x10000000) {
+      /* Multi-Threading checking: contains multiple logic processors (flag only set for the Intel vendor string) */
+      uiCPU |= WELS_CPU_HTT;
+    }
+  }
+
+  if (uiFeatureC & 0x00000001) {
+    /* SSE3 support here */
+    uiCPU |= WELS_CPU_SSE3;
+  }
+  if (uiFeatureC & 0x00000200) {
+    /* SSSE3 support here */
+    uiCPU |= WELS_CPU_SSSE3;
+  }
+  if (uiFeatureC & 0x00080000) {
+    /* SSE4.1 support here, 45nm Penryn processor */
+    uiCPU |= WELS_CPU_SSE41;
+  }
+  if (uiFeatureC & 0x00100000) {
+    /* SSE4.2 support here, next generation Nehalem processor */
+    uiCPU |= WELS_CPU_SSE42;
+  }
+  if (WelsCPUSupportAVX (uiFeatureA, uiFeatureC)) {	// the helper decides from the leaf-1 uiFeatureA/uiFeatureC words
+    /* AVX supported */
+    uiCPU |= WELS_CPU_AVX;
+  }
+  if (WelsCPUSupportFMA (uiFeatureA, uiFeatureC)) {	// same inputs as the AVX check above
+    /* AVX FMA supported */
+    uiCPU |= WELS_CPU_FMA;
+  }
+  if (uiFeatureC & 0x02000000) {
+    /* AES checking */
+    uiCPU |= WELS_CPU_AES;
+  }
+  if (uiFeatureC & 0x00400000) {
+    /* MOVBE checking */
+    uiCPU |= WELS_CPU_MOVBE;
+  }
+
+  if (pNumberOfLogicProcessors != NULL) {
+    // written unconditionally, i.e. also when the HTT flag above was not set
+    *pNumberOfLogicProcessors = (uiFeatureB & 0x00ff0000) >> 16; // feature bits: 23-16 on returned EBX
+  }
+  // Extended range: uiFeatureA of leaf 0x80000000 is checked against 0x80000001 before that leaf is queried.
+  WelsCPUId (0x80000000, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+
+  if ((!strcmp ((const str_t*)chVenderName, CPU_Vender_AMD))
+      && (uiFeatureA >= 0x80000001)) {	// confirmed_safe_unsafe_usage
+    WelsCPUId (0x80000001, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+    if (uiFeatureD & 0x00400000) {
+      uiCPU |= WELS_CPU_MMXEXT;
+    }
+    if (uiFeatureD & 0x80000000) {
+      uiCPU |= WELS_CPU_3DNOW;
+    }
+  }
+
+  if (!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL)) {	// confirmed_safe_unsafe_usage
+    int32_t  family, model;
+
+    WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+    family = ((uiFeatureA >> 8) & 0xf) + ((uiFeatureA >> 20) & 0xff);  // bits 11-8 plus bits 27-20
+    model  = ((uiFeatureA >> 4) & 0xf) + ((uiFeatureA >> 12) & 0xf0);  // bits 7-4 plus (bits 19-16) << 4
+    // NOTE(review): SSE2/SSE3 are masked off for family 6, models 9/13/14; the reason is not stated here - confirm.
+    if ((family == 6) && (model == 9 || model == 13 || model == 14)) {
+      uiCPU &= ~ (WELS_CPU_SSE2 | WELS_CPU_SSE3);
+    }
+  }
+
+  // get cache line size (only for the Intel and Cyrix vendor strings)
+  if ((!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL))
+      || ! (strcmp ((const str_t*)chVenderName, CPU_Vender_CYRIX))) {	// confirmed_safe_unsafe_usage
+    WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+
+    CacheLineSize = (uiFeatureB & 0xff00) >>
+                    5;	// ((clflush_line_size >> 8) << 3), CLFLUSH_line_size * 8 = CacheLineSize_in_byte
+
+    if (CacheLineSize == 128) {
+      uiCPU |= WELS_CPU_CACHELINE_128;
+    } else if (CacheLineSize == 64) {
+      uiCPU |= WELS_CPU_CACHELINE_64;
+    } else if (CacheLineSize == 32) {
+      uiCPU |= WELS_CPU_CACHELINE_32;
+    } else if (CacheLineSize == 16) {
+      uiCPU |= WELS_CPU_CACHELINE_16;
+    }
+  }
+
+  return uiCPU;
+}
+
+// Calls WelsEmms() when any of the MMX / 3DNow! class flags is set in kuiCPU; otherwise does nothing.
+void WelsCPURestore (const uint32_t kuiCPU) {
+  const uint32_t kuiMmxClassMask = WELS_CPU_MMX | WELS_CPU_MMXEXT | WELS_CPU_3DNOW | WELS_CPU_3DNOWEXT;
+  if ((kuiCPU & kuiMmxClassMask) != 0)
+    WelsEmms();
+}
+
+#endif
+
+
+WELSVP_NAMESPACE_END
+
+
--- /dev/null
+++ b/codec/processing/src/common/cpu.h
@@ -1,0 +1,102 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	cpu.h
+ *
+ * \brief	CPU feature compatibility detection
+ *
+ * \date	04/29/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_CPU_H
+#define WELSVP_CPU_H
+
+#include "typedef.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+/*
+ *	WELS CPU feature flags
+ */
+#define WELS_CPU_MMX        0x00000001    /* mmx */
+#define WELS_CPU_MMXEXT     0x00000002    /* mmx-ext*/
+#define WELS_CPU_SSE        0x00000004    /* sse */
+#define WELS_CPU_SSE2       0x00000008    /* sse 2 */
+#define WELS_CPU_SSE3       0x00000010    /* sse 3 */
+#define WELS_CPU_SSE41      0x00000020    /* sse 4.1 */
+#define WELS_CPU_3DNOW      0x00000040    /* 3dnow! */
+#define WELS_CPU_3DNOWEXT   0x00000080    /* 3dnow! ext */
+#define WELS_CPU_ALTIVEC    0x00000100    /* altivec */
+#define WELS_CPU_SSSE3      0x00000200    /* ssse3 */
+#define WELS_CPU_SSE42      0x00000400    /* sse 4.2 */
+
+/* CPU features application extensive */
+#define WELS_CPU_AVX		0x00000800	/* Advanced Vector eXtensions */
+#define WELS_CPU_FPU		0x00001000	/* x87-FPU on chip */
+#define WELS_CPU_HTT		0x00002000	/* Hyper-Threading Technology (HTT), Multi-threading enabled feature: 
+										   physical processor package is capable of supporting more than one logic processor
+										*/
+#define WELS_CPU_CMOV		0x00004000	/* Conditional Move Instructions,
+										   also, if x87-FPU is present as indicated by the CPUID.FPU feature bit, then FCOMI and FCMOV are supported
+										*/
+#define WELS_CPU_MOVBE		0x00008000	/* MOVBE instruction */
+#define WELS_CPU_AES		0x00010000	/* AES instruction extensions */
+#define WELS_CPU_FMA		0x00020000	/* AVX VEX FMA instruction sets */
+
+#define WELS_CPU_CACHELINE_16    0x10000000    /* CacheLine Size 16 */
+#define WELS_CPU_CACHELINE_32    0x20000000    /* CacheLine Size 32 */
+#define WELS_CPU_CACHELINE_64    0x40000000    /* CacheLine Size 64 */
+#define WELS_CPU_CACHELINE_128   0x80000000    /* CacheLine Size 128 */
+
+/*
+ *	Interfaces for CPU core feature detection as below
+ */
+
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+// C-linkage helpers, declared here but not defined in cpu.cpp (presumably assembly - TODO confirm).
+int32_t WelsCPUIdVerify();  // cpu.cpp treats a zero result as "cpuid is not supported"
+// WelsCPUId: queries leaf uiIndex and writes four result words through pFeatureA..pFeatureD.
+void  WelsCPUId (uint32_t uiIndex, uint32_t* pFeatureA, uint32_t* pFeatureB, uint32_t* pFeatureC, uint32_t* pFeatureD);
+int32_t WelsCPUSupportAVX (uint32_t eax, uint32_t ecx);  // non-zero => cpu.cpp sets WELS_CPU_AVX
+int32_t WelsCPUSupportFMA (uint32_t eax, uint32_t ecx);  // non-zero => cpu.cpp sets WELS_CPU_FMA
+
+void  WelsEmms();  // called by WelsCPURestore() when MMX/3DNow! class flags are set
+
+WELSVP_EXTERN_C_END
+#endif
+// NOTE(review): declared unconditionally, but cpu.cpp defines it only inside #if defined(X86_ASM).
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors);
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/common/memory.cpp
@@ -1,0 +1,117 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "memory.h"
+
+WELSVP_NAMESPACE_BEGIN
+/////////////////////////////////////////////////////////////////////////////////
+
+// Allocates kuiSize zero-filled bytes, aligned with the (ALIGNBYTES - 1) mask; pTag is unused.
+// The raw malloc pointer and kuiSize are kept in a hidden header directly below the returned address.
+// Returns NULL when malloc fails or when kuiSize plus the header would wrap around 32 bits.
+void* WelsMalloc (const uint32_t kuiSize, str_t* pTag) {
+  const int32_t kiSizeVoidPointer = sizeof (void**);
+  const int32_t kiSizeInt32       = sizeof (int32_t);
+  const int32_t kiAlignedBytes    = ALIGNBYTES - 1;
+  const uint32_t kuiTotal = kuiSize + kiAlignedBytes + kiSizeVoidPointer + kiSizeInt32;
+  if (kuiTotal < kuiSize)  // fix: unsigned wrap would yield a block smaller than requested
+    return NULL;
+
+  uint8_t* pBuf = (uint8_t*) ::malloc (kuiTotal);
+  if (NULL == pBuf)
+    return NULL;
+  WelsMemset (pBuf, 0, kuiTotal);  // to fill zero values
+  uint8_t* pAlignedBuf = pBuf + kiAlignedBytes + kiSizeVoidPointer + kiSizeInt32;
+  pAlignedBuf -= WelsCastFromPointer (pAlignedBuf) & kiAlignedBytes;
+  * ((void**) (pAlignedBuf - kiSizeVoidPointer)) = pBuf;
+  * ((int32_t*) (pAlignedBuf - (kiSizeVoidPointer + kiSizeInt32))) = kuiSize;
+  return (pAlignedBuf);
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Releases a WelsMalloc block: the raw malloc pointer is read from the slot just below pPointer. NULL is ignored; pTag is unused.
+void WelsFree (void* pPointer, str_t* pTag) {
+  if (pPointer) {
+    ::free (* (((void**) pPointer) - 1));
+  }
+}
+
+/////////////////////////////////////////////////////////////////////////////
+
+// Reallocation helper: allocates kuiSize bytes, copies min(old, new) bytes over and frees the old block.
+// If the allocation fails but the old block is already big enough, the old block is returned instead.
+// On any other failure NULL is returned and the old block is left untouched (still owned by the caller).
+void* InternalReallocate (void* pPointer, const uint32_t kuiSize, str_t* pTag) {
+  if (NULL == pPointer)
+    return WelsMalloc (kuiSize, pTag);
+
+  // size recorded by WelsMalloc in the header, just below the raw-pointer slot
+  const uint32_t kuiOldSize = * ((int32_t*) ((uint8_t*) pPointer - sizeof (void**) - sizeof (int32_t)));
+  uint8_t* pNew = (uint8_t*)WelsMalloc (kuiSize, pTag);
+  if (NULL == pNew)
+    return (kuiOldSize > 0 && kuiSize > 0 && kuiOldSize >= kuiSize) ? pPointer : NULL;
+
+  if (0 == kuiOldSize || 0 == kuiSize) {
+    WelsFree (pNew, pTag);  // fix: this rejected request used to leak the fresh block
+    return NULL;
+  }
+  memcpy (pNew, pPointer, (kuiOldSize < kuiSize) ? kuiOldSize : kuiSize);
+  WelsFree (pPointer, pTag);
+  return (pNew);
+}
+
+/////////////////////////////////////////////////////////////////////////////
+
+// Grows a WelsMalloc block on demand.
+//   pPointer  - block to grow
+//   pRealSize - in: size currently recorded for the block; out: new size (on success only)
+//   kuiSize   - number of bytes required
+//   pTag      - forwarded to InternalReallocate
+// Returns pPointer unchanged when *pRealSize >= kuiSize, otherwise the block
+// handed back by InternalReallocate, or NULL when that fails (in which case
+// *pRealSize is not modified).
+void* WelsRealloc (void* pPointer, uint32_t* pRealSize, const uint32_t kuiSize, str_t* pTag) {
+  if (*pRealSize >= kuiSize)  // large enough already, nothing to do
+    return (pPointer);
+
+  // Request kuiSize rounded up to a multiple of 16, plus 32 extra bytes.
+  const uint32_t kuiGrownSize = ((kuiSize + 15) & ~ (uint32_t) 15) + 32;
+  void* pGrown = InternalReallocate (pPointer, kuiGrownSize, pTag);
+  if (NULL == pGrown)
+    return NULL;
+
+  // Publish the new capacity only once the reallocation has succeeded.
+  *pRealSize = kuiGrownSize;
+  return (pGrown);
+}
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/common/memory.h
@@ -1,0 +1,110 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  memory.h
+ *
+ * \brief	    :  memory definition for wels video processor class
+ *
+ * \date        :  2011/02/22
+ *
+ * \description :
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_MEMORY_H
+#define WELSVP_MEMORY_H
+
+#include "util.h"
+#include "typedef.h"
+
+WELSVP_NAMESPACE_BEGIN
+// Thin wrapper: forwards to ::memset and returns its result.
+inline_t void* WelsMemset (void* pPointer, int32_t iValue, uint32_t uiSize) {
+  return ::memset (pPointer, iValue, uiSize);
+}
+// Thin wrapper: forwards to ::memcpy and returns its result.
+inline_t void* WelsMemcpy (void* pDst, const void* kpSrc, uint32_t uiSize) {
+  return ::memcpy (pDst, kpSrc, uiSize);
+}
+// Thin wrapper: forwards to ::memcmp and returns its result.
+inline_t int32_t WelsMemcmp (const void* kpBuf1, const void* kpBuf2, uint32_t uiSize) {
+  return ::memcmp (kpBuf1, kpBuf2, uiSize);
+}
+
+/*!
+*************************************************************************************
+* \brief	zero-filled, aligned allocation in Wels
+*
+* \param 	kuiSize	size in bytes of the memory block required
+* \param 	pTag	optional tag; not used by the implementation in memory.cpp
+* \return	pointer to the aligned block, or NULL if the allocation failed
+*
+* \note	the returned address is not the raw malloc pointer; release it with WelsFree()
+*************************************************************************************
+*/
+void* WelsMalloc (const uint32_t kuiSize, str_t* pTag = NULL);
+
+/*!
+*************************************************************************************
+* \brief	releases a block obtained from WelsMalloc() / WelsRealloc()
+*
+* \param 	pPointer	the block itself (the value returned by the allocator, not its
+*			address); NULL is accepted and ignored
+* \param 	pTag	optional tag; not used by the implementation in memory.cpp
+* \return	NONE
+*
+* \note	the pointer is passed by value, so the caller's copy is not reset to NULL
+*************************************************************************************
+*/
+void WelsFree (void* pPointer, str_t* pTag = NULL);
+
+/*!
+*************************************************************************************
+* \brief	reallocation in Wels. Does nothing and keeps using the old block
+*		when *pRealSize already covers the requested size
+*
+* \param 	pPointer	memory block allocated earlier
+* \param	pRealSize	in: current size of the block; out: size of the new block (on success)
+* \param	kuiSize	new size of memory block requested
+* \param	pTag	optional tag, forwarded to the internal allocator
+* \return	the (possibly moved) block, or NULL on failure with the old block left intact
+*
+* \note	a new block is larger than kuiSize (rounded up to a multiple of 16, plus 32)
+*************************************************************************************
+*/
+void* WelsRealloc (void*  pPointer, uint32_t* pRealSize, const uint32_t kuiSize, str_t* pTag = NULL);
+
+//////////////////////////////////////////////////////////////////////////////////////
+WELSVP_NAMESPACE_END
+
+#endif
+
+
--- /dev/null
+++ b/codec/processing/src/common/resource.h
@@ -1,0 +1,15 @@
+//{{NO_DEPENDENCIES}}
+// Microsoft Visual C++ generated include file.
+// Used by WelsVP.rc
+//
+
+// Next default values for new objects
+//
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE        101
+#define _APS_NEXT_COMMAND_VALUE         40001
+#define _APS_NEXT_CONTROL_VALUE         1000
+#define _APS_NEXT_SYMED_VALUE           101
+#endif
+#endif
--- /dev/null
+++ b/codec/processing/src/common/thread.cpp
@@ -1,0 +1,93 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	thread.cpp
+ *
+ * \brief	Interfaces introduced in thread programming
+ *
+ * \date	11/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include "thread.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+#if defined(_WIN32)
+
+WELS_THREAD_ERROR_CODE    WelsMutexInit (WELS_MUTEX*    mutex) {
+  InitializeCriticalSection (mutex);
+  // Win32 build: the call's outcome is not inspected, so success is reported unconditionally.
+  return WELS_THREAD_ERROR_OK;
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexLock (WELS_MUTEX*    mutex) {
+  EnterCriticalSection (mutex);
+  // Win32 build: reached once EnterCriticalSection has returned; always reports success.
+  return WELS_THREAD_ERROR_OK;
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexUnlock (WELS_MUTEX* mutex) {
+  LeaveCriticalSection (mutex);
+  // Win32 build: no status is checked; always reports success.
+  return WELS_THREAD_ERROR_OK;
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexDestroy (WELS_MUTEX* mutex) {
+  DeleteCriticalSection (mutex);
+  // Win32 build: no status is checked; always reports success.
+  return WELS_THREAD_ERROR_OK;
+}
+
+#elif  defined(__GNUC__)
+
+WELS_THREAD_ERROR_CODE    WelsMutexInit (WELS_MUTEX*    mutex) {
+  return pthread_mutex_init (mutex, NULL);  // NULL attribute pointer; the pthread result is returned as-is
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexLock (WELS_MUTEX*    mutex) {
+  return pthread_mutex_lock (mutex);  // the pthread result is returned as-is
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexUnlock (WELS_MUTEX* mutex) {
+  return pthread_mutex_unlock (mutex);  // the pthread result is returned as-is
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexDestroy (WELS_MUTEX* mutex) {
+  return pthread_mutex_destroy (mutex);  // the pthread result is returned as-is
+}
+
+#endif
+
+WELSVP_NAMESPACE_END
+
+
+
--- /dev/null
+++ b/codec/processing/src/common/thread.h
@@ -1,0 +1,89 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	thread.h
+ *
+ * \brief	Interfaces introduced in thread programming
+ *
+ * \date	11/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_THREAD_H
+#define WELSVP_THREAD_H
+
+#include "typedef.h"
+
+#if defined(_WIN32)
+
+#include <windows.h>
+
+#elif defined(__GNUC__)
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <signal.h>
+#include <errno.h>
+
+#endif//WIN32
+
+WELSVP_NAMESPACE_BEGIN
+
+#if defined(_WIN32)
+
+typedef  HANDLE            WELS_THREAD_HANDLE;
+typedef  CRITICAL_SECTION  WELS_MUTEX;
+
+#elif defined(__GNUC__)
+
+typedef   pthread_t         WELS_THREAD_HANDLE;
+typedef   pthread_mutex_t   WELS_MUTEX;
+
+#endif
+
+typedef long_t WELS_THREAD_ERROR_CODE;   // status type returned by the WelsMutex* wrappers declared below
+// Status values. NOTE(review): two of them are cast to unsigned long while the typedef above is signed.
+#define   WELS_THREAD_ERROR_OK					0
+#define   WELS_THREAD_ERROR_GENERIAL			((unsigned long)(-1))
+#define   WELS_THREAD_ERROR_WAIT_OBJECT_0		0
+#define	  WELS_THREAD_ERROR_WAIT_TIMEOUT		((unsigned long)0x00000102L)
+#define	  WELS_THREAD_ERROR_WAIT_FAILED		    WELS_THREAD_ERROR_GENERIAL
+// Mutex wrappers; WELS_MUTEX is CRITICAL_SECTION under _WIN32 and pthread_mutex_t under __GNUC__.
+WELS_THREAD_ERROR_CODE   WelsMutexInit (WELS_MUTEX*    mutex);
+WELS_THREAD_ERROR_CODE   WelsMutexLock (WELS_MUTEX*    mutex);
+WELS_THREAD_ERROR_CODE   WelsMutexUnlock (WELS_MUTEX* mutex);
+WELS_THREAD_ERROR_CODE   WelsMutexDestroy (WELS_MUTEX* mutex);
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/common/typedef.h
@@ -1,0 +1,102 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  typedef.h
+ *
+ * \brief	    :  basic type definition
+ *
+ * \date        :  2011/01/04
+ *
+ * \description :  1. Define basic type with platform-independent;
+ *                 2. Define specific namespace to avoid name pollution;
+ *                 3. C++ ONLY;
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_TYPEDEF_H
+#define WELSVP_TYPEDEF_H
+
+#define WELSVP_EXTERN_C_BEGIN       extern "C" {
+#define WELSVP_EXTERN_C_END         }
+
+#define WELSVP_NAMESPACE_BEGIN      namespace nsWelsVP {
+#define WELSVP_NAMESPACE_END        }
+
+WELSVP_NAMESPACE_BEGIN
+
+#if defined(_WIN32) && defined(_MSC_VER)   // Microsoft compiler targeting Windows
+
+typedef char             int8_t;
+typedef unsigned char    uint8_t;
+typedef short            int16_t;
+typedef unsigned short   uint16_t;
+typedef int              int32_t;
+typedef unsigned int     uint32_t;
+typedef __int64          int64_t;
+typedef unsigned __int64 uint64_t;
+#define inline_t         _inline
+
+#else   // every other toolchain (GCC and compatible)
+
+// plain "char" may be unsigned by default on some compilers, so the signedness is spelled out
+typedef signed char        int8_t;
+typedef unsigned char      uint8_t;
+typedef short              int16_t;
+typedef unsigned short     uint16_t;
+typedef int                int32_t;
+typedef unsigned int       uint32_t;
+typedef long long          int64_t;
+typedef unsigned long long uint64_t;
+#define inline_t           inline
+
+#endif
+
+typedef char    str_t    ; // plain char, meant only for character (string) parameters
+typedef long    long_t   ; // NOTE(review): the width of long differs between platforms — confirm no user assumes a size
+typedef int32_t bool_t   ; // integer boolean; presumably holds the FALSE/TRUE enumerators below
+// float_t / double_t are only declared when one of the macros tested on the next line is defined.
+#if defined(_WIN32) || defined(_MACH_PLATFORM) || defined(__GNUC__)
+typedef float   float_t  ;
+typedef double  double_t ;
+#endif
+// Fallback NULL for translation units in which no other header has defined it yet.
+#ifndef NULL
+#define NULL    0
+#endif
+// Unnamed enumeration providing the boolean constants FALSE (0) and TRUE (!FALSE).
+enum {
+  FALSE = 0,
+  TRUE  = !FALSE
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/common/util.cpp
@@ -1,0 +1,45 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "util.h"
+
+WELSVP_NAMESPACE_BEGIN
+/////////////////////////////////////////////////////////////////////////////////
+
+
+int32_t  WelsStrCmp (const str_t* kpStr1, const str_t* kpStr2) {
+  return ::strcmp (kpStr1, kpStr2);  // forwards to the C library strcmp; neither pointer is NULL-checked here
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/common/util.h
@@ -1,0 +1,107 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  util.h
+ *
+ * \brief	    :  utils for wels video processor class
+ *
+ * \date        :  2011/01/04
+ *
+ * \description :
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_UTIL_H
+#define WELSVP_UTIL_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <assert.h>
+
+#include "typedef.h"
+#include "memory.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+#define MAX_WIDTH      (4096)   // largest supported picture width
+#define MAX_HEIGHT     (2304)//MAX_FS_LEVEL51 (36864); MAX_FS_LEVEL51*256/4096 = 2304
+#define MB_WIDTH_LUMA  (16)     // edge length of a luma macroblock
+#define PESN		   (1e-6)	// desired float precision
+// Macroblock-type bit flags; IS_INTRA() is non-zero when any of the three intra bits is set in `type`.
+#define MB_TYPE_INTRA4x4		0x00000001
+#define MB_TYPE_INTRA16x16	0x00000002
+#define MB_TYPE_INTRA_PCM		0x00000004
+#define MB_TYPE_INTRA			  (MB_TYPE_INTRA4x4 | MB_TYPE_INTRA16x16 | MB_TYPE_INTRA_PCM)
+#define IS_INTRA(type) ((type)&MB_TYPE_INTRA)
+// Arithmetic helpers. These are function-like macros: an argument may be evaluated more than once.
+#define WELS_MAX(x, y)	((x) > (y) ? (x) : (y))
+#define WELS_MIN(x, y)	((x) < (y) ? (x) : (y))
+#define WELS_SIGN(a)	((long_t)(a) >> 31)   // NOTE(review): shift is fixed at 31 — presumably a is a 32-bit value; confirm
+#define WELS_ABS(a)		((WELS_SIGN(a) ^ (long_t)(a)) - WELS_SIGN(a))   // branch-free absolute value built on WELS_SIGN
+#define WELS_CLAMP(x, minv, maxv)  WELS_MIN(WELS_MAX(x, minv), maxv)   // limits x to [minv, maxv]
+
+#define ALIGNBYTES         (16)       /* Worst case is requiring alignment to a 16-byte boundary */
+#define WELS_ALIGN(iInput)   (((iInput)+(ALIGNBYTES-1)) & ~(ALIGNBYTES-1)) /* round up to ALIGNBYTES; this header defines no ALIGNMENT */
+#define WELS_ALIGN2(iInput)  (((iInput)+1) & ~1)   /* round up to a multiple of 2; argument parenthesised for precedence */
+#define WELS_ALIGN4(iInput)  (((iInput)+3) & ~3)   /* round up to a multiple of 4 */
+#define WELS_ALIGN8(iInput)  (((iInput)+7) & ~7)   /* round up to a multiple of 8 */
+
+#define WelsCastFromPointer(p)      (reinterpret_cast<long_t>(p))   // NOTE(review): long_t is plain long; confirm it is pointer-sized on every target (e.g. 64-bit Windows)
+#define WelsStaticCast(type, p)  (static_cast<type>(p))
+#define WelsDynamicCast(type, p) (dynamic_cast<type>(p))
+// Field extractors for a packed integer: bits 0..7 and bits 8..15.
+#define GET_METHOD(x)  ((x) & 0xff)          // method id: the lowest 8 bits
+#define GET_SPECIAL(x) (((x) >> 8) & 0xff)   // special flag: the next 8 bits
+
+// Takes the low 8 bits of `a` and clamps them to the closed range [METHOD_NULL + 1, METHOD_MASK - 1].
+inline_t EMethods WelsVpGetValidMethod (int32_t a) {
+  return static_cast<EMethods> (WELS_CLAMP (GET_METHOD (a), METHOD_NULL + 1, METHOD_MASK - 1));
+}
+
+
+#define _SafeFree(p)		if (p) { WelsFree(p); (p) = NULL; }   // WelsFree + reset to NULL; expands to a bare if-statement, so mind a following else
+#define _SafeDelete(p)		if (p) { delete (p); (p) = NULL; }   // delete + reset to NULL; same bare-if caveat
+
+
+//////////////////////////////////////////////////////////////////////////////////////
+
+int32_t   WelsStrCmp (const str_t* kpStr1, const str_t* kpStr2);
+
+
+//////////////////////////////////////////////////////////////////////////////////////
+WELSVP_NAMESPACE_END
+
+#endif
+
+
--- /dev/null
+++ b/codec/processing/src/complexityanalysis/ComplexityAnalysis.cpp
@@ -1,0 +1,304 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "ComplexityAnalysis.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CComplexityAnalysis::CComplexityAnalysis (int32_t iCpuFlag) {   // iCpuFlag is accepted but not used here
+  WelsMemset (&m_sComplexityAnalysisParam, 0, sizeof (m_sComplexityAnalysisParam));   // start from all-zero parameters
+  m_pfGomSad = NULL;   // assigned by InitGomSadFunc when the GOM SAD analysis runs
+  m_eMethod  = METHOD_COMPLEXITY_ANALYSIS;
+}
+
+// Empty destructor: nothing is released (pointers held in the parameter struct are not freed here).
+CComplexityAnalysis::~CComplexityAnalysis() {}
+
+// Runs the analysis selected by m_sComplexityAnalysisParam.iComplexityAnalysisMode (FRAME_SAD,
+// GOM_SAD or GOM_VAR). iType is not consulted. Any other mode yields RET_INVALIDPARAM.
+EResult CComplexityAnalysis::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  switch (m_sComplexityAnalysisParam.iComplexityAnalysisMode) {
+  case FRAME_SAD:
+    AnalyzeFrameComplexityViaSad (pSrcPixMap, pRefPixMap);
+    return RET_SUCCESS;
+
+  case GOM_SAD:
+    AnalyzeGomComplexityViaSad (pSrcPixMap, pRefPixMap);
+    return RET_SUCCESS;
+
+  case GOM_VAR:
+    AnalyzeGomComplexityViaVar (pSrcPixMap, pRefPixMap);
+    return RET_SUCCESS;
+
+  default:
+    return RET_INVALIDPARAM;
+  }
+}
+
+
+// Replaces the stored parameters with a struct copy of *pParam; NULL is rejected. iType is ignored.
+EResult CComplexityAnalysis::Set (int32_t iType, void* pParam) {
+  SComplexityAnalysisParam* pNewParam = static_cast<SComplexityAnalysisParam*> (pParam);
+  if (pNewParam == NULL) {
+    return RET_INVALIDPARAM;
+  }
+  m_sComplexityAnalysisParam = *pNewParam;
+  return RET_SUCCESS;
+}
+
+// Reports the result to the caller: only iFrameComplexity is written into *pParam, every other
+// field of the caller's SComplexityAnalysisParam is left untouched. NULL is rejected; iType is ignored.
+EResult CComplexityAnalysis::Get (int32_t iType, void* pParam) {
+  if (pParam != NULL) {
+    SComplexityAnalysisParam* pOut = static_cast<SComplexityAnalysisParam*> (pParam);
+    pOut->iFrameComplexity = m_sComplexityAnalysisParam.iFrameComplexity;
+    return RET_SUCCESS;
+  }
+
+  return RET_INVALIDPARAM;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+// Frame-level complexity: the precomputed pCalcResult->iFrameSad, replaced by the result of
+// GetFrameSadExcludeBackground when iCalcBgd is non-zero.
+void CComplexityAnalysis::AnalyzeFrameComplexityViaSad (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  SVAACalcResult* pCalc = m_sComplexityAnalysisParam.pCalcResult;
+  m_sComplexityAnalysisParam.iFrameComplexity = pCalc->iFrameSad;
+
+  if (m_sComplexityAnalysisParam.iCalcBgd) { // BGD control
+    m_sComplexityAnalysisParam.iFrameComplexity = (int32_t)GetFrameSadExcludeBackground (pSrcPixMap, pRefPixMap);
+  }
+}
+
+// Sums the four 8x8 SADs of every macroblock whose background flag is 0 or whose reference
+// macroblock type is intra, walking the frame one GOM (run of iMbNumInGom macroblocks) at a time.
+// Side effect: pGomForegroundBlockNum[j] is incremented once per counted macroblock of GOM j
+// (the counters are not reset here). The uint32_t total is returned through an int32_t.
+int32_t CComplexityAnalysis::GetFrameSadExcludeBackground (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  const int32_t kiMbCols  = pSrcPixMap->sRect.iRectWidth  >> 4;   // rectangle size in units of 16
+  const int32_t kiMbRows  = pSrcPixMap->sRect.iRectHeight >> 4;
+  const int32_t kiMbTotal = kiMbCols * kiMbRows;
+
+  const int32_t kiGomSize  = m_sComplexityAnalysisParam.iMbNumInGom;
+  const int32_t kiGomCount = (kiMbTotal + kiGomSize - 1) / kiGomSize;   // ceiling division
+
+  uint8_t*  pBgdFlags   = (uint8_t*)m_sComplexityAnalysisParam.pBackgroundMbFlag;
+  uint32_t* pRefMbTypes = (uint32_t*)m_sComplexityAnalysisParam.uiRefMbType;
+  SVAACalcResult* pCalc = m_sComplexityAnalysisParam.pCalcResult;
+  int32_t*  pForegroundCnt = m_sComplexityAnalysisParam.pGomForegroundBlockNum;
+
+  uint32_t uiTotalSad = 0;
+  for (int32_t iGom = 0; iGom < kiGomCount; ++iGom) {
+    const int32_t kiFirstMb = iGom * kiGomSize;
+    const int32_t kiLastMb  = WELS_MIN ((iGom + 1) * kiGomSize, kiMbTotal);   // one past the end
+
+    for (int32_t iMb = kiFirstMb; iMb < kiLastMb; ++iMb) {
+      if (pBgdFlags[iMb] != 0 && !IS_INTRA (pRefMbTypes[iMb])) {
+        continue;   // background and not intra: excluded from the total
+      }
+      ++pForegroundCnt[iGom];
+      for (int32_t iBlk = 0; iBlk < 4; ++iBlk) {
+        uiTotalSad += pCalc->pSad8x8[iMb][iBlk];
+      }
+    }
+  }
+  return (uiTotalSad);
+}
+
+
+// Selects the per-macroblock SAD accumulator: GomSampleSadExceptBackground when iCalcBgd is
+// non-zero, GomSampleSad otherwise.
+void InitGomSadFunc (PGOMSadFunc& pfGomSad, uint8_t iCalcBgd) {
+  pfGomSad = (iCalcBgd != 0)
+             ? &GomSampleSadExceptBackground
+             : &GomSampleSad;
+}
+
+// Increments *pGomForegroundBlockNum and adds the four values pSad8x8[0..3] to *pGomSad; the flag argument is ignored.
+void GomSampleSad (uint32_t* pGomSad, int32_t* pGomForegroundBlockNum, int32_t* pSad8x8, uint8_t pBackgroundMbFlag) {
+  ++*pGomForegroundBlockNum;
+  for (int32_t iBlk = 0; iBlk < 4; ++iBlk) {
+    *pGomSad += pSad8x8[iBlk];
+  }
+}
+
+// Adds pSad8x8[0..3] to *pGomSad and increments the counter, but only when pBackgroundMbFlag is 0.
+void GomSampleSadExceptBackground (uint32_t* pGomSad, int32_t* pGomForegroundBlockNum, int32_t* pSad8x8,
+                                   uint8_t pBackgroundMbFlag) {
+  if (pBackgroundMbFlag != 0) return;   // flagged macroblock: both outputs stay unchanged
+
+  ++*pGomForegroundBlockNum;
+  for (int32_t iBlk = 0; iBlk < 4; ++iBlk) {
+    *pGomSad += pSad8x8[iBlk];
+  }
+}
+
+/*!
+ * \brief	Per-GOM complexity analysis based on precomputed 8x8 SADs.
+ *
+ *		A GOM is a run of iMbNumInGom consecutive macroblocks in raster order (the last
+ *		GOM may be shorter). For GOM j, each macroblock's four 8x8 SADs
+ *		(pCalcResult->pSad8x8[i]) are handed to the accumulator chosen by InitGomSadFunc:
+ *		every macroblock is summed when iCalcBgd == 0; otherwise a macroblock is skipped
+ *		when it is flagged as background and its reference macroblock type is not intra.
+ *
+ *		Results, all reached through m_sComplexityAnalysisParam:
+ *		  pGomForegroundBlockNum[j]  incremented once per macroblock that was summed
+ *		  pGomComplexity[j]          SAD total of GOM j
+ *		  iFrameComplexity           sum of all pGomComplexity[j]
+ *
+ * \param	pSrcPixMap	source picture; only sRect.iRectWidth / iRectHeight are read
+ * \param	pRefPixMap	not used: the SADs are already available in pCalcResult
+ *
+ * \note	NOTE(review): pGomForegroundBlockNum[] is only incremented, never cleared, here —
+ *		presumably the caller zeroes it; confirm. iMbNumInGom is a divisor and must be non-zero.
+ */
+void CComplexityAnalysis::AnalyzeGomComplexityViaSad (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  int32_t iWidth     = pSrcPixMap->sRect.iRectWidth;
+  int32_t iHeight    = pSrcPixMap->sRect.iRectHeight;
+  int32_t iMbWidth  = iWidth  >> 4;
+  int32_t iMbHeight = iHeight >> 4;
+  int32_t iMbNum    = iMbWidth * iMbHeight;
+
+  int32_t iMbNumInGom = m_sComplexityAnalysisParam.iMbNumInGom;
+  int32_t iGomMbNum = (iMbNum + iMbNumInGom - 1) / iMbNumInGom;   // number of GOMs (ceiling division)
+
+  int32_t iGomMbStartIndex = 0, iGomMbEndIndex = 0, iGomMbRowNum = 0;
+  int32_t iMbStartIndex = 0, iMbEndIndex = 0;
+
+  uint8_t* pBackgroundMbFlag = (uint8_t*)m_sComplexityAnalysisParam.pBackgroundMbFlag;
+  uint32_t* uiRefMbType = (uint32_t*)m_sComplexityAnalysisParam.uiRefMbType;
+  SVAACalcResult* pVaaCalcResults = m_sComplexityAnalysisParam.pCalcResult;
+  int32_t*  pGomForegroundBlockNum = (int32_t*)m_sComplexityAnalysisParam.pGomForegroundBlockNum;
+  int32_t*  pGomComplexity = (int32_t*)m_sComplexityAnalysisParam.pGomComplexity;
+
+  uint32_t uiGomSad = 0, uiFrameSad = 0;
+
+  InitGomSadFunc (m_pfGomSad, m_sComplexityAnalysisParam.iCalcBgd);
+
+  for (int32_t j = 0; j < iGomMbNum; j ++) {
+    uiGomSad = 0;
+
+    iGomMbStartIndex = j * iMbNumInGom;
+    iGomMbEndIndex = WELS_MIN ((j + 1) * iMbNumInGom, iMbNum);
+    iGomMbRowNum = (iGomMbEndIndex + iMbWidth - 1) / iMbWidth  - iGomMbStartIndex / iMbWidth;   // rows touched, >= 1
+
+    iMbStartIndex = iGomMbStartIndex;   // first segment ends at the row boundary or at the GOM end
+    iMbEndIndex = WELS_MIN ((iMbStartIndex / iMbWidth + 1) * iMbWidth, iGomMbEndIndex);
+
+    // Walk the GOM one macroblock-row segment at a time.
+    do {
+      for (int32_t i = iMbStartIndex; i < iMbEndIndex; i ++) {
+        m_pfGomSad (&uiGomSad, pGomForegroundBlockNum + j, pVaaCalcResults->pSad8x8[i], pBackgroundMbFlag[i]
+                    && !IS_INTRA (uiRefMbType[i]));
+      }
+
+      iMbStartIndex = iMbEndIndex;   // continue with the next row segment
+      iMbEndIndex = WELS_MIN (iMbEndIndex + iMbWidth , iGomMbEndIndex);
+    } while (--iGomMbRowNum);
+
+    pGomComplexity[j] = uiGomSad;
+    uiFrameSad += pGomComplexity[j];
+  }
+
+  m_sComplexityAnalysisParam.iFrameComplexity = uiFrameSad;
+}
+
+
+/*!
+ * \brief	Per-GOM complexity analysis based on a variance-style measure.
+ *
+ *		For GOM j (a run of iMbNumInGom consecutive macroblocks; the last may be shorter)
+ *		pGomComplexity[j] = S2 - S1 * S1 / N, where S1 and S2 accumulate
+ *		pCalcResult->pSum16x16[i] and pCalcResult->pSumOfSquare16x16[i] over the GOM and
+ *		N = (macroblocks in the GOM) * MB_WIDTH_LUMA * MB_WIDTH_LUMA.
+ *
+ * \param	pSrcPixMap	source picture; only sRect.iRectWidth / iRectHeight are read
+ * \param	pRefPixMap	not used
+ *
+ * \note	N covers every macroblock of the GOM, including GOMs that span several macroblock
+ *		rows. S1 * S1 is formed in 64 bits: it exceeds 32 bits as soon as S1 reaches 65536.
+ *		iMbNumInGom is a divisor and must be non-zero.
+ */
+void CComplexityAnalysis::AnalyzeGomComplexityViaVar (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  int32_t iWidth     = pSrcPixMap->sRect.iRectWidth;
+  int32_t iHeight    = pSrcPixMap->sRect.iRectHeight;
+  int32_t iMbWidth  = iWidth  >> 4;
+  int32_t iMbHeight = iHeight >> 4;
+  int32_t iMbNum    = iMbWidth * iMbHeight;
+
+  int32_t iMbNumInGom = m_sComplexityAnalysisParam.iMbNumInGom;
+  int32_t iGomMbNum = (iMbNum + iMbNumInGom - 1) / iMbNumInGom;
+  int32_t iGomSampleNum = 0;
+
+  int32_t iGomMbStartIndex = 0, iGomMbEndIndex = 0, iGomMbRowNum = 0;
+  int32_t iMbStartIndex = 0, iMbEndIndex = 0;
+
+  SVAACalcResult* pVaaCalcResults = m_sComplexityAnalysisParam.pCalcResult;
+  int32_t*  pGomComplexity = (int32_t*)m_sComplexityAnalysisParam.pGomComplexity;
+
+  uint32_t uiSampleSum = 0, uiSquareSum = 0;
+
+  for (int32_t j = 0; j < iGomMbNum; j ++) {
+    uiSampleSum = 0;
+    uiSquareSum = 0;
+
+    iGomMbStartIndex = j * iMbNumInGom;
+    iGomMbEndIndex = WELS_MIN ((j + 1) * iMbNumInGom, iMbNum);
+    iGomMbRowNum = (iGomMbEndIndex + iMbWidth - 1) / iMbWidth  - iGomMbStartIndex / iMbWidth;
+
+    iMbStartIndex = iGomMbStartIndex;
+    iMbEndIndex = WELS_MIN ((iMbStartIndex / iMbWidth + 1) * iMbWidth, iGomMbEndIndex);
+    iGomSampleNum = (iGomMbEndIndex - iGomMbStartIndex) * MB_WIDTH_LUMA * MB_WIDTH_LUMA;   // N: whole GOM
+
+    do {
+      for (int32_t i = iMbStartIndex; i < iMbEndIndex; i ++) {
+        uiSampleSum += pVaaCalcResults->pSum16x16[i];
+        uiSquareSum += pVaaCalcResults->pSumOfSquare16x16[i];
+      }
+
+      iMbStartIndex = iMbEndIndex;
+      iMbEndIndex = WELS_MIN (iMbEndIndex + iMbWidth, iGomMbEndIndex);
+    } while (--iGomMbRowNum);
+
+    // The square is taken in 64 bits and narrowed only after the division by N.
+    pGomComplexity[j] = uiSquareSum - (uint32_t) ((uint64_t)uiSampleSum * uiSampleSum / iGomSampleNum);
+  }
+}
+
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/complexityanalysis/ComplexityAnalysis.h
@@ -1,0 +1,83 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+* \file	        :  ComplexityAnalysis.h
+*
+* \brief	    :  complexity analysis class of wels video processor class
+*
+* \date         :  2011/03/28
+*
+* \description  :  1. rewrite the package code of complexity analysis class
+*
+*************************************************************************************
+*/
+
+#ifndef WELSVP_COMPLEXITYANALYSIS_H
+#define WELSVP_COMPLEXITYANALYSIS_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+typedef  void (GOMSadFunc) (uint32_t* pGomSad, int32_t* pGomForegroundBlockNum, int32_t* pSad8x8,
+                            uint8_t pBackgroundMbFlag);
+// GOMSadFunc above is a function type (not a pointer); PGOMSadFunc is the matching pointer type.
+typedef GOMSadFunc*   PGOMSadFunc;
+// The two declarations below are function prototypes written through the GOMSadFunc typedef.
+GOMSadFunc      GomSampleSad;
+GOMSadFunc      GomSampleSadExceptBackground;
+
+class CComplexityAnalysis : public IStrategy {   // IStrategy implementation for frame / GOM complexity analysis
+ public:
+  CComplexityAnalysis (int32_t iCpuFlag);   // iCpuFlag is not used by the constructor body
+  ~CComplexityAnalysis();
+  // Set() stores a copy of the caller's parameters, Process() runs the selected mode, Get() returns iFrameComplexity.
+  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef);
+  EResult Set (int32_t iType, void* pParam);
+  EResult Get (int32_t iType, void* pParam);
+  // Frame-level analysis (FRAME_SAD mode).
+ private:
+  void AnalyzeFrameComplexityViaSad (SPixMap* pSrc, SPixMap* pRef);
+  int32_t GetFrameSadExcludeBackground (SPixMap* pSrc, SPixMap* pRef);
+  // GOM-level analysis (GOM_SAD and GOM_VAR modes).
+  void AnalyzeGomComplexityViaSad (SPixMap* pSrc, SPixMap* pRef);
+  void AnalyzeGomComplexityViaVar (SPixMap* pSrc, SPixMap* pRef);
+  // State: the accumulator chosen by InitGomSadFunc and the parameter block copied in by Set().
+ private:
+  PGOMSadFunc m_pfGomSad;
+  SComplexityAnalysisParam m_sComplexityAnalysisParam;
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/denoise/denoise.cpp
@@ -1,0 +1,124 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "denoise.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+#define CALC_BI_STRIDE(iWidth, iBitcount)  ((((iWidth) * (iBitcount) + 31) & ~31) >> 3)
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CDenoiser::CDenoiser (int32_t iCpuFlag) {  // default setup: 3x3 luma window, every plane (Y, U, V) enabled
+  m_CPUFlag = iCpuFlag;
+  m_eMethod   = METHOD_DENOISE;
+  WelsMemset (&m_pfDenoise, 0, sizeof (m_pfDenoise));
+  m_uiSpaceRadius = DENOISE_GRAY_RADIUS;
+  m_fSigmaGrey  = DENOISE_GRAY_SIGMA;
+  m_uiType		 = DENOISE_ALL_COMPONENT;
+  m_uiFilterWindow = 2 * DENOISE_GRAY_RADIUS + 1; m_pGreyWeightTable = NULL;  // give both members a defined value
+  InitDenoiseFunc (m_pfDenoise, m_CPUFlag);  // installs the C kernels, or the SSE2 ones when available
+}
+
+CDenoiser::~CDenoiser() {  // nothing to release: the constructor allocates no resources
+}
+
+void CDenoiser::InitDenoiseFunc (SDenoiseFuncs& denoiser,  int32_t iCpuFlag) {
+  denoiser.pfWaverageChromaFilter8 = WaverageChromaFilter8_c;  // portable C kernels are the baseline
+  denoiser.pfBilateralLumaFilter8 = BilateralLumaFilter8_c;
+#ifdef X86_ASM
+  if ((iCpuFlag & WELS_CPU_SSE2) != 0) {  // SSE2 reported: use the assembly kernels instead
+    denoiser.pfWaverageChromaFilter8 = WaverageChromaFilter8_sse2;
+    denoiser.pfBilateralLumaFilter8 = BilateralLumaFilter8_sse2;
+  }
+#endif
+}
+
+EResult CDenoiser::Process (int32_t iType, SPixMap* pSrc, SPixMap* dst) {
+  // Denoises the planes of pSrc in place; iType and dst are not used by this strategy.
+  uint8_t* const pPlaneY = (uint8_t*)pSrc->pPixel[0];
+  uint8_t* const pPlaneU = (uint8_t*)pSrc->pPixel[1];
+  uint8_t* const pPlaneV = (uint8_t*)pSrc->pPixel[2];
+  if (!pPlaneY || !pPlaneU || !pPlaneV)
+    return RET_INVALIDPARAM;
+  // Chroma planes are processed at half the luma width and height.
+  const int32_t kiLumaWidth = pSrc->sRect.iRectWidth;
+  const int32_t kiLumaHeight = pSrc->sRect.iRectHeight;
+  const int32_t kiChromaWidth = kiLumaWidth >> 1;
+  const int32_t kiChromaHeight = kiLumaHeight >> 1;
+  // m_uiType is a bit mask selecting which planes get filtered.
+  if ((m_uiType & DENOISE_Y_COMPONENT) != 0) {
+    BilateralDenoiseLuma (pPlaneY, kiLumaWidth, kiLumaHeight, pSrc->iStride[0]);
+  }
+  if ((m_uiType & DENOISE_U_COMPONENT) != 0) {
+    WaverageDenoiseChroma (pPlaneU, kiChromaWidth, kiChromaHeight, pSrc->iStride[1]);
+  }
+  if ((m_uiType & DENOISE_V_COMPONENT) != 0) {
+    WaverageDenoiseChroma (pPlaneV, kiChromaWidth, kiChromaHeight, pSrc->iStride[2]);
+  }
+  return RET_SUCCESS;
+}
+
+void CDenoiser::BilateralDenoiseLuma (uint8_t* pSrcY, int32_t iWidth, int32_t iHeight, int32_t iStride) {
+  int32_t w;  // column cursor shared by the 8-wide loop and the scalar tail loop of each row
+  // Filters the luma plane in place; the outer m_uiSpaceRadius rows and columns are left untouched.
+  pSrcY = pSrcY + m_uiSpaceRadius * iStride;
+  for (int32_t h = m_uiSpaceRadius; h < iHeight - m_uiSpaceRadius; h++) {
+    for (w = m_uiSpaceRadius; w < iWidth - m_uiSpaceRadius - TAIL_OF_LINE8; w += 8) {
+      m_pfDenoise.pfBilateralLumaFilter8 (pSrcY + w, iStride);
+    }
+    for (; w < iWidth - m_uiSpaceRadius; w++) {  // resume at w as-is: adding TAIL_OF_LINE8 would always skip this loop
+      Gauss3x3Filter (pSrcY + w, iStride);
+    }
+    pSrcY += iStride;
+  }
+}
+
+void CDenoiser::WaverageDenoiseChroma (uint8_t* pSrcUV, int32_t iWidth, int32_t iHeight, int32_t iStride) {
+  int32_t w;  // column cursor shared by the 8-wide loop and the scalar tail loop of each row
+  // Filters one chroma plane in place; a UV_WINDOWS_RADIUS-pixel border is left untouched on every side.
+  pSrcUV = pSrcUV + UV_WINDOWS_RADIUS * iStride;
+  for (int32_t h = UV_WINDOWS_RADIUS; h < iHeight - UV_WINDOWS_RADIUS; h++) {
+    for (w = UV_WINDOWS_RADIUS; w < iWidth - UV_WINDOWS_RADIUS - TAIL_OF_LINE8; w += 8) {
+      m_pfDenoise.pfWaverageChromaFilter8 (pSrcUV + w, iStride);
+    }
+    // Tail columns: resume at w as-is, because adding TAIL_OF_LINE8 would always skip this loop.
+    for (; w < iWidth - UV_WINDOWS_RADIUS; w++) {
+      Gauss3x3Filter (pSrcUV + w, iStride);
+    }
+    pSrcUV += iStride;
+  }
+}
+
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/denoise/denoise.h
@@ -1,0 +1,111 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  denoise.h
+ *
+ * \brief	    :  denoise class of wels video processor class
+ *
+ * \date        :  2011/03/15
+ *
+ * \description :  1. rewrite the package code of denoise class
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_DENOISE_H
+#define WELSVP_DENOISE_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+
+#define DENOISE_GRAY_RADIUS (1)  // default m_uiSpaceRadius; also the neighbourhood radius in BilateralLumaFilter8_c
+#define DENOISE_GRAY_SIGMA  (2)  // default m_fSigmaGrey
+
+#define UV_WINDOWS_RADIUS   (2)  // radius of the 5x5 chroma window; width of the border skipped on chroma planes
+#define TAIL_OF_LINE8		(7)  // columns held back at the row end so an 8-sample kernel call stays inside the filtered area
+// Bit flags matched against CDenoiser::m_uiType to select the planes to denoise.
+#define DENOISE_Y_COMPONENT (1)
+#define DENOISE_U_COMPONENT (2)
+#define DENOISE_V_COMPONENT (4)
+#define DENOISE_ALL_COMPONENT (7)  // Y | U | V
+
+
+WELSVP_NAMESPACE_BEGIN
+
+void Gauss3x3Filter (uint8_t* pixels, int32_t stride);  // 3x3 Gauss filter applied in place to the single pixel at `pixels`
+// Signature shared by the 8-sample filter kernels; they filter in place starting at `pixels`.
+typedef void (DenoiseFilterFunc) (uint8_t* pixels, int32_t stride);
+
+typedef DenoiseFilterFunc* DenoiseFilterFuncPtr;
+// Portable C kernels (defined in denoise_filter.cpp).
+DenoiseFilterFunc     BilateralLumaFilter8_c;
+DenoiseFilterFunc     WaverageChromaFilter8_c;
+// SSE2 kernels, declared only when X86_ASM is defined and wrapped in the WELSVP_EXTERN_C_* macros.
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+DenoiseFilterFunc     BilateralLumaFilter8_sse2 ;
+DenoiseFilterFunc     WaverageChromaFilter8_sse2 ;
+WELSVP_EXTERN_C_END
+#endif
+// Kernel table filled in by CDenoiser::InitDenoiseFunc.
+typedef  struct TagDenoiseFuncs {
+  DenoiseFilterFuncPtr	pfBilateralLumaFilter8;  // luma kernel, handles 8 samples per call
+  DenoiseFilterFuncPtr	pfWaverageChromaFilter8;  // chroma kernel, handles 8 samples per call
+} SDenoiseFuncs;
+
+class CDenoiser : public IStrategy {  // in-place denoising strategy for the Y, U and V planes
+ public:
+  CDenoiser (int32_t iCpuFlag);
+  ~CDenoiser();
+  // Filters the planes of pSrc in place; returns RET_INVALIDPARAM when a plane pointer is NULL.
+  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* dst);
+
+ private:
+  void InitDenoiseFunc (SDenoiseFuncs& pf, int32_t cpu);  // picks the C or SSE2 kernels from the CPU flags
+  void BilateralDenoiseLuma (uint8_t* p_y_data, int32_t width, int32_t height, int32_t stride);
+  void WaverageDenoiseChroma (uint8_t* pSrcUV, int32_t width, int32_t height, int32_t stride);
+
+ private:
+  float_t	 m_fSigmaGrey;			// sigma for grey-scale similarity; suggested range 2.5-3
+  uint32_t  m_uiFilterWindow;				// filter window diameter
+  uint16_t	 m_uiSpaceRadius;			// filter window radius: 1 = 3x3, 2 = 5x5, 3 = 7x7; larger windows are slower
+  uint16_t	 m_uiType;					// planes to denoise, as a bit mask: 1-Y, 2-U, 4-V; 7-YUV, 3-YU, 5-YV, 6-UV
+  uint32_t*  m_pGreyWeightTable;		// weight table for grey scale; no table is allocated in denoise.cpp
+
+  SDenoiseFuncs m_pfDenoise;  // kernels selected by InitDenoiseFunc
+  int32_t      m_CPUFlag;  // CPU feature flags passed to the constructor
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/denoise/denoise_filter.cpp
@@ -1,0 +1,127 @@
+/*!
+ * \copy
+ *     Copyright (c)  2010-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	denoise_filter.cpp
+ *
+ * \brief	svc denoising
+ *
+ * \date	4/1/2010 Created
+ *
+ */
+
+#include "denoise.h"
+#include "../common/typedef.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+void	BilateralLumaFilter8_c (uint8_t* pSample, int32_t iStride) {
+  // Bilateral-style 3x3 filter applied to the 8 consecutive samples starting at pSample.
+  // Every neighbour within 32 grey levels of the centre contributes with a weight that
+  // shrinks as the difference grows; the centre sample receives whatever is left of 256.
+  // Outputs are staged in a buffer, so none of the 8 results depends on another one.
+  uint8_t aFiltered[8];
+
+  for (int32_t iIdx = 0; iIdx < 8; ++iIdx) {
+    const uint8_t* pCentre = pSample + iIdx;
+    const int32_t kiCentre = *pCentre;
+    const uint8_t* pRow = pCentre - iStride - DENOISE_GRAY_RADIUS;
+    int32_t iWeightedSum = 0;
+    int32_t iNeighbourWeight = 0;
+
+    for (int32_t iRow = 0; iRow < 3; ++iRow, pRow += iStride) {
+      for (int32_t iCol = 0; iCol < 3; ++iCol) {
+        if (iRow == 1 && iCol == 1)
+          continue;                       // the centre is weighted after the loops
+        const int32_t kiNeighbour = pRow[iCol];
+        const int32_t kiCloseness = 32 - WELS_ABS (kiNeighbour - kiCentre);
+        if (kiCloseness >= 0) {
+          const int32_t kiWeight = (kiCloseness * kiCloseness) >> 5;
+          iWeightedSum += kiNeighbour * kiWeight;
+          iNeighbourWeight += kiWeight;
+        }
+      }
+    }
+    // The centre weight tops the total up to 256, so the shift by 8 normalises the sum.
+    iWeightedSum += kiCentre * (256 - iNeighbourWeight);
+    aFiltered[iIdx] = iWeightedSum >> 8;
+  }
+  WelsMemcpy (pSample, aFiltered, 8);
+}
+
+
+/***************************************************************************
+5x5 filter:
+1	1	2	1	1
+1	2	4	2	1
+2	4	20	4	2
+1	2	4	2	1
+1	1	2	1	1
+***************************************************************************/
+#define SUM_LINE1(pSample)	(pSample[0] +(pSample[1]) +(pSample[2]<<1)  + pSample[3] + pSample[4])
+#define SUM_LINE2(pSample)	(pSample[0] +(pSample[1]<<1) +(pSample[2]<<2)  +(pSample[3]<<1) +pSample[4])
+#define SUM_LINE3(pSample)	((pSample[0]<<1) +(pSample[1]<<2) +(pSample[2]*20)  +(pSample[3]<<2) +(pSample[4]<<1))
+void	WaverageChromaFilter8_c (uint8_t* pSample, int32_t iStride) {
+  // Applies the 5x5 weighted average documented above to the 8 samples starting at pSample.
+  // The kernel weights add up to 64, hence the final shift by 6.
+  uint8_t* pTopLeft = pSample - UV_WINDOWS_RADIUS * iStride - UV_WINDOWS_RADIUS;
+  uint8_t aFiltered[8];
+  for (int32_t iIdx = 0; iIdx < 8; ++iIdx, ++pTopLeft) {
+    uint8_t* pRow0 = pTopLeft;
+    uint8_t* pRow1 = pRow0 + iStride;
+    uint8_t* pRow2 = pRow1 + iStride;
+    uint8_t* pRow3 = pRow2 + iStride;
+    uint8_t* pRow4 = pRow3 + iStride;
+    const int32_t kiSum = SUM_LINE1 (pRow0) + SUM_LINE2 (pRow1) + SUM_LINE3 (pRow2)
+                          + SUM_LINE2 (pRow3) + SUM_LINE1 (pRow4);
+    aFiltered[iIdx] = (kiSum >> 6);
+  }
+  // Results were staged in a buffer, so every output is computed from the unmodified current row.
+  WelsMemcpy (pSample, aFiltered, 8);
+}
+
+/***************************************************************************
+edge of y/uv use a 3x3 Gauss filter, radius = 1:
+1	2	1
+2	4	2
+1	2	1
+***************************************************************************/
+void	Gauss3x3Filter (uint8_t* pSrc, int32_t iStride) {
+  // Replaces *pSrc with the 3x3 weighted average (1 2 1 / 2 4 2 / 1 2 1) of its neighbourhood.
+  const uint8_t* pAbove = pSrc - iStride;
+  const uint8_t* pBelow = pSrc + iStride;
+
+  const int32_t kiCorners = pAbove[-1] + pAbove[1] + pBelow[-1] + pBelow[1];
+  const int32_t kiEdges = pAbove[0] + pSrc[-1] + pSrc[1] + pBelow[0];
+  const int32_t kiCentre = pSrc[0];
+
+  *pSrc = (kiCorners + (kiEdges << 1) + (kiCentre << 2)) >> 4;  // the weights sum to 16
+}
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/downsample/downsample.cpp
@@ -1,0 +1,135 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "downsample.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CDownsampling::CDownsampling (int32_t iCpuFlag) {  // stores the CPU flags and installs the downsampling kernels
+  m_eMethod   = METHOD_DOWNSAMPLE;
+  m_iCPUFlag = iCpuFlag;
+  WelsMemset (&m_pfDownsample, 0, sizeof (m_pfDownsample));
+  InitDownsampleFuncs (m_pfDownsample, iCpuFlag);
+}
+
+CDownsampling::~CDownsampling() {  // nothing to release: the constructor allocates no resources
+}
+
+void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc,  int32_t iCpuFlag) {
+  sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsampler_c;  // slots 0..3 are indexed by GetAlignedIndex()
+  sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_c;
+  sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_c;
+  sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_c;
+  sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsampler_c;
+  sDownsampleFunc.pfGeneralRatioLuma	 = GeneralBilinearFastDownsampler_c;
+#if defined(X86_ASM)  // every SIMD override below is commented out, so the C kernels above stay selected
+  if (iCpuFlag & WELS_CPU_SSE) {
+  /*  sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_sse;
+    sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_sse;
+    sDownsampleFunc.pfHalfAverage[2]	= DyadicBilinearDownsamplerWidthx8_sse;*/
+  }
+  if (iCpuFlag & WELS_CPU_SSE2) {
+  //  sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2;
+  //  sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearFastDownsamplerWrap_sse2;
+  }
+  if (iCpuFlag & WELS_CPU_SSSE3) {
+  //  sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_ssse3;
+  //  sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_ssse3;
+  }
+  if (iCpuFlag & WELS_CPU_SSE41) {
+  //  sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_sse4;
+  //  sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_sse4;
+  }
+#endif//X86_ASM
+
+}
+
+EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDstPixMap) {
+  // Downsamples the three planes of pSrcPixMap into pDstPixMap; iType is not used here. Returns
+  // RET_INVALIDPARAM unless the destination is non-empty and strictly smaller than the source.
+  int32_t iSrcWidthY = pSrcPixMap->sRect.iRectWidth;
+  int32_t iSrcHeightY = pSrcPixMap->sRect.iRectHeight;
+  int32_t iDstWidthY = pDstPixMap->sRect.iRectWidth;
+  int32_t iDstHeightY = pDstPixMap->sRect.iRectHeight;
+
+  int32_t iSrcWidthUV = iSrcWidthY >> 1;  // chroma planes are half-size in both directions
+  int32_t iSrcHeightUV = iSrcHeightY >> 1;
+  int32_t iDstWidthUV = iDstWidthY >> 1;
+  int32_t iDstHeightUV = iDstHeightY >> 1;
+
+  // The general-ratio kernels divide by the destination size, so an empty destination is rejected.
+  if (iDstWidthY <= 0 || iDstHeightY <= 0 || iSrcWidthY <= iDstWidthY || iSrcHeightY <= iDstHeightY) {
+    return RET_INVALIDPARAM;
+  }
+
+  if ((iSrcWidthY >> 1) == iDstWidthY && (iSrcHeightY >> 1) == iDstHeightY) {
+    // Exact 2:1 ratio: use the half-average kernel matching the source width alignment.
+    int32_t iAlignIndex = GetAlignedIndex (iSrcWidthY);
+    m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
+        (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
+    iAlignIndex = GetAlignedIndex (iSrcWidthUV);
+    m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
+        (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
+    m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
+        (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
+  } else {
+    m_pfDownsample.pfGeneralRatioLuma ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0], iDstWidthY, iDstHeightY,
+                                       (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
+    if (iDstWidthUV > 0 && iDstHeightUV > 0) {  // a luma plane 1 pixel wide or high has no chroma samples to produce
+      m_pfDownsample.pfGeneralRatioChroma ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1], iDstWidthUV, iDstHeightUV,
+                                           (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
+      m_pfDownsample.pfGeneralRatioChroma ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2], iDstWidthUV, iDstHeightUV,
+                                           (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
+    }
+  }
+  return RET_SUCCESS;
+}
+
+int32_t CDownsampling::GetAlignedIndex (const int32_t kiSrcWidth) {
+  // Maps the source width to its pfHalfAverage slot:
+  //   0 - multiple of 32, 1 - multiple of 16, 2 - multiple of 8, 3 - anything else.
+  if (! (kiSrcWidth & 0x1f))
+    return 0;
+  if (! (kiSrcWidth & 0x0f))
+    return 1;
+  if (! (kiSrcWidth & 0x07))
+    return 2;
+
+  return 3;
+}
+
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/downsample/downsample.h
@@ -1,0 +1,128 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  downsample.h
+ *
+ * \brief	    :  downsample class of wels video processor class
+ *
+ * \date        :  2011/03/33
+ *
+ * \description :  1. rewrite the package code of downsample class
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_DOWNSAMPLE_H
+#define WELSVP_DOWNSAMPLE_H
+
+#include "../common/util.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+typedef void (HalveDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride,
+                                    uint8_t* pSrc, const int32_t kiSrcStride,
+                                    const int32_t kiSrcWidth, const int32_t kiSrcHeight);  // 2:1 kernel: only the source size is passed
+// Arbitrary-ratio kernel: both the destination and the source sizes are passed.
+typedef void (GeneralDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+                                      const int32_t kiDstHeight,
+                                      uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight);
+
+typedef HalveDownsampleFunc*		PHalveDownsampleFunc;
+typedef GeneralDownsampleFunc*	PGeneralDownsampleFunc;
+// Portable C implementations (defined in downsamplefuncs.cpp).
+HalveDownsampleFunc   DyadicBilinearDownsampler_c;
+GeneralDownsampleFunc GeneralBilinearFastDownsampler_c;
+GeneralDownsampleFunc GeneralBilinearAccurateDownsampler_c;
+
+typedef struct {
+  // pfHalfAverage index (see CDownsampling::GetAlignedIndex): 0 = width multiple of 32; 1 = of 16; 2 = of 8; 3 = any other width
+  PHalveDownsampleFunc			pfHalfAverage[4];
+  PGeneralDownsampleFunc		pfGeneralRatioLuma;  // arbitrary-ratio kernel used for the Y plane
+  PGeneralDownsampleFunc		pfGeneralRatioChroma;  // arbitrary-ratio kernel used for the U and V planes
+} SDownsampleFuncs;
+
+
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+// SSE: source width is a multiple of 8 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx8_sse;
+// SSE: source width is a multiple of 16 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx16_sse;
+// SSE: source width is a multiple of 32 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx32_sse;
+// SSSE3: source width is a multiple of 16 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx16_ssse3;
+// SSSE3: source width is a multiple of 32 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx32_ssse3;
+// SSE4: source width is a multiple of 16 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx16_sse4;
+// SSE4: source width is a multiple of 32 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx32_sse4;
+// Wrappers around the two _sse2 routines below; their definitions in downsamplefuncs.cpp are currently commented out.
+GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
+GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
+// SSE2 routines; unlike GeneralDownsampleFunc they also take precomputed x/y scale factors.
+void GeneralBilinearFastDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+    const int32_t kiDstHeight,
+    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
+    const uint32_t kuiScaleX, const uint32_t kuiScaleY);
+void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+    const int32_t kiDstHeight,
+    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
+    const uint32_t kuiScaleX, const uint32_t kuiScaleY);
+WELSVP_EXTERN_C_END
+#endif
+
+
+
+
+class CDownsampling : public IStrategy {  // downsampling strategy for the Y, U and V planes
+ public:
+  CDownsampling (int32_t iCpuFlag);
+  ~CDownsampling();
+  // Downsamples pSrc into pDst; returns RET_INVALIDPARAM unless pDst is smaller than pSrc in both dimensions.
+  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst);
+
+ private:
+  void InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int32_t iCpuFlag);  // fills the kernel table
+  // Returns the pfHalfAverage slot for a source width: 0 = x32, 1 = x16, 2 = x8, 3 = other.
+  int32_t GetAlignedIndex (const int32_t kiSrcWidth);
+
+ private:
+  SDownsampleFuncs m_pfDownsample;  // kernels selected by InitDownsampleFuncs
+  int32_t  m_iCPUFlag;  // CPU feature flags passed to the constructor
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/downsample/downsamplefuncs.cpp
@@ -1,0 +1,234 @@
+/*!
+ * \copy
+ *     Copyright (c)  2008-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  downsamplefuncs.cpp
+ *
+ *  Abstract
+ *      Implementation for source yuv data downsampling used before spatial encoding.
+ *
+ *  History
+ *      10/24/2008 Created
+ *
+ *****************************************************************************/
+
+#include "../common/typedef.h"
+#include "../common/util.h"
+#include "downsample.h"
+
+
+WELSVP_NAMESPACE_BEGIN
+
+
+void DyadicBilinearDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
+                                  uint8_t* pSrc, const int32_t kiSrcStride,
+                                  const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+  // 2:1 downsampler: every output pixel is built from a 2x2 source block, as the
+  // rounded mean of the two rounded row means.
+  const int32_t kiOutWidth = kiSrcWidth >> 1;
+  const int32_t kiOutHeight = kiSrcHeight >> 1;
+  uint8_t* pOutRow = pDst;
+  const uint8_t* pTopRow = pSrc;
+
+  for (int32_t iRow = 0; iRow < kiOutHeight; ++iRow) {
+    const uint8_t* pBottomRow = pTopRow + kiSrcStride;
+
+    for (int32_t iCol = 0; iCol < kiOutWidth; ++iCol) {
+      const int32_t kiLeft = iCol << 1;
+      const int32_t kiTopMean = (pTopRow[kiLeft] + pTopRow[kiLeft + 1] + 1) >> 1;
+      const int32_t kiBottomMean = (pBottomRow[kiLeft] + pBottomRow[kiLeft + 1] + 1) >> 1;
+      pOutRow[iCol] = (uint8_t) ((kiTopMean + kiBottomMean + 1) >> 1);
+    }
+    pOutRow += kiDstStride;
+    pTopRow += kiSrcStride << 1;
+  }
+}
+// Arbitrary-ratio bilinear downsampler, "fast" variant: all weight arithmetic stays in 32-bit integers.
+void GeneralBilinearFastDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+                                       const int32_t kiDstHeight,
+                                       uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+  const uint32_t kuiScaleBitWidth = 16, kuiScaleBitHeight = 15;  // fractional bits of the x and y source positions
+  const uint32_t kuiScaleWidth = (1 << kuiScaleBitWidth), kuiScaleHeight = (1 << kuiScaleBitHeight);
+  int32_t fScalex = (int32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiScaleWidth);  // x step per output pixel, fixed point
+  int32_t fScaley = (int32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiScaleHeight);  // y step per output row, fixed point
+  uint32_t x;
+  int32_t iYInverse, iXInverse;  // current source y / x position in fixed point
+
+  uint8_t* pByDst = pDst;
+  uint8_t* pByLineDst = pDst;
+  // Both positions start at 0.5 in fixed point and advance by the scale step.
+  iYInverse = 1 << (kuiScaleBitHeight - 1);
+  for (int32_t i = 0; i < kiDstHeight - 1; i++) {  // every row but the last is interpolated
+    int32_t iYy = iYInverse >> kuiScaleBitHeight;  // integer source row
+    int32_t fv = iYInverse & (kuiScaleHeight - 1);  // fractional part of the row position
+
+    uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
+
+    pByDst = pByLineDst;
+    iXInverse = 1 << (kuiScaleBitWidth - 1);
+    for (int32_t j = 0; j < kiDstWidth - 1; j++) {  // every column but the last is interpolated
+      int32_t iXx = iXInverse >> kuiScaleBitWidth;  // integer source column
+      int32_t iFu = iXInverse & (kuiScaleWidth - 1);  // fractional part of the column position
+
+      uint8_t* pByCurrent = pBySrc + iXx;
+      uint8_t a, b, c, d;  // 2x2 source neighbourhood: a b / c d
+
+      a = *pByCurrent;
+      b = * (pByCurrent + 1);
+      c = * (pByCurrent + kiSrcStride);
+      d = * (pByCurrent + kiSrcStride + 1);
+      // Each weight is (x fraction * y fraction) >> 16, so the four weights total roughly 2^15.
+      x  = (((uint32_t) (kuiScaleWidth - 1 - iFu)) * (kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * a;
+      x += (((uint32_t) (iFu)) * (kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * b;
+      x += (((uint32_t) (kuiScaleWidth - 1 - iFu)) * (fv) >> kuiScaleBitWidth) * c;
+      x += (((uint32_t) (iFu)) * (fv) >> kuiScaleBitWidth) * d;
+      x >>= (kuiScaleBitHeight - 1);  // this and the next two lines: divide by 2^15 with rounding
+      x += 1;
+      x >>= 1;
+      //x = (((__int64)(SCALE_BIG - 1 - iFu))*(SCALE_BIG - 1 - fv)*a + ((__int64)iFu)*(SCALE_BIG - 1 -fv)*b + ((__int64)(SCALE_BIG - 1 -iFu))*fv*c +
+      //		 ((__int64)iFu)*fv*d + (1 << (2*SCALE_BIT_BIG-1)) ) >> (2*SCALE_BIT_BIG);
+      x = WELS_CLAMP (x, 0, 255);
+      *pByDst++ = (uint8_t)x;
+
+      iXInverse += fScalex;
+    }
+    *pByDst = * (pBySrc + (iXInverse >> kuiScaleBitWidth));  // last column: source pixel copied, not interpolated
+    pByLineDst += kiDstStride;
+    iYInverse += fScaley;
+  }
+
+  // last row special: source pixels are copied at the integer positions, without interpolation
+  {
+    int32_t iYy = iYInverse >> kuiScaleBitHeight;
+    uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
+
+    pByDst = pByLineDst;
+    iXInverse = 1 << (kuiScaleBitWidth - 1);
+    for (int32_t j = 0; j < kiDstWidth; j++) {
+      int32_t iXx = iXInverse >> kuiScaleBitWidth;
+      *pByDst++ = * (pBySrc + iXx);
+
+      iXInverse += fScalex;
+    }
+  }
+}
+// Arbitrary-ratio bilinear downsampler, "accurate" variant: the weighted sum is formed in 64-bit integers.
+void GeneralBilinearAccurateDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+    const int32_t kiDstHeight,
+    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+  const int32_t kiScaleBit = 15;  // fractional bits of both the x and y source positions
+  const int32_t kiScale = (1 << kiScaleBit);
+  int32_t iScalex = (int32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kiScale);  // x step per output pixel, fixed point
+  int32_t iScaley = (int32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kiScale);  // y step per output row, fixed point
+  int64_t x;
+  int32_t iYInverse, iXInverse;  // current source y / x position in fixed point
+
+  uint8_t* pByDst = pDst;
+  uint8_t* pByLineDst = pDst;
+  // Both positions start at 0.5 in fixed point and advance by the scale step.
+  iYInverse = 1 << (kiScaleBit - 1);
+  for (int32_t i = 0; i < kiDstHeight - 1; i++) {  // every row but the last is interpolated
+    int32_t iYy = iYInverse >> kiScaleBit;  // integer source row
+    int32_t iFv = iYInverse & (kiScale - 1);  // fractional part of the row position
+
+    uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
+
+    pByDst = pByLineDst;
+    iXInverse = 1 << (kiScaleBit - 1);
+    for (int32_t j = 0; j < kiDstWidth - 1; j++) {  // every column but the last is interpolated
+      int32_t iXx = iXInverse >> kiScaleBit;  // integer source column
+      int32_t iFu = iXInverse & (kiScale - 1);  // fractional part of the column position
+
+      uint8_t* pByCurrent = pBySrc + iXx;
+      uint8_t a, b, c, d;  // 2x2 source neighbourhood: a b / c d
+
+      a = *pByCurrent;
+      b = * (pByCurrent + 1);
+      c = * (pByCurrent + kiSrcStride);
+      d = * (pByCurrent + kiSrcStride + 1);
+      // Weighted sum of a, b, c, d plus half of 2^30 for rounding, then divided by 2^30.
+      x = (((int64_t) (kiScale - 1 - iFu)) * (kiScale - 1 - iFv) * a + ((int64_t)iFu) * (kiScale - 1 - iFv) * b + ((int64_t) (
+             kiScale - 1 - iFu)) * iFv * c +
+           ((int64_t)iFu) * iFv * d + (int64_t) (1 << (2 * kiScaleBit - 1))) >> (2 * kiScaleBit);
+      x = WELS_CLAMP (x, 0, 255);
+      *pByDst++ = (uint8_t)x;
+
+      iXInverse += iScalex;
+    }
+    *pByDst = * (pBySrc + (iXInverse >> kiScaleBit));  // last column: source pixel copied, not interpolated
+    pByLineDst += kiDstStride;
+    iYInverse += iScaley;
+  }
+
+  // last row special: source pixels are copied at the integer positions, without interpolation
+  {
+    int32_t iYy = iYInverse >> kiScaleBit;
+    uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
+
+    pByDst = pByLineDst;
+    iXInverse = 1 << (kiScaleBit - 1);
+    for (int32_t j = 0; j < kiDstWidth; j++) {
+      int32_t iXx = iXInverse >> kiScaleBit;
+      *pByDst++ = * (pBySrc + iXx);
+
+      iXInverse += iScalex;
+    }
+  }
+}
+
+
+#ifdef X86_ASM
+//void GeneralBilinearFastDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+//    const int32_t kiDstHeight,
+//    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+//  const int32_t kiScaleBitWidth = 16, kiScaleBitHeight = 15;
+//  const uint32_t kuiScaleWidth = (1 << kiScaleBitWidth), kuiScaleHeight = (1 << kiScaleBitHeight);
+//
+//  uint32_t uiScalex = (uint32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiScaleWidth);
+//  uint32_t uiScaley = (uint32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiScaleHeight);
+//
+//  GeneralBilinearFastDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
+//                                       pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);
+//}
+//
+//void GeneralBilinearAccurateDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+//    const int32_t kiDstHeight,
+//    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+//  const int32_t kiScaleBit = 15;
+//  const uint32_t kuiScale = (1 << kiScaleBit);
+//
+//  uint32_t uiScalex = (uint32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiScale);
+//  uint32_t uiScaley = (uint32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiScale);
+//
+//  GeneralBilinearAccurateDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
+//      pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);
+//}
+#endif //X86_ASM
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/imagerotate/imagerotate.cpp
@@ -1,0 +1,93 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "imagerotate.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Stores the CPU flags, then clears and fills the table of rotation kernels.
+CImageRotating::CImageRotating (int32_t iCpuFlag) : m_iCPUFlag (iCpuFlag) {
+  m_eMethod = METHOD_IMAGE_ROTATE;
+  WelsMemset (&m_pfRotateImage, 0, sizeof (m_pfRotateImage));
+  InitImageRotateFuncs (m_pfRotateImage, m_iCPUFlag);
+}
+
+CImageRotating::~CImageRotating() {  // empty body: no cleanup is performed here
+}
+
+void CImageRotating::InitImageRotateFuncs (SImageRotateFuncs& sImageRotateFuncs, int32_t iCpuFlag) {
+  // iCpuFlag is accepted but not consulted: only the C kernels are installed (90, 180, 270 degrees).
+  const SImageRotateFuncs ksCFuncs = { ImageRotate90D_c, ImageRotate180D_c, ImageRotate270D_c };
+  sImageRotateFuncs = ksCFuncs;
+}
+EResult CImageRotating::ProcessImageRotate (int32_t iType, uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth,
+    uint32_t iHeight, uint8_t* pDst) {
+  // iType is the angle in degrees; anything other than 90/180/270 is rejected without touching pDst.
+  ImageRotateFuncPtr pfRotate = 0;
+  switch (iType) {
+  case 90:  pfRotate = m_pfRotateImage.pfImageRotate90D;  break;
+  case 180: pfRotate = m_pfRotateImage.pfImageRotate180D; break;
+  case 270: pfRotate = m_pfRotateImage.pfImageRotate270D; break;
+  default:  return RET_NOTSUPPORTED;
+  }
+  pfRotate (pSrc, uiBytesPerPixel, iWidth, iHeight, pDst);
+  return RET_SUCCESS;
+}
+
+// Rotates the image in pSrc into pDst by iType degrees (90, 180 or 270; other angles give RET_NOTSUPPORTED).
+EResult CImageRotating::Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst) {
+  EResult eReturn = RET_INVALIDPARAM;
+  // Fix: the kernels take BYTES per pixel, so divide the bit count by 8 (the old code multiplied by 8).
+  const uint32_t kuiBytesPerPixel = pSrc->iSizeInBits >> 3;  // assumes iSizeInBits is bits per pixel - TODO confirm
+
+  if ((pSrc->eFormat == VIDEO_FORMAT_RGBA) || (pSrc->eFormat == VIDEO_FORMAT_BGRA) ||
+      (pSrc->eFormat == VIDEO_FORMAT_ABGR) || (pSrc->eFormat == VIDEO_FORMAT_ARGB)) {
+    eReturn = ProcessImageRotate (iType, (uint8_t*)pSrc->pPixel[0], kuiBytesPerPixel, pSrc->sRect.iRectWidth,
+                                  pSrc->sRect.iRectHeight, (uint8_t*)pDst->pPixel[0]);
+  } else if (pSrc->eFormat == VIDEO_FORMAT_I420) {  // plane 0 at full size, planes 1 and 2 at half width/height
+    ProcessImageRotate (iType, (uint8_t*)pSrc->pPixel[0], kuiBytesPerPixel, pSrc->sRect.iRectWidth,
+                        pSrc->sRect.iRectHeight, (uint8_t*)pDst->pPixel[0]);
+    ProcessImageRotate (iType, (uint8_t*)pSrc->pPixel[1], kuiBytesPerPixel, (pSrc->sRect.iRectWidth >> 1),
+                        (pSrc->sRect.iRectHeight >> 1), (uint8_t*)pDst->pPixel[1]);
+    eReturn = ProcessImageRotate (iType, (uint8_t*)pSrc->pPixel[2], kuiBytesPerPixel, (pSrc->sRect.iRectWidth >> 1),
+                                  (pSrc->sRect.iRectHeight >> 1), (uint8_t*)pDst->pPixel[2]);
+  } else {
+    eReturn = RET_NOTSUPPORTED;
+  }
+  return eReturn;
+}
+
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/imagerotate/imagerotate.h
@@ -1,0 +1,85 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  imagerotate.h
+ *
+ * \brief	    :  image rotate class of wels video processor class
+ *
+ * \date        :  2011/04/06
+ *
+ * \description :
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_IMAGEROTATE_H
+#define WELSVP_IMAGEROTATE_H
+
+#include "../common/util.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+typedef void (ImageRotateFunc) (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight,
+                                uint8_t* pDst);
+// Pointer to a rotation kernel with the ImageRotateFunc signature.
+typedef ImageRotateFunc*		ImageRotateFuncPtr;
+// C rotation kernels, one per supported angle, declared through the function typedef above.
+ImageRotateFunc   ImageRotate90D_c;
+ImageRotateFunc   ImageRotate180D_c;
+ImageRotateFunc   ImageRotate270D_c;
+// Table of the kernels CImageRotating calls, one slot per angle.
+typedef struct {
+  ImageRotateFuncPtr		pfImageRotate90D;
+  ImageRotateFuncPtr		pfImageRotate180D;
+  ImageRotateFuncPtr		pfImageRotate270D;
+} SImageRotateFuncs;
+
+class CImageRotating : public IStrategy {  // strategy that rotates an image by 90, 180 or 270 degrees
+ public:
+  CImageRotating (int32_t iCpuFlag);
+  ~CImageRotating();
+  // Rotates pSrc into pDst; iType is the angle in degrees.
+  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst);
+
+ private:
+  void InitImageRotateFuncs (SImageRotateFuncs& pf, int32_t iCpuFlag);
+  EResult ProcessImageRotate (int32_t iType, uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight,
+                              uint8_t* pDst);
+
+ private:
+  SImageRotateFuncs m_pfRotateImage;  // rotation kernels installed by InitImageRotateFuncs
+  int32_t          m_iCPUFlag;  // CPU feature flags given to the constructor
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/imagerotate/imagerotatefuncs.cpp
@@ -1,0 +1,66 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  imagerotatefuncs.cpp
+ *
+ *  Created on 11-2-21.
+ *
+ */
+
+#include "imagerotate.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+// Turns a tightly packed iWidth x iHeight image 90 degrees clockwise; pDst is filled as iHeight x iWidth.
+void ImageRotate90D_c (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t* pDst) {
+  for (uint32_t uiRow = 0; uiRow < iHeight; ++uiRow)
+    for (uint32_t uiCol = 0; uiCol < iWidth; ++uiCol)
+      for (uint32_t uiByte = 0; uiByte < uiBytesPerPixel; ++uiByte)
+        pDst[ (uiCol * iHeight + iHeight - 1 - uiRow) * uiBytesPerPixel + uiByte] =
+          pSrc[ (iWidth * uiRow + uiCol) * uiBytesPerPixel + uiByte];
+}
+// Turns a tightly packed iWidth x iHeight image by 180 degrees: pixel (x, y) lands at (iWidth-1-x, iHeight-1-y).
+void ImageRotate180D_c (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t* pDst) {
+  for (uint32_t uiRow = 0; uiRow < iHeight; ++uiRow)
+    for (uint32_t uiCol = 0; uiCol < iWidth; ++uiCol)
+      for (uint32_t uiByte = 0; uiByte < uiBytesPerPixel; ++uiByte)
+        pDst[ ((iHeight - 1 - uiRow) * iWidth + iWidth - 1 - uiCol) * uiBytesPerPixel + uiByte] =
+          pSrc[ (iWidth * uiRow + uiCol) * uiBytesPerPixel + uiByte];
+}
+// Turns a tightly packed iWidth x iHeight image 90 degrees counter-clockwise; pDst is filled as iHeight x iWidth.
+void ImageRotate270D_c (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t* pDst) {
+  for (uint32_t uiCol = 0; uiCol < iWidth; ++uiCol)
+    for (uint32_t uiRow = 0; uiRow < iHeight; ++uiRow)
+      for (uint32_t uiByte = 0; uiByte < uiBytesPerPixel; ++uiByte)
+        pDst[ ((iWidth - 1 - uiCol) * iHeight + uiRow) * uiBytesPerPixel + uiByte] =
+          pSrc[ (iWidth * uiRow + uiCol) * uiBytesPerPixel + uiByte];
+}
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/scenechangedetection/SceneChangeDetection.cpp
@@ -1,0 +1,136 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "SceneChangeDetection.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+#define HIGH_MOTION_BLOCK_THRESHOLD 320
+#define SCENE_CHANGE_MOTION_RATIO	0.85f
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Builds the detector: clears the stored result and selects the SAD kernel for iCpuFlag.
+CSceneChangeDetection::CSceneChangeDetection (int32_t iCpuFlag)
+  : m_pfSad (NULL), m_iCpuFlag (iCpuFlag) {
+  m_eMethod = METHOD_SCENE_CHANGE_DETECTION;  // not declared in this class, so it is assigned rather than initialized
+  WelsMemset (&m_sSceneChangeParam, 0, sizeof (m_sSceneChangeParam));
+  InitSadFuncs (m_pfSad, m_iCpuFlag);
+}
+
+CSceneChangeDetection::~CSceneChangeDetection() {  // empty body: no cleanup is performed here
+}
+
+// Decides whether pSrcPixMap is a scene change relative to pRefPixMap.
+//   iType      - not used by this method
+//   pSrcPixMap - current picture; its sRect supplies the width and height that are scanned
+//   pRefPixMap - reference picture, read at the same block positions
+// Plane 0 of both pictures is compared in 8x8 blocks. The verdict is stored in
+// m_sSceneChangeParam.bSceneChangeFlag (fetch it with Get); the return value is always RET_SUCCESS.
+// Neither pointer is checked for NULL here.
+EResult CSceneChangeDetection::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  // Only whole 8x8 blocks are counted; leftover columns/rows of the rectangle are not examined.
+  const int32_t kiWidth        = pSrcPixMap->sRect.iRectWidth;
+  const int32_t kiHeight       = pSrcPixMap->sRect.iRectHeight;
+  const int32_t kiBlocksPerRow = kiWidth  >> 3;
+  const int32_t kiBlockRows    = kiHeight >> 3;
+  const int32_t kiBlockTotal   = kiBlocksPerRow * kiBlockRows;
+
+  // Number of high-motion blocks required to declare a scene change.
+  const int32_t kiSceneChangeThreshold = WelsStaticCast (int32_t,
+                                         SCENE_CHANGE_MOTION_RATIO * kiBlockTotal + 0.5f + PESN);
+
+  // Plane-0 stride of each picture, and the step between consecutive rows of blocks (eight strides).
+  const int32_t kiRefStride    = pRefPixMap->iStride[0];
+  const int32_t kiCurStride    = pSrcPixMap->iStride[0];
+  const int32_t kiRefRowStride = pRefPixMap->iStride[0] << 3;
+  const int32_t kiCurRowStride = pSrcPixMap->iStride[0] << 3;
+
+  // Start of the current row of blocks in each picture.
+  uint8_t* pRefRow = (uint8_t*)pRefPixMap->pPixel[0];
+  uint8_t* pCurRow = (uint8_t*)pSrcPixMap->pPixel[0];
+  int32_t iMotionBlockNum = 0;
+
+  // Drop the verdict of any earlier call before scanning.
+  m_sSceneChangeParam.bSceneChangeFlag = 0;
+
+  for (int32_t iBlockY = 0; iBlockY < kiBlockRows; ++iBlockY) {
+    uint8_t* pRefBlock = pRefRow;
+    uint8_t* pCurBlock = pCurRow;
+
+    for (int32_t iBlockX = 0; iBlockX < kiBlocksPerRow; ++iBlockX) {
+      // A block is "high motion" when its SAD exceeds HIGH_MOTION_BLOCK_THRESHOLD.
+      if (m_pfSad (pRefBlock, kiRefStride, pCurBlock, kiCurStride) > HIGH_MOTION_BLOCK_THRESHOLD) {
+        ++iMotionBlockNum;
+      }
+      pRefBlock += 8;
+      pCurBlock += 8;
+    }
+
+    pRefRow += kiRefRowStride;
+    pCurRow += kiCurRowStride;
+  }
+
+  if (iMotionBlockNum >= kiSceneChangeThreshold) {
+    m_sSceneChangeParam.bSceneChangeFlag = 1;
+  }
+  return RET_SUCCESS;
+}
+
+
+// Copies the stored scene-change result into the SSceneChangeResult that pParam points to.
+EResult CSceneChangeDetection::Get (int32_t iType, void* pParam) {
+  SSceneChangeResult* pResult = (SSceneChangeResult*)pParam;
+  if (NULL == pResult) {
+    return RET_INVALIDPARAM;
+  }
+  *pResult = m_sSceneChangeParam;  // iType is not consulted
+  return RET_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+
+void CSceneChangeDetection::InitSadFuncs (SadFuncPtr& pfSad,  int32_t iCpuFlag) {
+  pfSad = WelsSampleSad8x8_c;  // the C kernel is the default
+  // Builds with X86_ASM switch to the SSE2 kernel when iCpuFlag carries WELS_CPU_SSE2.
+#ifdef X86_ASM
+  if (iCpuFlag & WELS_CPU_SSE2) {
+    pfSad = WelsSampleSad8x8_sse21;
+  }
+#endif
+}
+
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/scenechangedetection/SceneChangeDetection.h
@@ -1,0 +1,72 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+* \file	        :  SceneChangeDetection.h
+*
+* \brief	    :  scene change detection class of wels video processor class
+*
+* \date         :  2011/03/14
+*
+* \description  :  1. rewrite the package code of scene change detection class
+*
+*************************************************************************************
+*/
+
+#ifndef WELSVP_SCENECHANGEDETECTION_H
+#define WELSVP_SCENECHANGEDETECTION_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+#include "SceneChangeDetectionCommon.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+class CSceneChangeDetection : public IStrategy {  // scene-change detector based on 8x8 block SADs
+ public:
+  CSceneChangeDetection (int32_t iCpuFlag);  // iCpuFlag is used to select the SAD kernel
+  ~CSceneChangeDetection();
+  // Process compares pSrc against pRef and stores the verdict; Get copies that verdict into *pParam.
+  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef);
+  EResult Get (int32_t iType, void* pParam);
+
+ private:
+  void InitSadFuncs (SadFuncPtr& pfSadFunc, int32_t iCpuFlag);  // picks the C or SSE2 8x8 SAD kernel
+
+ private:
+  SadFuncPtr m_pfSad;  // 8x8 SAD kernel in use
+  int32_t    m_iCpuFlag;  // CPU feature flags given to the constructor
+  SSceneChangeResult m_sSceneChangeParam;  // result written by Process
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/scenechangedetection/SceneChangeDetectionCommon.cpp
@@ -1,0 +1,60 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "SceneChangeDetectionCommon.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+// Sum of absolute differences between two 8x8 blocks of bytes, each walked with its own row stride.
+int32_t WelsSampleSad8x8_c (uint8_t* pSrcY, int32_t iSrcStrideY, uint8_t* pRefY, int32_t iRefStrideY) {
+  uint8_t* pRowA = pSrcY;
+  uint8_t* pRowB = pRefY;
+  int32_t iTotal = 0;
+
+  for (int32_t iRow = 0; iRow < 8; ++iRow) {
+    int32_t iRowSad = 0;
+
+    // The uint8_t operands are promoted to int before subtracting, so the difference may be negative.
+    for (int32_t iCol = 0; iCol < 8; ++iCol) {
+      iRowSad += WELS_ABS ((pRowA[iCol] - pRowB[iCol]));
+    }
+
+    iTotal += iRowSad;
+    pRowA += iSrcStrideY;
+    pRowB += iRefStrideY;
+  }
+  return iTotal;
+}
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/scenechangedetection/SceneChangeDetectionCommon.h
@@ -1,0 +1,65 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	        :  SceneChangeDetectionCommon.h
+ *
+ * \brief	    :  scene change detection class of wels video processor class
+ *
+ * \date         :  2011/03/14
+ *
+ * \description  :  1. rewrite the package code of scene change detection class
+ *
+ */
+
+#ifndef WELSVP_SCENECHANGEDETECTIONCOMMON_H
+#define WELSVP_SCENECHANGEDETECTIONCOMMON_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+typedef  int32_t (SadFunc) (uint8_t* pSrcY, int32_t iSrcStrideY, uint8_t* pRefY, int32_t iRefStrideY);
+// Pointer to a function with the SadFunc signature.
+typedef SadFunc*   SadFuncPtr;
+// C implementation of the 8x8 SAD (defined in SceneChangeDetectionCommon.cpp).
+SadFunc      WelsSampleSad8x8_c;
+// Declared only in X86_ASM builds; the definition is not part of this header.
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+SadFunc      WelsSampleSad8x8_sse21;
+WELSVP_EXTERN_C_END
+#endif
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/vaacalc/vaacalcfuncs.cpp
@@ -1,0 +1,595 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "../common/typedef.h"
+#include "../common/util.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+// For every whole 16x16 macroblock, in raster order: writes the SADs of its four 8x8 blocks to pSad8x8 (four
+// entries per macroblock), the sum and squared sum of its current-picture pixels to pSum16x16 / psqsum16x16, and
+// the sum of squared cur/ref differences to psqdiff16x16. *pFrameSad receives the total of all 8x8 SADs.
+void VAACalcSadSsd_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+                      int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* psqsum16x16, int32_t* psqdiff16x16) {
+  uint8_t* tmp_ref = pRefData;
+  uint8_t* tmp_cur = pCurData;
+  int32_t iMbWidth = (iPicWidth >> 4);
+  int32_t mb_height = (iPicHeight >> 4);
+  int32_t mb_index = 0;
+  int32_t pic_stride_x8 = iPicStride << 3;
+  int32_t step = (iPicStride << 4) - (iMbWidth << 4);  // fix: a macroblock row advances iMbWidth * 16 pixels, not iPicWidth
+
+  *pFrameSad = 0;
+  for (int32_t i = 0; i < mb_height; i ++) {
+    for (int32_t j = 0; j < iMbWidth; j ++) {
+      int32_t k, l;
+      int32_t l_sad, l_sqdiff, l_sum, l_sqsum;
+      uint8_t* tmp_cur_row;
+      uint8_t* tmp_ref_row;
+      pSum16x16[mb_index] = 0;
+      psqsum16x16[mb_index] = 0;
+      psqdiff16x16[mb_index] = 0;
+      // top-left 8x8 block
+      l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur;
+      tmp_ref_row = tmp_ref;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sqdiff += diff * diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 0] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      // top-right 8x8 block
+      l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur + 8;
+      tmp_ref_row = tmp_ref + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sqdiff += diff * diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 1] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      // bottom-left 8x8 block
+      l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8;
+      tmp_ref_row = tmp_ref + pic_stride_x8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sqdiff += diff * diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 2] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      // bottom-right 8x8 block
+      l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
+      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sqdiff += diff * diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 3] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      tmp_ref += 16;
+      tmp_cur += 16;
+      ++mb_index;
+    }
+    tmp_ref += step;
+    tmp_cur += step;
+  }
+}
+// For every whole 16x16 macroblock, in raster order: writes the SADs of its four 8x8 blocks to pSad8x8 (four
+// entries per macroblock) and the sum and squared sum of its current-picture pixels to pSum16x16 / psqsum16x16.
+// *pFrameSad receives the total of all 8x8 SADs.
+void VAACalcSadVar_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+                      int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* psqsum16x16) {
+  uint8_t* tmp_ref = pRefData;
+  uint8_t* tmp_cur = pCurData;
+  int32_t iMbWidth = (iPicWidth >> 4);
+  int32_t mb_height = (iPicHeight >> 4);
+  int32_t mb_index = 0;
+  int32_t pic_stride_x8 = iPicStride << 3;
+  int32_t step = (iPicStride << 4) - (iMbWidth << 4);  // fix: a macroblock row advances iMbWidth * 16 pixels, not iPicWidth
+
+  *pFrameSad = 0;
+  for (int32_t i = 0; i < mb_height; i ++) {
+    for (int32_t j = 0; j < iMbWidth; j ++) {
+      int32_t k, l;
+      int32_t l_sad, l_sum, l_sqsum;
+      uint8_t* tmp_cur_row;
+      uint8_t* tmp_ref_row;
+      pSum16x16[mb_index] = 0;
+      psqsum16x16[mb_index] = 0;
+      // top-left 8x8 block
+      l_sad =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur;
+      tmp_ref_row = tmp_ref;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 0] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      // top-right 8x8 block
+      l_sad =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur + 8;
+      tmp_ref_row = tmp_ref + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 1] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      // bottom-left 8x8 block
+      l_sad =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8;
+      tmp_ref_row = tmp_ref + pic_stride_x8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 2] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      // bottom-right 8x8 block
+      l_sad =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
+      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 3] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      tmp_ref += 16;
+      tmp_cur += 16;
+      ++mb_index;
+    }
+    tmp_ref += step;
+    tmp_cur += step;
+  }
+}
+
+
+// For every whole 16x16 macroblock, in raster order: writes the SADs of its four 8x8 blocks to pSad8x8 (four
+// entries per macroblock). *pFrameSad receives the total of all 8x8 SADs.
+void VAACalcSad_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+                   int32_t* pFrameSad, int32_t* pSad8x8) {
+  uint8_t* tmp_ref = pRefData;
+  uint8_t* tmp_cur = pCurData;
+  int32_t iMbWidth = (iPicWidth >> 4);
+  int32_t mb_height = (iPicHeight >> 4);
+  int32_t mb_index = 0;
+  int32_t pic_stride_x8 = iPicStride << 3;
+  int32_t step = (iPicStride << 4) - (iMbWidth << 4);  // fix: a macroblock row advances iMbWidth * 16 pixels, not iPicWidth
+  *pFrameSad = 0;
+  for (int32_t i = 0; i < mb_height; i ++) {
+    for (int32_t j = 0; j < iMbWidth; j ++) {
+      int32_t k, l;
+      int32_t l_sad;
+      uint8_t* tmp_cur_row;
+      uint8_t* tmp_ref_row;
+      // top-left 8x8 block
+      l_sad =  0;
+      tmp_cur_row = tmp_cur;
+      tmp_ref_row = tmp_ref;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 0] = l_sad;
+      // top-right 8x8 block
+      l_sad =  0;
+      tmp_cur_row = tmp_cur + 8;
+      tmp_ref_row = tmp_ref + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 1] = l_sad;
+      // bottom-left 8x8 block
+      l_sad =  0;
+      tmp_cur_row = tmp_cur + pic_stride_x8;
+      tmp_ref_row = tmp_ref + pic_stride_x8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 2] = l_sad;
+      // bottom-right 8x8 block
+      l_sad =  0;
+      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
+      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 3] = l_sad;
+      tmp_ref += 16;
+      tmp_cur += 16;
+      ++mb_index;
+    }
+    tmp_ref += step;
+    tmp_cur += step;
+  }
+}
+
+void VAACalcSadSsdBgd_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
+                         int32_t iPicStride,
+                         int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* psqsum16x16, int32_t* psqdiff16x16, int32_t* pSd8x8,
+                         uint8_t* pMad8x8)
+// Per 8x8 block: SAD, signed difference sum (pSd8x8), max |diff| (pMad8x8); per MB: pixel sum, square sum, squared-diff sum.
+{
+  uint8_t* tmp_ref = pRefData;
+  uint8_t* tmp_cur = pCurData;
+  int32_t iMbWidth = (iPicWidth >> 4);      // whole 16x16 macroblocks per row
+  int32_t mb_height = (iPicHeight >> 4);    // whole macroblock rows
+  int32_t mb_index = 0;
+  int32_t pic_stride_x8 = iPicStride << 3;  // offset from the upper to the lower pair of 8x8 blocks
+  int32_t step = (iPicStride << 4) - (iMbWidth << 4);  // undo the columns walked, then move down 16 rows
+  // step rewinds iMbWidth * 16 (not iPicWidth): the two differ when the width is not a multiple of 16.
+  *pFrameSad = 0;
+  for (int32_t i = 0; i < mb_height; i ++) {
+    for (int32_t j = 0; j < iMbWidth; j ++) {
+      int32_t k, l;
+      int32_t l_sad, l_sqdiff, l_sum, l_sqsum, l_sd, l_mad;
+      uint8_t* tmp_cur_row;
+      uint8_t* tmp_ref_row;
+      // per-macroblock accumulators are built up from the four quadrants below
+      pSum16x16[mb_index] = 0;
+      psqsum16x16[mb_index] = 0;
+      psqdiff16x16[mb_index] = 0;
+      // quadrant 0: top-left 8x8 block
+      l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur;
+      tmp_ref_row = tmp_ref;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+
+          l_sd += diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+          l_sad += abs_diff;
+          l_sqdiff += abs_diff * abs_diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 0] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      pSd8x8[ (mb_index << 2) + 0] = l_sd;
+      pMad8x8[ (mb_index << 2) + 0] = l_mad;
+
+      // quadrant 1: top-right 8x8 block
+      l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur + 8;
+      tmp_ref_row = tmp_ref + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+
+          l_sd += diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+          l_sad += abs_diff;
+          l_sqdiff += abs_diff * abs_diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 1] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      pSd8x8[ (mb_index << 2) + 1] = l_sd;
+      pMad8x8[ (mb_index << 2) + 1] = l_mad;
+      // quadrant 2: bottom-left 8x8 block
+      l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8;
+      tmp_ref_row = tmp_ref + pic_stride_x8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+
+          l_sd += diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+          l_sad += abs_diff;
+          l_sqdiff += abs_diff * abs_diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 2] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      pSd8x8[ (mb_index << 2) + 2] = l_sd;
+      pMad8x8[ (mb_index << 2) + 2] = l_mad;
+      // quadrant 3: bottom-right 8x8 block
+      l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
+      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+
+          l_sd += diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+          l_sad += abs_diff;
+          l_sqdiff += abs_diff * abs_diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 3] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      pSd8x8[ (mb_index << 2) + 3] = l_sd;
+      pMad8x8[ (mb_index << 2) + 3] = l_mad;
+      // advance to the next macroblock in this row
+      tmp_ref += 16;
+      tmp_cur += 16;
+      ++mb_index;
+    }
+    tmp_ref += step;
+    tmp_cur += step;
+  }
+}
+
+void VAACalcSadBgd_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+                      int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSd8x8, uint8_t* pMad8x8) {  // per 8x8: SAD, signed diff sum, max |diff|
+  uint8_t* tmp_ref = pRefData;
+  uint8_t* tmp_cur = pCurData;
+  int32_t iMbWidth = (iPicWidth >> 4);      // whole 16x16 macroblocks per row
+  int32_t mb_height = (iPicHeight >> 4);    // whole macroblock rows
+  int32_t mb_index = 0;
+  int32_t pic_stride_x8 = iPicStride << 3;  // offset from the upper to the lower pair of 8x8 blocks
+  int32_t step = (iPicStride << 4) - (iMbWidth << 4);  // undo the columns walked, then move down 16 rows
+  // step rewinds iMbWidth * 16 (not iPicWidth): the two differ when the width is not a multiple of 16.
+  *pFrameSad = 0;
+  for (int32_t i = 0; i < mb_height; i ++) {
+    for (int32_t j = 0; j < iMbWidth; j ++) {
+      int32_t k, l;
+      int32_t l_sad, l_sd, l_mad;
+      uint8_t* tmp_cur_row;
+      uint8_t* tmp_ref_row;
+      // quadrant 0: top-left 8x8 block
+      l_mad = l_sd = l_sad =  0;
+      tmp_cur_row = tmp_cur;
+      tmp_ref_row = tmp_ref;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+          l_sd += diff;
+          l_sad += abs_diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 0] = l_sad;
+      pSd8x8[ (mb_index << 2) + 0] = l_sd;
+      pMad8x8[ (mb_index << 2) + 0] = l_mad;
+      // quadrant 1: top-right 8x8 block
+      l_mad = l_sd = l_sad =  0;
+      tmp_cur_row = tmp_cur + 8;
+      tmp_ref_row = tmp_ref + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+          l_sd += diff;
+          l_sad += abs_diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 1] = l_sad;
+      pSd8x8[ (mb_index << 2) + 1] = l_sd;
+      pMad8x8[ (mb_index << 2) + 1] = l_mad;
+      // quadrant 2: bottom-left 8x8 block
+      l_mad = l_sd = l_sad =  0;
+      tmp_cur_row = tmp_cur + pic_stride_x8;
+      tmp_ref_row = tmp_ref + pic_stride_x8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+          l_sd += diff;
+          l_sad += abs_diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 2] = l_sad;
+      pSd8x8[ (mb_index << 2) + 2] = l_sd;
+      pMad8x8[ (mb_index << 2) + 2] = l_mad;
+      // quadrant 3: bottom-right 8x8 block
+      l_mad = l_sd = l_sad =  0;
+      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
+      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+          l_sd += diff;
+          l_sad += abs_diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 3] = l_sad;
+      pSd8x8[ (mb_index << 2) + 3] = l_sd;
+      pMad8x8[ (mb_index << 2) + 3] = l_mad;
+      // advance to the next macroblock in this row
+      tmp_ref += 16;
+      tmp_cur += 16;
+      ++mb_index;
+    }
+    tmp_ref += step;
+    tmp_cur += step;
+  }
+}
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/vaacalc/vaacalculation.cpp
@@ -1,0 +1,123 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "vaacalculation.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CVAACalculation::CVAACalculation (int32_t iCpuFlag) {
+  // Start from a zeroed dispatch table and zeroed parameters, then bind the kernels.
+  WelsMemset (&m_sVaaFuncs, 0, sizeof (m_sVaaFuncs));
+  WelsMemset (&m_sCalcParam, 0, sizeof (m_sCalcParam));
+  m_eMethod  = METHOD_VAA_STATISTICS;
+  m_iCPUFlag = iCpuFlag;
+  InitVaaFuncs (m_sVaaFuncs, iCpuFlag);
+}
+
+CVAACalculation::~CVAACalculation() {  // empty body: no cleanup is performed here
+}
+
+void CVAACalculation::InitVaaFuncs (SVaaFuncs& sVaaFuncs, int32_t iCpuFlag) {  // bind every kernel slot to its C version
+  sVaaFuncs.pfVAACalcSadBgd    = VAACalcSadBgd_c;
+  sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_c;
+  sVaaFuncs.pfVAACalcSad       = VAACalcSad_c;
+  sVaaFuncs.pfVAACalcSadVar    = VAACalcSadVar_c;
+  sVaaFuncs.pfVAACalcSadSsd    = VAACalcSadSsd_c;
+#ifdef X86_ASM
+  if ((iCpuFlag & WELS_CPU_SSE2) == WELS_CPU_SSE2) {
+    // NOTE(review): the SSE2 overrides listed below are disabled, so the C kernels
+    // stay selected even when SSE2 is reported -- confirm whether that is intentional.
+    /* pfVAACalcSad    = VAACalcSad_sse2;     pfVAACalcSadBgd    = VAACalcSadBgd_sse2;
+       pfVAACalcSadSsd = VAACalcSadSsd_sse2;  pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_sse2;
+       pfVAACalcSadVar = VAACalcSadVar_sse2; */
+  }
+#endif//X86_ASM
+}
+
+EResult CVAACalculation::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {  // run the configured VAA kernel
+  uint8_t* pCurData = (uint8_t*)pSrcPixMap->pPixel[0];
+  uint8_t* pRefData = (uint8_t*)pRefPixMap->pPixel[0];
+  int32_t iPicWidth = pSrcPixMap->sRect.iRectWidth;
+  int32_t iPicHeight = pSrcPixMap->sRect.iRectHeight;
+  int32_t iPicStride = pSrcPixMap->iStride[0];  // also applied to the reference plane -- assumes equal strides; TODO confirm
+  // pCalcResult stays NULL until Set() has succeeded (the constructor zeroes m_sCalcParam).
+  SVAACalcResult* pResult = m_sCalcParam.pCalcResult;
+  // Reject missing planes and a missing result buffer before anything is written through pResult.
+  if (pCurData == NULL || pRefData == NULL || pResult == NULL) {
+    return RET_INVALIDPARAM;
+  }
+  // Record the analysed planes, then dispatch on the iCalcBgd / iCalcSsd / iCalcVar flags.
+  pResult->pCurY = pCurData;
+  pResult->pRefY = pRefData;
+  if (m_sCalcParam.iCalcBgd) {
+    if (m_sCalcParam.iCalcSsd) {
+      m_sVaaFuncs.pfVAACalcSadSsdBgd (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+                                      (int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16, pResult->pSsd16x16,
+                                      (int32_t*)pResult->pSumOfDiff8x8, (uint8_t*)pResult->pMad8x8);
+    } else {
+      m_sVaaFuncs.pfVAACalcSadBgd (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+                                   (int32_t*) (pResult->pSad8x8), (int32_t*) (pResult->pSumOfDiff8x8), (uint8_t*)pResult->pMad8x8);
+    }
+  } else {
+    if (m_sCalcParam.iCalcSsd) {
+      m_sVaaFuncs.pfVAACalcSadSsd (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+                                   (int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16, pResult->pSsd16x16);
+    } else {
+      if (m_sCalcParam.iCalcVar) {
+        m_sVaaFuncs.pfVAACalcSadVar (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+                                     (int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16);
+      } else {
+        m_sVaaFuncs.pfVAACalcSad (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+                                  (int32_t*)pResult->pSad8x8);
+      }
+    }
+  }
+  // iType is not consulted by this method.
+  return RET_SUCCESS;
+}
+
+EResult CVAACalculation::Set (int32_t iType, void* pParam) {
+  // pParam is read as an SVAACalcParam; it must be non-NULL and carry a result buffer.
+  SVAACalcParam* pCalcParam = (SVAACalcParam*)pParam;
+  if (NULL == pCalcParam || NULL == pCalcParam->pCalcResult) {
+    return RET_INVALIDPARAM;
+  }
+  m_sCalcParam = *pCalcParam;  // copied by value; iType is not consulted
+  return RET_SUCCESS;
+}
+
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/vaacalc/vaacalculation.h
@@ -1,0 +1,125 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  vaacalculation.h
+ *
+ * \brief	    :  VAA calculation strategy class of the Wels video processor
+ *
+ * \date        :  2011/03/18
+ *
+ * \description :  1. rewrite the package code of the VAA calculation class
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_VAACALCULATION_H
+#define WELSVP_VAACALCULATION_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+typedef void (VAACalcSadBgdFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
+                                  int32_t iPicStride,  // outputs: frame SAD, per-8x8 SAD, per-8x8 signed diff sum, per-8x8 max |diff|
+                                  int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSd8x8, uint8_t* pMad8x8);
+// As VAACalcSadBgdFunc, plus per-16x16 pixel sum, sum of squares and squared-difference sum.
+typedef void (VAACalcSadSsdBgdFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
+                                     int32_t iPicStride,
+                                     int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* pSumSquare16x16,
+                                     int32_t* pSsd16x16, int32_t* pSd8x8, uint8_t* pMad8x8);
+// SAD only: frame total plus one value per 8x8 block.
+typedef void (VAACalcSadFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
+                               int32_t iPicStride,
+                               int32_t* pFrameSad, int32_t* pSad8x8);
+// SAD plus per-16x16 pixel sum and sum of squares.
+typedef void (VAACalcSadVarFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
+                                  int32_t iPicStride,
+                                  int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* pSumSquare16x16);
+// Same parameters as VAACalcSadVarFunc with an additional pSsd16x16 output.
+typedef void (VAACalcSadSsdFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
+                                  int32_t iPicStride,
+                                  int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* pSumSquare16x16, int32_t* pSsd16x16);
+
+// Pointer-to-function aliases for the function types above.
+typedef VAACalcSadBgdFunc*		 PVAACalcSadBgdFunc;
+typedef VAACalcSadSsdBgdFunc*	 PVAACalcSadSsdBgdFunc;
+typedef VAACalcSadFunc*			 PVAACalcSadFunc;
+typedef VAACalcSadVarFunc*		 PVAACalcSadVarFunc;
+typedef VAACalcSadSsdFunc*		 PVAACalcSadSsdFunc;
+
+typedef  struct TagVaaFuncs {  // kernel dispatch table, one pointer per VAA calculation variant
+  PVAACalcSadBgdFunc		pfVAACalcSadBgd;
+  PVAACalcSadSsdBgdFunc	pfVAACalcSadSsdBgd;
+  PVAACalcSadFunc			pfVAACalcSad;
+  PVAACalcSadVarFunc		pfVAACalcSadVar;
+  PVAACalcSadSsdFunc		pfVAACalcSadSsd;
+} SVaaFuncs;
+
+
+VAACalcSadBgdFunc		VAACalcSadBgd_c;  // C kernels, declared through the function typedefs
+VAACalcSadSsdBgdFunc	VAACalcSadSsdBgd_c;
+VAACalcSadFunc			    VAACalcSad_c;
+VAACalcSadVarFunc		VAACalcSadVar_c;
+VAACalcSadSsdFunc		VAACalcSadSsd_c;
+
+// SSE2 counterparts, declared only for X86_ASM builds and wrapped in the extern-C macros.
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+VAACalcSadBgdFunc		VAACalcSadBgd_sse2;
+VAACalcSadSsdBgdFunc	VAACalcSadSsdBgd_sse2;
+VAACalcSadFunc			    VAACalcSad_sse2;
+VAACalcSadVarFunc		VAACalcSadVar_sse2;
+VAACalcSadSsdFunc		VAACalcSadSsd_sse2;
+WELSVP_EXTERN_C_END
+#endif
+
+class CVAACalculation : public IStrategy {  // IStrategy implementation that runs the VAA kernels
+ public:
+  CVAACalculation (int32_t iCpuFlag);  // iCpuFlag: CPU feature bits forwarded to InitVaaFuncs
+  ~CVAACalculation();
+  // Process runs the configured kernel on the two pix maps; Set installs an SVAACalcParam.
+  EResult Process (int32_t iType, SPixMap* pCurPixMap, SPixMap* pRefPixMap);
+  EResult Set (int32_t iType, void* pParam);
+
+ private:
+  void InitVaaFuncs (SVaaFuncs& sVaaFunc, int32_t iCpuFlag);  // fills the kernel dispatch table
+
+ private:
+  SVaaFuncs      m_sVaaFuncs;   // kernel dispatch table
+  int32_t       m_iCPUFlag;     // CPU feature bits given to the constructor
+  SVAACalcParam m_sCalcParam;   // last parameters accepted by Set()
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/targets.mk
@@ -1,0 +1,114 @@
+PROCESSING_PREFIX=PROCESSING
+PROCESSING_SRCDIR=codec/processing
+PROCESSING_CPP_SRCS=\
+	$(PROCESSING_SRCDIR)/./src/adaptivequantization/AdaptiveQuantization.cpp\
+	$(PROCESSING_SRCDIR)/./src/backgounddetection/BackgroundDetection.cpp\
+	$(PROCESSING_SRCDIR)/./src/common/cpu.cpp\
+	$(PROCESSING_SRCDIR)/./src/common/memory.cpp\
+	$(PROCESSING_SRCDIR)/./src/common/thread.cpp\
+	$(PROCESSING_SRCDIR)/./src/common/util.cpp\
+	$(PROCESSING_SRCDIR)/./src/common/WelsFrameWork.cpp\
+	$(PROCESSING_SRCDIR)/./src/common/WelsFrameWorkEx.cpp\
+	$(PROCESSING_SRCDIR)/./src/complexityanalysis/ComplexityAnalysis.cpp\
+	$(PROCESSING_SRCDIR)/./src/denoise/denoise.cpp\
+	$(PROCESSING_SRCDIR)/./src/denoise/denoise_filter.cpp\
+	$(PROCESSING_SRCDIR)/./src/downsample/downsample.cpp\
+	$(PROCESSING_SRCDIR)/./src/downsample/downsamplefuncs.cpp\
+	$(PROCESSING_SRCDIR)/./src/imagerotate/imagerotate.cpp\
+	$(PROCESSING_SRCDIR)/./src/imagerotate/imagerotatefuncs.cpp\
+	$(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetection.cpp\
+	$(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetectionCommon.cpp\
+	$(PROCESSING_SRCDIR)/./src/vaacalc/vaacalcfuncs.cpp\
+	$(PROCESSING_SRCDIR)/./src/vaacalc/vaacalculation.cpp\
+
+PROCESSING_OBJS += $(PROCESSING_CPP_SRCS:.cpp=.o)
+ifeq ($(USE_ASM), Yes)
+PROCESSING_ASM_SRCS=\
+	$(PROCESSING_SRCDIR)/./src/asm/denoisefilter.asm\
+	$(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.asm\
+	$(PROCESSING_SRCDIR)/./src/asm/intra_pred.asm\
+	$(PROCESSING_SRCDIR)/./src/asm/sad.asm\
+	$(PROCESSING_SRCDIR)/./src/asm/vaa.asm\
+
+PROCESSING_OBJS += $(PROCESSING_ASM_SRCS:.asm=.o)
+endif
+# One explicit rule per object; each recipe names its target as $@ and its source as $<.
+OBJS += $(PROCESSING_OBJS)
+$(PROCESSING_SRCDIR)/./src/adaptivequantization/AdaptiveQuantization.o: $(PROCESSING_SRCDIR)/./src/adaptivequantization/AdaptiveQuantization.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/backgounddetection/BackgroundDetection.o: $(PROCESSING_SRCDIR)/./src/backgounddetection/BackgroundDetection.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/common/cpu.o: $(PROCESSING_SRCDIR)/./src/common/cpu.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/common/memory.o: $(PROCESSING_SRCDIR)/./src/common/memory.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/common/thread.o: $(PROCESSING_SRCDIR)/./src/common/thread.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/common/util.o: $(PROCESSING_SRCDIR)/./src/common/util.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/common/WelsFrameWork.o: $(PROCESSING_SRCDIR)/./src/common/WelsFrameWork.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/common/WelsFrameWorkEx.o: $(PROCESSING_SRCDIR)/./src/common/WelsFrameWorkEx.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/complexityanalysis/ComplexityAnalysis.o: $(PROCESSING_SRCDIR)/./src/complexityanalysis/ComplexityAnalysis.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/denoise/denoise.o: $(PROCESSING_SRCDIR)/./src/denoise/denoise.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/denoise/denoise_filter.o: $(PROCESSING_SRCDIR)/./src/denoise/denoise_filter.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/downsample/downsample.o: $(PROCESSING_SRCDIR)/./src/downsample/downsample.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/downsample/downsamplefuncs.o: $(PROCESSING_SRCDIR)/./src/downsample/downsamplefuncs.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/imagerotate/imagerotate.o: $(PROCESSING_SRCDIR)/./src/imagerotate/imagerotate.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/imagerotate/imagerotatefuncs.o: $(PROCESSING_SRCDIR)/./src/imagerotate/imagerotatefuncs.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetection.o: $(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetection.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetectionCommon.o: $(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetectionCommon.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/vaacalc/vaacalcfuncs.o: $(PROCESSING_SRCDIR)/./src/vaacalc/vaacalcfuncs.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/vaacalc/vaacalculation.o: $(PROCESSING_SRCDIR)/./src/vaacalc/vaacalculation.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+# Assembly objects; they join PROCESSING_OBJS only when USE_ASM is Yes.
+$(PROCESSING_SRCDIR)/./src/asm/denoisefilter.o: $(PROCESSING_SRCDIR)/./src/asm/denoisefilter.asm
+	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.o: $(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.asm
+	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/asm/intra_pred.o: $(PROCESSING_SRCDIR)/./src/asm/intra_pred.asm
+	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/asm/sad.o: $(PROCESSING_SRCDIR)/./src/asm/sad.asm
+	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/asm/vaa.o: $(PROCESSING_SRCDIR)/./src/asm/vaa.asm
+	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $@ $<
+# Static archive of the processing objects; any existing archive is removed before it is recreated.
+$(LIBPREFIX)processing.$(LIBSUFFIX): $(PROCESSING_OBJS)
+	rm -f $@
+	$(AR) cr $@ $(PROCESSING_OBJS)
+# Register the archive with the "libraries" goal and the LIBRARIES list.
+libraries: $(LIBPREFIX)processing.$(LIBSUFFIX)
+LIBRARIES += $(LIBPREFIX)processing.$(LIBSUFFIX)
--- a/processing/build/linux/makefile
+++ /dev/null
@@ -1,94 +1,0 @@
-NASM = 1
-NAME      = libwelsvp
-
-OUTDIR    = ../../../bin/linux
-BINDIR    = ../../bin
-OBJDIR    = ../../obj
-SRCDIRS   = ../../src/asm \
-            ../../src/common \
-            ../../src/adaptivequantization \
-            ../../src/backgounddetection \
-            ../../src/denoise \
-            ../../src/downsample \
-            ../../src/scenechangedetection \
-            ../../src/vaacalc \
-            ../../src/complexityanalysis
-SRCDIRS  += ../../src/imagerotate
-
-
-TARGETLIB =  $(BINDIR)/$(NAME).so
-
-CC        = $(shell which gcc)
-AS        = $(shell which nasm)
-GCC       = gcc -m32
-
-CPPFLAGS  = -Wall -g -O3
-ifeq ($(NASM), 1)
-CPPFLAGS += -DX86_ASM
-endif
-ASMFLAGS  = -f elf -DNOPREFIX  -I ../../src/asm/
-LDFLAGS   = -lstdc++ -ldl
-
-SRCEXTS  = .cpp
-ifeq ($(NASM), 1)
-SRCEXTS += .asm
-endif
-HDREXTS  = .h
-SOURCES  = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS))))
-HEADERS  = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(HDREXTS))))
-SRC_CPP  = $(filter %.cpp,$(SOURCES))
-SRC_ASM  = $(filter %.asm,$(SOURCES))
-OBJS     = $(addsuffix .o, $(basename $(SOURCES)))
-DEPS     = $(OBJS:.o=.d)
-
-DEP_OPT  = $(shell if `$(CC) --version | grep "GCC" >/dev/null`; then \
-                  echo "-MM -MP"; else echo "-M"; fi )
-DEPEND_cpp.d  = $(subst -g ,,$(CC) $(DEP_OPT) $(CPPFLAGS))
-DEPEND_asm.d  = $(subst -g ,,$(AS) $(DEP_OPT) $(ASMFLAGS))
-COMPILE.cpp   = $(GCC) $(CPPFLAGS) -c
-COMPILE.asm   = $(AS)  $(ASMFLAGS)
-LINK          = $(GCC) $(LDFLAGS)
-
-.PHONY: all objs tags ctags clean distclean
-
-.SUFFIXES:
-
-all: $(TARGETLIB)
-
-%.d:%.cpp
-	@echo -n $(dir $<) > $@
-	@$(DEPEND_cpp.d) $< >> $@
-
-%.d:%.asm
-	@echo -n $(dir $<) > $@
-	@$(DEPEND_asm.d) $< >> $@
-
-objs:$(OBJS)
-
-%.o:%.cpp
-	$(COMPILE.cpp) $< -o $@
-
-%.o:%.asm
-	$(COMPILE.asm) $< -o $@
-
-tags: $(HEADERS) $(SOURCES)
-	etags $(HEADERS) $(SOURCES)
-
-ctags: $(HEADERS) $(SOURCES)
-	ctags $(HEADERS) $(SOURCES)
-
-$(TARGETLIB):$(OBJS)
-	@if test ! -d $(BINDIR) ; then mkdir -p $(BINDIR) ; fi
-	$(LINK) $(OBJS) -shared -Wl,-Bsymbolic -o $@
-	@echo produce the lib to $(TARGETLIB).
-	@if test ! -d $(OUTDIR) ; then mkdir -p $(OUTDIR) ; fi
-	@cp -f $(TARGETLIB) $(OUTDIR)
-	@cp -f $(TARGETLIB) ../../../testbin
-	@echo copy the lib to $(OUTDIR).
-
-clean:
-	rm -f $(OBJS) $(TARGETLIB)
-
-distclean: clean
-	rm -f $(DEPS) TAGS
-
--- a/processing/build/win32/WelsVP_2008.sln
+++ /dev/null
@@ -1,20 +1,0 @@
-
-Microsoft Visual Studio Solution File, Format Version 10.00
-# Visual Studio 2008
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP", "WelsVP_2008.vcproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|Win32 = Debug|Win32
-		Release|Win32 = Release|Win32
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
-		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
-		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
-		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
-	EndGlobalSection
-	GlobalSection(SolutionProperties) = preSolution
-		HideSolutionNode = FALSE
-	EndGlobalSection
-EndGlobal
binary files a/processing/build/win32/WelsVP_2008.suo /dev/null differ
--- a/processing/build/win32/WelsVP_2008.vcproj
+++ /dev/null
@@ -1,900 +1,0 @@
-<?xml version="1.0" encoding="gb2312"?>
-<VisualStudioProject
-	ProjectType="Visual C++"
-	Version="9.00"
-	Name="WelsVP"
-	ProjectGUID="{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
-	RootNamespace="WelsVP"
-	Keyword="Win32Proj"
-	TargetFrameworkVersion="196613"
-	>
-	<Platforms>
-		<Platform
-			Name="Win32"
-		/>
-		<Platform
-			Name="x64"
-		/>
-	</Platforms>
-	<ToolFiles>
-		<DefaultToolFile
-			FileName="masm.rules"
-		/>
-	</ToolFiles>
-	<Configurations>
-		<Configuration
-			Name="Debug|Win32"
-			OutputDirectory=".\..\..\..\bin\win32\Debug"
-			IntermediateDirectory=".\..\..\..\obj\vp\Debug"
-			ConfigurationType="2"
-			CharacterSet="1"
-			WholeProgramOptimization="0"
-			>
-			<Tool
-				Name="VCPreBuildEventTool"
-			/>
-			<Tool
-				Name="VCCustomBuildTool"
-				CommandLine=""
-			/>
-			<Tool
-				Name="MASM"
-			/>
-			<Tool
-				Name="VCXMLDataGeneratorTool"
-			/>
-			<Tool
-				Name="VCWebServiceProxyGeneratorTool"
-			/>
-			<Tool
-				Name="VCMIDLTool"
-			/>
-			<Tool
-				Name="VCCLCompilerTool"
-				Optimization="0"
-				AdditionalIncludeDirectories=""
-				PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
-				MinimalRebuild="true"
-				BasicRuntimeChecks="3"
-				RuntimeLibrary="1"
-				UsePrecompiledHeader="0"
-				AssemblerListingLocation=""
-				WarningLevel="3"
-				DebugInformationFormat="4"
-			/>
-			<Tool
-				Name="VCManagedResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCPreLinkEventTool"
-			/>
-			<Tool
-				Name="VCLinkerTool"
-				LinkLibraryDependencies="true"
-				OutputFile="$(OutDir)\welsvp.dll"
-				LinkIncremental="2"
-				ModuleDefinitionFile="../../src/common/WelsVP.def"
-				GenerateDebugInformation="true"
-				GenerateMapFile="true"
-				MapFileName="$(OutDir)\welsvp.map"
-				SubSystem="2"
-				TargetMachine="1"
-			/>
-			<Tool
-				Name="VCALinkTool"
-			/>
-			<Tool
-				Name="VCManifestTool"
-			/>
-			<Tool
-				Name="VCXDCMakeTool"
-			/>
-			<Tool
-				Name="VCBscMakeTool"
-			/>
-			<Tool
-				Name="VCFxCopTool"
-			/>
-			<Tool
-				Name="VCAppVerifierTool"
-			/>
-			<Tool
-				Name="VCPostBuildEventTool"
-				CommandLine=""
-			/>
-		</Configuration>
-		<Configuration
-			Name="Release|Win32"
-			OutputDirectory=".\..\..\..\bin\win32\Release"
-			IntermediateDirectory=".\..\..\..\obj\vp\Release"
-			ConfigurationType="2"
-			CharacterSet="1"
-			WholeProgramOptimization="1"
-			>
-			<Tool
-				Name="VCPreBuildEventTool"
-				CommandLine=""
-			/>
-			<Tool
-				Name="VCCustomBuildTool"
-				CommandLine=""
-			/>
-			<Tool
-				Name="MASM"
-			/>
-			<Tool
-				Name="VCXMLDataGeneratorTool"
-			/>
-			<Tool
-				Name="VCWebServiceProxyGeneratorTool"
-			/>
-			<Tool
-				Name="VCMIDLTool"
-			/>
-			<Tool
-				Name="VCCLCompilerTool"
-				Optimization="3"
-				EnableIntrinsicFunctions="false"
-				FavorSizeOrSpeed="1"
-				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
-				RuntimeLibrary="0"
-				EnableFunctionLevelLinking="false"
-				UsePrecompiledHeader="0"
-				WarningLevel="3"
-				DebugInformationFormat="0"
-			/>
-			<Tool
-				Name="VCManagedResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCPreLinkEventTool"
-			/>
-			<Tool
-				Name="VCLinkerTool"
-				OutputFile="$(OutDir)\welsvp.dll"
-				LinkIncremental="1"
-				GenerateManifest="false"
-				EnableUAC="false"
-				ModuleDefinitionFile="../../src/common/WelsVP.def"
-				GenerateDebugInformation="false"
-				GenerateMapFile="false"
-				MapFileName=""
-				MapExports="false"
-				SubSystem="2"
-				OptimizeReferences="2"
-				EnableCOMDATFolding="2"
-				TargetMachine="1"
-			/>
-			<Tool
-				Name="VCALinkTool"
-			/>
-			<Tool
-				Name="VCManifestTool"
-			/>
-			<Tool
-				Name="VCXDCMakeTool"
-			/>
-			<Tool
-				Name="VCBscMakeTool"
-			/>
-			<Tool
-				Name="VCFxCopTool"
-			/>
-			<Tool
-				Name="VCAppVerifierTool"
-			/>
-			<Tool
-				Name="VCPostBuildEventTool"
-				CommandLine=""
-			/>
-		</Configuration>
-		<Configuration
-			Name="Debug|x64"
-			OutputDirectory=".\..\..\..\bin\win32\Debug"
-			IntermediateDirectory=".\..\..\..\obj\vp\Debug"
-			ConfigurationType="2"
-			CharacterSet="1"
-			WholeProgramOptimization="0"
-			>
-			<Tool
-				Name="VCPreBuildEventTool"
-			/>
-			<Tool
-				Name="VCCustomBuildTool"
-				CommandLine=""
-			/>
-			<Tool
-				Name="MASM"
-			/>
-			<Tool
-				Name="VCXMLDataGeneratorTool"
-			/>
-			<Tool
-				Name="VCWebServiceProxyGeneratorTool"
-			/>
-			<Tool
-				Name="VCMIDLTool"
-				TargetEnvironment="3"
-			/>
-			<Tool
-				Name="VCCLCompilerTool"
-				Optimization="0"
-				AdditionalIncludeDirectories=""
-				PreprocessorDefinitions="WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS"
-				MinimalRebuild="true"
-				BasicRuntimeChecks="3"
-				RuntimeLibrary="1"
-				UsePrecompiledHeader="0"
-				AssemblerListingLocation=""
-				WarningLevel="3"
-				DebugInformationFormat="3"
-			/>
-			<Tool
-				Name="VCManagedResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCPreLinkEventTool"
-			/>
-			<Tool
-				Name="VCLinkerTool"
-				LinkLibraryDependencies="true"
-				OutputFile="$(OutDir)\welsvp.dll"
-				LinkIncremental="2"
-				ModuleDefinitionFile="../../src/common/WelsVP.def"
-				GenerateDebugInformation="true"
-				GenerateMapFile="true"
-				MapFileName="$(OutDir)\welsvp.map"
-				SubSystem="2"
-				TargetMachine="17"
-			/>
-			<Tool
-				Name="VCALinkTool"
-			/>
-			<Tool
-				Name="VCManifestTool"
-			/>
-			<Tool
-				Name="VCXDCMakeTool"
-			/>
-			<Tool
-				Name="VCBscMakeTool"
-			/>
-			<Tool
-				Name="VCFxCopTool"
-			/>
-			<Tool
-				Name="VCAppVerifierTool"
-			/>
-			<Tool
-				Name="VCPostBuildEventTool"
-				CommandLine=""
-			/>
-		</Configuration>
-		<Configuration
-			Name="Release|x64"
-			OutputDirectory=".\..\..\..\bin\win64\Release"
-			IntermediateDirectory=".\..\..\..\obj\vp\Release"
-			ConfigurationType="2"
-			CharacterSet="1"
-			WholeProgramOptimization="1"
-			>
-			<Tool
-				Name="VCPreBuildEventTool"
-				CommandLine=""
-			/>
-			<Tool
-				Name="VCCustomBuildTool"
-				CommandLine=""
-			/>
-			<Tool
-				Name="MASM"
-			/>
-			<Tool
-				Name="VCXMLDataGeneratorTool"
-			/>
-			<Tool
-				Name="VCWebServiceProxyGeneratorTool"
-			/>
-			<Tool
-				Name="VCMIDLTool"
-				TargetEnvironment="3"
-			/>
-			<Tool
-				Name="VCCLCompilerTool"
-				Optimization="3"
-				EnableIntrinsicFunctions="false"
-				FavorSizeOrSpeed="1"
-				PreprocessorDefinitions="WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS"
-				RuntimeLibrary="0"
-				EnableFunctionLevelLinking="false"
-				UsePrecompiledHeader="0"
-				WarningLevel="3"
-				DebugInformationFormat="0"
-			/>
-			<Tool
-				Name="VCManagedResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCPreLinkEventTool"
-			/>
-			<Tool
-				Name="VCLinkerTool"
-				OutputFile="$(OutDir)\welsvp.dll"
-				LinkIncremental="1"
-				GenerateManifest="false"
-				EnableUAC="false"
-				ModuleDefinitionFile="../../src/common/WelsVP.def"
-				GenerateDebugInformation="false"
-				GenerateMapFile="false"
-				MapFileName=""
-				MapExports="false"
-				SubSystem="2"
-				OptimizeReferences="2"
-				EnableCOMDATFolding="2"
-				TargetMachine="17"
-			/>
-			<Tool
-				Name="VCALinkTool"
-			/>
-			<Tool
-				Name="VCManifestTool"
-			/>
-			<Tool
-				Name="VCXDCMakeTool"
-			/>
-			<Tool
-				Name="VCBscMakeTool"
-			/>
-			<Tool
-				Name="VCFxCopTool"
-			/>
-			<Tool
-				Name="VCAppVerifierTool"
-			/>
-			<Tool
-				Name="VCPostBuildEventTool"
-				CommandLine=""
-			/>
-		</Configuration>
-	</Configurations>
-	<References>
-	</References>
-	<Files>
-		<Filter
-			Name="Source Files"
-			Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
-			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
-			>
-			<File
-				RelativePath="..\..\src\common\cpu.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\common\memory.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\common\thread.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\common\util.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\common\WelsFrameWork.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\common\WelsFrameWorkEx.cpp"
-				>
-			</File>
-		</Filter>
-		<Filter
-			Name="Interface"
-			Filter="h;hpp;hxx;hm;inl;inc;xsd"
-			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
-			>
-			<File
-				RelativePath="..\..\interface\IWelsVP.h"
-				>
-				<FileConfiguration
-					Name="Release|Win32"
-					>
-					<Tool
-						Name="VCCLCompilerTool"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|x64"
-					>
-					<Tool
-						Name="VCCLCompilerTool"
-					/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\src\common\resource.h"
-				>
-			</File>
-		</Filter>
-		<Filter
-			Name="Resource Files"
-			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
-			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
-			>
-			<File
-				RelativePath="..\..\src\common\WelsVP.def"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\common\WelsVP.rc"
-				>
-			</File>
-		</Filter>
-		<Filter
-			Name="Header Files"
-			>
-			<File
-				RelativePath="..\..\src\common\cpu.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\common\memory.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\common\thread.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\common\typedef.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\common\util.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\common\version.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\common\WelsFrameWork.h"
-				>
-			</File>
-		</Filter>
-		<Filter
-			Name="ASM"
-			>
-			<File
-				RelativePath="..\..\src\asm\asm_inc.asm"
-				>
-				<FileConfiguration
-					Name="Debug|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\src\asm\cpuid.asm"
-				>
-				<FileConfiguration
-					Name="Debug|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\src\asm\denoisefilter.asm"
-				>
-				<FileConfiguration
-					Name="Debug|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm   -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm   -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\src\asm\downsample_bilinear.asm"
-				>
-				<FileConfiguration
-					Name="Debug|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\src\asm\intra_pred.asm"
-				>
-				<FileConfiguration
-					Name="Debug|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\src\asm\sad.asm"
-				>
-				<FileConfiguration
-					Name="Debug|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\src\asm\vaa.asm"
-				>
-				<FileConfiguration
-					Name="Debug|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|Win32"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Release|x64"
-					ExcludedFromBuild="true"
-					>
-					<Tool
-						Name="VCCustomBuildTool"
-						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
-						Outputs="$(IntDir)\$(InputName).obj"
-					/>
-				</FileConfiguration>
-			</File>
-		</Filter>
-		<Filter
-			Name="SceneChangeDetection"
-			>
-			<File
-				RelativePath="..\..\src\scenechangedetection\SceneChangeDetection.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\scenechangedetection\SceneChangeDetection.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h"
-				>
-			</File>
-		</Filter>
-		<Filter
-			Name="Denoise"
-			>
-			<File
-				RelativePath="..\..\src\denoise\denoise.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\denoise\denoise.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\denoise\denoise_filter.cpp"
-				>
-			</File>
-		</Filter>
-		<Filter
-			Name="VAACalc"
-			>
-			<File
-				RelativePath="..\..\src\vaacalc\vaacalcfuncs.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\vaacalc\vaacalculation.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\vaacalc\vaacalculation.h"
-				>
-			</File>
-		</Filter>
-		<Filter
-			Name="BackgroundDetection"
-			>
-			<File
-				RelativePath="..\..\src\backgounddetection\BackgroundDetection.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\backgounddetection\BackgroundDetection.h"
-				>
-			</File>
-		</Filter>
-		<Filter
-			Name="AdaptiveQuantization"
-			>
-			<File
-				RelativePath="..\..\src\adaptivequantization\AdaptiveQuantization.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\adaptivequantization\AdaptiveQuantization.h"
-				>
-			</File>
-		</Filter>
-		<Filter
-			Name="Downsample"
-			>
-			<File
-				RelativePath="..\..\src\downsample\downsample.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\downsample\downsample.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\downsample\downsamplefuncs.cpp"
-				>
-			</File>
-		</Filter>
-		<Filter
-			Name="ComplexityAnalysis"
-			>
-			<File
-				RelativePath="..\..\src\complexityanalysis\ComplexityAnalysis.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\complexityanalysis\ComplexityAnalysis.h"
-				>
-			</File>
-		</Filter>
-		<Filter
-			Name="ImageRotate"
-			>
-			<File
-				RelativePath="..\..\src\imagerotate\imagerotate.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\imagerotate\imagerotate.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\imagerotate\imagerotatefuncs.cpp"
-				>
-			</File>
-		</Filter>
-	</Files>
-	<Globals>
-	</Globals>
-</VisualStudioProject>
--- a/processing/build/win32/WelsVP_2010.sln
+++ /dev/null
@@ -1,20 +1,0 @@
-
-Microsoft Visual Studio Solution File, Format Version 11.00
-# Visual Studio 2010
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP_2010", "WelsVP_2010.vcxproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|Win32 = Debug|Win32
-		Release|Win32 = Release|Win32
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
-		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
-		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
-		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
-	EndGlobalSection
-	GlobalSection(SolutionProperties) = preSolution
-		HideSolutionNode = FALSE
-	EndGlobalSection
-EndGlobal
binary files a/processing/build/win32/WelsVP_2010.suo /dev/null differ
--- a/processing/build/win32/WelsVP_2010.vcxproj
+++ /dev/null
@@ -1,435 +1,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|Win32">
-      <Configuration>Debug</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|Win32">
-      <Configuration>Release</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|x64">
-      <Configuration>Release</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <PropertyGroup Label="Globals">
-    <ProjectGuid>{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}</ProjectGuid>
-    <RootNamespace>WelsVP</RootNamespace>
-    <Keyword>Win32Proj</Keyword>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <CharacterSet>Unicode</CharacterSet>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <CharacterSet>Unicode</CharacterSet>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <CharacterSet>Unicode</CharacterSet>
-    <WholeProgramOptimization>false</WholeProgramOptimization>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <CharacterSet>Unicode</CharacterSet>
-    <WholeProgramOptimization>false</WholeProgramOptimization>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.props" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup>
-    <_ProjectFileVersion>10.0.40219.1</_ProjectFileVersion>
-    <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\..\..\..\bin\win32\Debug\</OutDir>
-    <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">.\..\..\..\bin\win64\Debug\</OutDir>
-    <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\..\..\..\obj\vp\Debug\</IntDir>
-    <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">.\..\..\..\obj\vp\Debug\</IntDir>
-    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
-    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</LinkIncremental>
-    <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\..\..\..\bin\win32\Release\</OutDir>
-    <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">.\..\..\..\bin\win64\Release\</OutDir>
-    <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\..\..\..\obj\vp\Release\</IntDir>
-    <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">.\..\..\..\obj\vp\Release\</IntDir>
-    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
-    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</LinkIncremental>
-    <GenerateManifest Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</GenerateManifest>
-    <GenerateManifest Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</GenerateManifest>
-    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
-    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
-    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
-    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
-    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
-    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
-    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
-    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
-    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
-    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
-    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
-    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
-    <TargetName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">welsvp</TargetName>
-    <TargetName Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">welsvp</TargetName>
-    <TargetName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">welsvp</TargetName>
-    <TargetName Condition="'$(Configuration)|$(Platform)'=='Release|x64'">welsvp</TargetName>
-  </PropertyGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <CustomBuildStep>
-      <Command>
-      </Command>
-    </CustomBuildStep>
-    <ClCompile>
-      <Optimization>Disabled</Optimization>
-      <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <MinimalRebuild>true</MinimalRebuild>
-      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
-      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
-      <PrecompiledHeader>
-      </PrecompiledHeader>
-      <AssemblerListingLocation>
-      </AssemblerListingLocation>
-      <WarningLevel>Level3</WarningLevel>
-      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
-    </ClCompile>
-    <ProjectReference>
-      <LinkLibraryDependencies>true</LinkLibraryDependencies>
-    </ProjectReference>
-    <Link>
-      <OutputFile>$(OutDir)welsvp.dll</OutputFile>
-      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <GenerateMapFile>true</GenerateMapFile>
-      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
-      <SubSystem>Windows</SubSystem>
-      <TargetMachine>MachineX86</TargetMachine>
-      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
-      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
-      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
-    </Link>
-    <PostBuildEvent>
-      <Command>
-      </Command>
-    </PostBuildEvent>
-    <Bscmake>
-      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
-    </Bscmake>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <CustomBuildStep>
-      <Command>
-      </Command>
-    </CustomBuildStep>
-    <ClCompile>
-      <Optimization>Disabled</Optimization>
-      <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <PreprocessorDefinitions>WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
-      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
-      <PrecompiledHeader>
-      </PrecompiledHeader>
-      <AssemblerListingLocation>
-      </AssemblerListingLocation>
-      <WarningLevel>Level3</WarningLevel>
-      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
-    </ClCompile>
-    <ProjectReference>
-      <LinkLibraryDependencies>true</LinkLibraryDependencies>
-    </ProjectReference>
-    <Link>
-      <OutputFile>$(OutDir)welsvp.dll</OutputFile>
-      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <GenerateMapFile>true</GenerateMapFile>
-      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
-      <SubSystem>Windows</SubSystem>
-      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
-      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
-      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
-    </Link>
-    <PostBuildEvent>
-      <Command>
-      </Command>
-    </PostBuildEvent>
-    <Bscmake>
-      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
-    </Bscmake>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <PreBuildEvent>
-      <Command>
-      </Command>
-    </PreBuildEvent>
-    <CustomBuildStep>
-      <Command>
-      </Command>
-    </CustomBuildStep>
-    <ClCompile>
-      <Optimization>Full</Optimization>
-      <IntrinsicFunctions>false</IntrinsicFunctions>
-      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
-      <FunctionLevelLinking>false</FunctionLevelLinking>
-      <PrecompiledHeader>
-      </PrecompiledHeader>
-      <WarningLevel>Level3</WarningLevel>
-      <DebugInformationFormat>
-      </DebugInformationFormat>
-    </ClCompile>
-    <Link>
-      <OutputFile>$(OutDir)welsvp.dll</OutputFile>
-      <EnableUAC>false</EnableUAC>
-      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
-      <GenerateDebugInformation>false</GenerateDebugInformation>
-      <GenerateMapFile>true</GenerateMapFile>
-      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
-      <MapExports>true</MapExports>
-      <SubSystem>Windows</SubSystem>
-      <OptimizeReferences>true</OptimizeReferences>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <TargetMachine>MachineX86</TargetMachine>
-      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
-      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
-      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
-    </Link>
-    <PostBuildEvent>
-      <Command>
-      </Command>
-    </PostBuildEvent>
-    <Bscmake>
-      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
-    </Bscmake>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <PreBuildEvent>
-      <Command>
-      </Command>
-    </PreBuildEvent>
-    <CustomBuildStep>
-      <Command>
-      </Command>
-    </CustomBuildStep>
-    <ClCompile>
-      <Optimization>Full</Optimization>
-      <IntrinsicFunctions>false</IntrinsicFunctions>
-      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
-      <PreprocessorDefinitions>WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
-      <FunctionLevelLinking>false</FunctionLevelLinking>
-      <PrecompiledHeader>
-      </PrecompiledHeader>
-      <WarningLevel>Level3</WarningLevel>
-      <DebugInformationFormat>
-      </DebugInformationFormat>
-    </ClCompile>
-    <Link>
-      <OutputFile>$(OutDir)welsvp.dll</OutputFile>
-      <EnableUAC>false</EnableUAC>
-      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
-      <GenerateDebugInformation>false</GenerateDebugInformation>
-      <GenerateMapFile>true</GenerateMapFile>
-      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
-      <MapExports>true</MapExports>
-      <SubSystem>Windows</SubSystem>
-      <OptimizeReferences>true</OptimizeReferences>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
-      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
-      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
-    </Link>
-    <PostBuildEvent>
-      <Command>
-      </Command>
-    </PostBuildEvent>
-    <Bscmake>
-      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
-    </Bscmake>
-  </ItemDefinitionGroup>
-  <ItemGroup>
-    <ClCompile Include="..\..\src\common\cpu.cpp" />
-    <ClCompile Include="..\..\src\common\memory.cpp" />
-    <ClCompile Include="..\..\src\common\thread.cpp" />
-    <ClCompile Include="..\..\src\common\util.cpp" />
-    <ClCompile Include="..\..\src\common\WelsFrameWork.cpp" />
-    <ClCompile Include="..\..\src\common\WelsFrameWorkEx.cpp" />
-    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetection.cpp" />
-    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp" />
-    <ClCompile Include="..\..\src\denoise\denoise.cpp" />
-    <ClCompile Include="..\..\src\denoise\denoise_filter.cpp" />
-    <ClCompile Include="..\..\src\vaacalc\vaacalcfuncs.cpp" />
-    <ClCompile Include="..\..\src\vaacalc\vaacalculation.cpp" />
-    <ClCompile Include="..\..\src\backgounddetection\BackgroundDetection.cpp" />
-    <ClCompile Include="..\..\src\adaptivequantization\AdaptiveQuantization.cpp" />
-    <ClCompile Include="..\..\src\downsample\downsample.cpp" />
-    <ClCompile Include="..\..\src\downsample\downsamplefuncs.cpp" />
-    <ClCompile Include="..\..\src\complexityanalysis\ComplexityAnalysis.cpp" />
-    <ClCompile Include="..\..\src\imagerotate\imagerotate.cpp" />
-    <ClCompile Include="..\..\src\imagerotate\imagerotatefuncs.cpp" />
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="..\..\interface\IWelsVP.h" />
-    <ClInclude Include="..\..\src\common\resource.h" />
-    <ClInclude Include="..\..\src\common\cpu.h" />
-    <ClInclude Include="..\..\src\common\memory.h" />
-    <ClInclude Include="..\..\src\common\thread.h" />
-    <ClInclude Include="..\..\src\common\typedef.h" />
-    <ClInclude Include="..\..\src\common\util.h" />
-    <ClInclude Include="..\..\src\common\version.h" />
-    <ClInclude Include="..\..\src\common\WelsFrameWork.h" />
-    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetection.h" />
-    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h" />
-    <ClInclude Include="..\..\src\denoise\denoise.h" />
-    <ClInclude Include="..\..\src\vaacalc\vaacalculation.h" />
-    <ClInclude Include="..\..\src\backgounddetection\BackgroundDetection.h" />
-    <ClInclude Include="..\..\src\adaptivequantization\AdaptiveQuantization.h" />
-    <ClInclude Include="..\..\src\downsample\downsample.h" />
-    <ClInclude Include="..\..\src\complexityanalysis\ComplexityAnalysis.h" />
-    <ClInclude Include="..\..\src\imagerotate\imagerotate.h" />
-  </ItemGroup>
-  <ItemGroup>
-    <None Include="..\..\src\common\WelsVP.def" />
-  </ItemGroup>
-  <ItemGroup>
-    <ResourceCompile Include="..\..\src\common\WelsVP.rc" />
-  </ItemGroup>
-  <ItemGroup>
-    <CustomBuild Include="..\..\src\asm\asm_inc.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\cpuid.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\denoisefilter.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm   -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm   -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\downsample_bilinear.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\intra_pred.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\sad.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\vaa.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
-  </ItemGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.targets" />
-  </ImportGroup>
-</Project>
\ No newline at end of file
--- a/processing/build/win32/WelsVP_2010.vcxproj.filters
+++ /dev/null
@@ -1,165 +1,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup>
-    <ClCompile Include="..\..\interface\IWelsVP.h">
-      <Filter>headers</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\adaptivequantization\AdaptiveQuantization.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\common\util.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\backgounddetection\BackgroundDetection.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\complexityanalysis\ComplexityAnalysis.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\common\cpu.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\denoise\denoise.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\denoise\denoise_filter.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\downsample\downsample.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\downsample\downsamplefuncs.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\imagerotate\imagerotate.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\imagerotate\imagerotatefuncs.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\common\memory.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetection.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\common\thread.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\common\WelsFrameWorkEx.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\vaacalc\vaacalcfuncs.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\vaacalc\vaacalculation.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\common\WelsFrameWork.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-  </ItemGroup>
-  <ItemGroup>
-    <ClInclude Include="..\..\src\adaptivequantization\AdaptiveQuantization.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\backgounddetection\BackgroundDetection.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\complexityanalysis\ComplexityAnalysis.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\common\cpu.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\denoise\denoise.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\downsample\downsample.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\imagerotate\imagerotate.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\common\memory.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\common\resource.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetection.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\common\thread.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\common\typedef.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\common\util.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\vaacalc\vaacalculation.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\common\version.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\common\WelsFrameWork.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-  </ItemGroup>
-  <ItemGroup>
-    <CustomBuild Include="..\..\src\asm\asm_inc.asm">
-      <Filter>ASM</Filter>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\cpuid.asm">
-      <Filter>ASM</Filter>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\denoisefilter.asm">
-      <Filter>ASM</Filter>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\downsample_bilinear.asm">
-      <Filter>ASM</Filter>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\intra_pred.asm">
-      <Filter>ASM</Filter>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\sad.asm">
-      <Filter>ASM</Filter>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\vaa.asm">
-      <Filter>ASM</Filter>
-    </CustomBuild>
-  </ItemGroup>
-  <ItemGroup>
-    <Filter Include="ASM">
-      <UniqueIdentifier>{ecef07b7-65e1-45c4-9afc-39f7b07992a2}</UniqueIdentifier>
-    </Filter>
-    <Filter Include="headers">
-      <UniqueIdentifier>{be24742a-75fa-49a4-b77e-a69d626d46c8}</UniqueIdentifier>
-    </Filter>
-    <Filter Include="sources">
-      <UniqueIdentifier>{9f4c2bd3-e8d2-4276-adc6-273c0031971a}</UniqueIdentifier>
-    </Filter>
-    <Filter Include="resources">
-      <UniqueIdentifier>{322f1cbe-435f-402b-8d86-71d023d5d407}</UniqueIdentifier>
-    </Filter>
-  </ItemGroup>
-  <ItemGroup>
-    <None Include="..\..\src\common\WelsVP.def">
-      <Filter>resources</Filter>
-    </None>
-  </ItemGroup>
-  <ItemGroup>
-    <ResourceCompile Include="..\..\src\common\WelsVP.rc">
-      <Filter>resources</Filter>
-    </ResourceCompile>
-  </ItemGroup>
-</Project>
\ No newline at end of file
--- a/processing/build/win32/WelsVP_2012.sln
+++ /dev/null
@@ -1,20 +1,0 @@
-
-Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio 2012
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP_2012", "WelsVP_2012.vcxproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|Win32 = Debug|Win32
-		Release|Win32 = Release|Win32
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
-		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
-		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
-		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
-	EndGlobalSection
-	GlobalSection(SolutionProperties) = preSolution
-		HideSolutionNode = FALSE
-	EndGlobalSection
-EndGlobal
binary files a/processing/build/win32/WelsVP_2012.v11.suo /dev/null differ
--- a/processing/build/win32/WelsVP_2012.vcxproj
+++ /dev/null
@@ -1,427 +1,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|Win32">
-      <Configuration>Debug</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|Win32">
-      <Configuration>Release</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|x64">
-      <Configuration>Release</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <PropertyGroup Label="Globals">
-    <ProjectGuid>{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}</ProjectGuid>
-    <RootNamespace>WelsVP</RootNamespace>
-    <Keyword>Win32Proj</Keyword>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <PlatformToolset>v110</PlatformToolset>
-    <CharacterSet>Unicode</CharacterSet>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <PlatformToolset>v110</PlatformToolset>
-    <CharacterSet>Unicode</CharacterSet>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <PlatformToolset>v110</PlatformToolset>
-    <CharacterSet>Unicode</CharacterSet>
-    <WholeProgramOptimization>false</WholeProgramOptimization>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <PlatformToolset>v110</PlatformToolset>
-    <CharacterSet>Unicode</CharacterSet>
-    <WholeProgramOptimization>false</WholeProgramOptimization>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.props" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup>
-    <_ProjectFileVersion>11.0.61030.0</_ProjectFileVersion>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <OutDir>.\..\..\..\bin\win32\Debug\</OutDir>
-    <IntDir>.\..\..\..\obj\vp\Debug\</IntDir>
-    <LinkIncremental>true</LinkIncremental>
-    <TargetName>welsvp</TargetName>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <LinkIncremental>true</LinkIncremental>
-    <TargetName>welsvp</TargetName>
-    <OutDir>.\..\..\..\bin\win64\Debug\</OutDir>
-    <IntDir>.\..\..\..\obj\vp\Debug\</IntDir>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <OutDir>.\..\..\..\bin\win32\Release\</OutDir>
-    <IntDir>.\..\..\..\obj\vp\Release\</IntDir>
-    <LinkIncremental>false</LinkIncremental>
-    <GenerateManifest>false</GenerateManifest>
-    <TargetName>welsvp</TargetName>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <LinkIncremental>false</LinkIncremental>
-    <GenerateManifest>false</GenerateManifest>
-    <TargetName>welsvp</TargetName>
-    <OutDir>.\..\..\..\bin\win64\Release\</OutDir>
-    <IntDir>.\..\..\..\obj\vp\Release\</IntDir>
-  </PropertyGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <CustomBuildStep>
-      <Command />
-    </CustomBuildStep>
-    <ClCompile>
-      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <MinimalRebuild>true</MinimalRebuild>
-      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
-      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
-      <PrecompiledHeader />
-      <AssemblerListingLocation />
-      <WarningLevel>Level3</WarningLevel>
-      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
-    </ClCompile>
-    <ProjectReference>
-      <LinkLibraryDependencies>true</LinkLibraryDependencies>
-    </ProjectReference>
-    <Link>
-      <OutputFile>$(OutDir)\welsvp.dll</OutputFile>
-      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <GenerateMapFile>true</GenerateMapFile>
-      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
-      <SubSystem>Windows</SubSystem>
-      <TargetMachine>MachineX86</TargetMachine>
-      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
-      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
-      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
-    </Link>
-    <PostBuildEvent>
-      <Command>
-      </Command>
-    </PostBuildEvent>
-    <Bscmake>
-      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
-    </Bscmake>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <CustomBuildStep>
-      <Command>
-      </Command>
-    </CustomBuildStep>
-    <ClCompile>
-      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
-      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
-      <PrecompiledHeader>
-      </PrecompiledHeader>
-      <AssemblerListingLocation>
-      </AssemblerListingLocation>
-      <WarningLevel>Level3</WarningLevel>
-      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
-    </ClCompile>
-    <ProjectReference>
-      <LinkLibraryDependencies>true</LinkLibraryDependencies>
-    </ProjectReference>
-    <Link>
-      <OutputFile>$(OutDir)\welsvp.dll</OutputFile>
-      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <GenerateMapFile>true</GenerateMapFile>
-      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
-      <SubSystem>Windows</SubSystem>
-      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
-      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
-      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
-    </Link>
-    <PostBuildEvent>
-      <Command>
-      </Command>
-    </PostBuildEvent>
-    <Bscmake>
-      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
-    </Bscmake>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <PreBuildEvent>
-      <Command>
-      </Command>
-    </PreBuildEvent>
-    <CustomBuildStep>
-      <Command />
-    </CustomBuildStep>
-    <ClCompile>
-      <Optimization>Full</Optimization>
-      <IntrinsicFunctions>false</IntrinsicFunctions>
-      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
-      <FunctionLevelLinking>false</FunctionLevelLinking>
-      <PrecompiledHeader />
-      <WarningLevel>Level3</WarningLevel>
-      <DebugInformationFormat />
-    </ClCompile>
-    <Link>
-      <OutputFile>$(OutDir)\welsvp.dll</OutputFile>
-      <EnableUAC>false</EnableUAC>
-      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
-      <GenerateDebugInformation>false</GenerateDebugInformation>
-      <GenerateMapFile>true</GenerateMapFile>
-      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
-      <MapExports>true</MapExports>
-      <SubSystem>Windows</SubSystem>
-      <OptimizeReferences>true</OptimizeReferences>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <TargetMachine>MachineX86</TargetMachine>
-      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
-      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
-      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
-    </Link>
-    <PostBuildEvent>
-      <Command>
-      </Command>
-    </PostBuildEvent>
-    <Bscmake>
-      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
-    </Bscmake>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <PreBuildEvent>
-      <Command>
-      </Command>
-    </PreBuildEvent>
-    <CustomBuildStep>
-      <Command>
-      </Command>
-    </CustomBuildStep>
-    <ClCompile>
-      <Optimization>Full</Optimization>
-      <IntrinsicFunctions>false</IntrinsicFunctions>
-      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
-      <PreprocessorDefinitions>WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
-      <FunctionLevelLinking>false</FunctionLevelLinking>
-      <PrecompiledHeader>
-      </PrecompiledHeader>
-      <WarningLevel>Level3</WarningLevel>
-      <DebugInformationFormat>
-      </DebugInformationFormat>
-    </ClCompile>
-    <Link>
-      <OutputFile>$(OutDir)\welsvp.dll</OutputFile>
-      <EnableUAC>false</EnableUAC>
-      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
-      <GenerateDebugInformation>false</GenerateDebugInformation>
-      <GenerateMapFile>true</GenerateMapFile>
-      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
-      <MapExports>true</MapExports>
-      <SubSystem>Windows</SubSystem>
-      <OptimizeReferences>true</OptimizeReferences>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
-      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
-      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
-    </Link>
-    <PostBuildEvent>
-      <Command>
-      </Command>
-    </PostBuildEvent>
-    <Bscmake>
-      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
-    </Bscmake>
-  </ItemDefinitionGroup>
-  <ItemGroup>
-    <ClCompile Include="..\..\src\common\cpu.cpp" />
-    <ClCompile Include="..\..\src\common\memory.cpp" />
-    <ClCompile Include="..\..\src\common\thread.cpp" />
-    <ClCompile Include="..\..\src\common\util.cpp" />
-    <ClCompile Include="..\..\src\common\WelsFrameWork.cpp" />
-    <ClCompile Include="..\..\src\common\WelsFrameWorkEx.cpp" />
-    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetection.cpp" />
-    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp" />
-    <ClCompile Include="..\..\src\denoise\denoise.cpp" />
-    <ClCompile Include="..\..\src\denoise\denoise_filter.cpp" />
-    <ClCompile Include="..\..\src\vaacalc\vaacalcfuncs.cpp" />
-    <ClCompile Include="..\..\src\vaacalc\vaacalculation.cpp" />
-    <ClCompile Include="..\..\src\backgounddetection\BackgroundDetection.cpp" />
-    <ClCompile Include="..\..\src\adaptivequantization\AdaptiveQuantization.cpp" />
-    <ClCompile Include="..\..\src\downsample\downsample.cpp" />
-    <ClCompile Include="..\..\src\downsample\downsamplefuncs.cpp" />
-    <ClCompile Include="..\..\src\complexityanalysis\ComplexityAnalysis.cpp" />
-    <ClCompile Include="..\..\src\imagerotate\imagerotate.cpp" />
-    <ClCompile Include="..\..\src\imagerotate\imagerotatefuncs.cpp" />
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="..\..\interface\IWelsVP.h" />
-    <ClInclude Include="..\..\src\common\resource.h" />
-    <ClInclude Include="..\..\src\common\cpu.h" />
-    <ClInclude Include="..\..\src\common\memory.h" />
-    <ClInclude Include="..\..\src\common\thread.h" />
-    <ClInclude Include="..\..\src\common\typedef.h" />
-    <ClInclude Include="..\..\src\common\util.h" />
-    <ClInclude Include="..\..\src\common\version.h" />
-    <ClInclude Include="..\..\src\common\WelsFrameWork.h" />
-    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetection.h" />
-    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h" />
-    <ClInclude Include="..\..\src\denoise\denoise.h" />
-    <ClInclude Include="..\..\src\vaacalc\vaacalculation.h" />
-    <ClInclude Include="..\..\src\backgounddetection\BackgroundDetection.h" />
-    <ClInclude Include="..\..\src\adaptivequantization\AdaptiveQuantization.h" />
-    <ClInclude Include="..\..\src\downsample\downsample.h" />
-    <ClInclude Include="..\..\src\complexityanalysis\ComplexityAnalysis.h" />
-    <ClInclude Include="..\..\src\imagerotate\imagerotate.h" />
-  </ItemGroup>
-  <ItemGroup>
-    <None Include="..\..\src\common\WelsVP.def" />
-  </ItemGroup>
-  <ItemGroup>
-    <ResourceCompile Include="..\..\src\common\WelsVP.rc" />
-  </ItemGroup>
-  <ItemGroup>
-    <CustomBuild Include="..\..\src\asm\asm_inc.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\cpuid.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\denoisefilter.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm   -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm   -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\downsample_bilinear.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\intra_pred.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\sad.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\vaa.asm">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
-    </CustomBuild>
-  </ItemGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.targets" />
-  </ImportGroup>
-</Project>
\ No newline at end of file
--- a/processing/build/win32/WelsVP_2012.vcxproj.filters
+++ /dev/null
@@ -1,165 +1,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup>
-    <ClCompile Include="..\..\interface\IWelsVP.h">
-      <Filter>headers</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\adaptivequantization\AdaptiveQuantization.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\backgounddetection\BackgroundDetection.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\complexityanalysis\ComplexityAnalysis.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\common\cpu.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\denoise\denoise.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\denoise\denoise_filter.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\downsample\downsample.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\downsample\downsamplefuncs.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\imagerotate\imagerotate.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\imagerotate\imagerotatefuncs.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\common\memory.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetection.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\common\thread.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\common\util.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\vaacalc\vaacalcfuncs.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\vaacalc\vaacalculation.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\common\WelsFrameWork.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\src\common\WelsFrameWorkEx.cpp">
-      <Filter>sources</Filter>
-    </ClCompile>
-  </ItemGroup>
-  <ItemGroup>
-    <ClInclude Include="..\..\src\adaptivequantization\AdaptiveQuantization.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\backgounddetection\BackgroundDetection.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\complexityanalysis\ComplexityAnalysis.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\common\cpu.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\denoise\denoise.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\downsample\downsample.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\imagerotate\imagerotate.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\common\memory.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\common\resource.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetection.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\common\thread.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\common\typedef.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\common\util.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\vaacalc\vaacalculation.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\common\version.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\src\common\WelsFrameWork.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-  </ItemGroup>
-  <ItemGroup>
-    <CustomBuild Include="..\..\src\asm\asm_inc.asm">
-      <Filter>ASM</Filter>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\cpuid.asm">
-      <Filter>ASM</Filter>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\denoisefilter.asm">
-      <Filter>ASM</Filter>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\downsample_bilinear.asm">
-      <Filter>ASM</Filter>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\intra_pred.asm">
-      <Filter>ASM</Filter>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\sad.asm">
-      <Filter>ASM</Filter>
-    </CustomBuild>
-    <CustomBuild Include="..\..\src\asm\vaa.asm">
-      <Filter>ASM</Filter>
-    </CustomBuild>
-  </ItemGroup>
-  <ItemGroup>
-    <Filter Include="ASM">
-      <UniqueIdentifier>{18a2a593-cf54-452e-bf69-5eaf9aac6518}</UniqueIdentifier>
-    </Filter>
-    <Filter Include="headers">
-      <UniqueIdentifier>{5a921557-4f54-4838-80de-8c517b1d099b}</UniqueIdentifier>
-    </Filter>
-    <Filter Include="sources">
-      <UniqueIdentifier>{0b628696-109b-4a2e-b11f-5e9e006b76ae}</UniqueIdentifier>
-    </Filter>
-    <Filter Include="resources">
-      <UniqueIdentifier>{94dba5f3-1b39-4ccd-891b-6a70cb59f210}</UniqueIdentifier>
-    </Filter>
-  </ItemGroup>
-  <ItemGroup>
-    <ResourceCompile Include="..\..\src\common\WelsVP.rc">
-      <Filter>resources</Filter>
-    </ResourceCompile>
-  </ItemGroup>
-  <ItemGroup>
-    <None Include="..\..\src\common\WelsVP.def">
-      <Filter>resources</Filter>
-    </None>
-  </ItemGroup>
-</Project>
\ No newline at end of file
--- a/processing/build/win32/WelsVideoProcessor.sln
+++ /dev/null
@@ -1,29 +1,0 @@
-
-Microsoft Visual Studio Solution File, Format Version 10.00
-# Visual Studio 2008
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVideoProcessor", "WelsVideoProcessor.vcproj", "{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}"
-	ProjectSection(ProjectDependencies) = postProject
-		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562} = {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP", "WelsVP.vcproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|Win32 = Debug|Win32
-		Release|Win32 = Release|Win32
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Debug|Win32.ActiveCfg = Debug|Win32
-		{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Debug|Win32.Build.0 = Debug|Win32
-		{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Release|Win32.ActiveCfg = Release|Win32
-		{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Release|Win32.Build.0 = Release|Win32
-		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
-		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
-		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
-		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
-	EndGlobalSection
-	GlobalSection(SolutionProperties) = preSolution
-		HideSolutionNode = FALSE
-	EndGlobalSection
-EndGlobal
--- a/processing/build/win32/WelsVideoProcessor.vcproj
+++ /dev/null
@@ -1,213 +1,0 @@
-<?xml version="1.0" encoding="gb2312"?>
-<VisualStudioProject
-	ProjectType="Visual C++"
-	Version="9.00"
-	Name="WelsVideoProcessor"
-	ProjectGUID="{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}"
-	RootNamespace="WelsVideoProcessor"
-	Keyword="Win32Proj"
-	TargetFrameworkVersion="196613"
-	>
-	<Platforms>
-		<Platform
-			Name="Win32"
-		/>
-	</Platforms>
-	<ToolFiles>
-	</ToolFiles>
-	<Configurations>
-		<Configuration
-			Name="Debug|Win32"
-			OutputDirectory="$(SolutionDir)..\..\bin\$(ConfigurationName)"
-			IntermediateDirectory="$(SolutionDir)..\..\obj\$(ConfigurationName)\$(ProjectName)"
-			ConfigurationType="1"
-			CharacterSet="1"
-			>
-			<Tool
-				Name="VCPreBuildEventTool"
-			/>
-			<Tool
-				Name="VCCustomBuildTool"
-			/>
-			<Tool
-				Name="VCXMLDataGeneratorTool"
-			/>
-			<Tool
-				Name="VCWebServiceProxyGeneratorTool"
-			/>
-			<Tool
-				Name="VCMIDLTool"
-			/>
-			<Tool
-				Name="VCCLCompilerTool"
-				Optimization="0"
-				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
-				MinimalRebuild="true"
-				BasicRuntimeChecks="3"
-				RuntimeLibrary="1"
-				UsePrecompiledHeader="0"
-				WarningLevel="3"
-				DebugInformationFormat="4"
-			/>
-			<Tool
-				Name="VCManagedResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCPreLinkEventTool"
-			/>
-			<Tool
-				Name="VCLinkerTool"
-				LinkIncremental="2"
-				GenerateDebugInformation="true"
-				SubSystem="1"
-				TargetMachine="1"
-			/>
-			<Tool
-				Name="VCALinkTool"
-			/>
-			<Tool
-				Name="VCManifestTool"
-			/>
-			<Tool
-				Name="VCXDCMakeTool"
-			/>
-			<Tool
-				Name="VCBscMakeTool"
-			/>
-			<Tool
-				Name="VCFxCopTool"
-			/>
-			<Tool
-				Name="VCAppVerifierTool"
-			/>
-			<Tool
-				Name="VCPostBuildEventTool"
-			/>
-		</Configuration>
-		<Configuration
-			Name="Release|Win32"
-			OutputDirectory="$(SolutionDir)..\..\bin\$(ConfigurationName)"
-			IntermediateDirectory="$(SolutionDir)..\..\obj\$(ConfigurationName)\$(ProjectName)"
-			ConfigurationType="1"
-			CharacterSet="1"
-			WholeProgramOptimization="1"
-			>
-			<Tool
-				Name="VCPreBuildEventTool"
-			/>
-			<Tool
-				Name="VCCustomBuildTool"
-			/>
-			<Tool
-				Name="VCXMLDataGeneratorTool"
-			/>
-			<Tool
-				Name="VCWebServiceProxyGeneratorTool"
-			/>
-			<Tool
-				Name="VCMIDLTool"
-			/>
-			<Tool
-				Name="VCCLCompilerTool"
-				Optimization="2"
-				EnableIntrinsicFunctions="true"
-				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
-				RuntimeLibrary="0"
-				EnableFunctionLevelLinking="true"
-				UsePrecompiledHeader="0"
-				WarningLevel="3"
-				DebugInformationFormat="3"
-			/>
-			<Tool
-				Name="VCManagedResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCPreLinkEventTool"
-			/>
-			<Tool
-				Name="VCLinkerTool"
-				LinkIncremental="1"
-				GenerateDebugInformation="true"
-				SubSystem="1"
-				OptimizeReferences="2"
-				EnableCOMDATFolding="2"
-				TargetMachine="1"
-			/>
-			<Tool
-				Name="VCALinkTool"
-			/>
-			<Tool
-				Name="VCManifestTool"
-			/>
-			<Tool
-				Name="VCXDCMakeTool"
-			/>
-			<Tool
-				Name="VCBscMakeTool"
-			/>
-			<Tool
-				Name="VCFxCopTool"
-			/>
-			<Tool
-				Name="VCAppVerifierTool"
-			/>
-			<Tool
-				Name="VCPostBuildEventTool"
-			/>
-		</Configuration>
-	</Configurations>
-	<References>
-	</References>
-	<Files>
-		<Filter
-			Name="Source Files"
-			Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
-			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
-			>
-			<File
-				RelativePath="..\..\src\testbed\stdafx.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\testbed\wels_process.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\testbed\WelsVideoProcessor.cpp"
-				>
-			</File>
-		</Filter>
-		<Filter
-			Name="Header Files"
-			Filter="h;hpp;hxx;hm;inl;inc;xsd"
-			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
-			>
-			<File
-				RelativePath="..\..\src\testbed\stdafx.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\testbed\targetver.h"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\testbed\wels_process.h"
-				>
-			</File>
-		</Filter>
-		<Filter
-			Name="Resource Files"
-			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
-			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
-			>
-		</Filter>
-	</Files>
-	<Globals>
-	</Globals>
-</VisualStudioProject>
--- a/processing/interface/IWelsVP.h
+++ /dev/null
@@ -1,286 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2004-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	    :  IWelsVP.h
- *
- * \brief	    :  Interface of wels video processor class
- *
- * \date        :  2011/01/04
- *
- * \description :  1. should support both C/C++ style interface
- *                 2. should concern with the feature extension requirement
- *                 3. should care the usage of "char"==>
- *                     1) value char  : signed char/unsigned char
- *                     2) string char : char
- *
- *************************************************************************************
- */
-
-#ifndef IWELSVP_H_
-#define IWELSVP_H_
-
-#ifdef _WIN32
-#define WELSAPI __stdcall
-#else
-#define WELSAPI
-#endif
-
-#define WELSVP_MAJOR_VERSION   1
-#define WELSVP_MINOR_VERSION   1
-#define WELSVP_VERSION         ((WELSVP_MAJOR_VERSION << 8) + WELSVP_MINOR_VERSION)
-
-typedef enum {
-  RET_SUCCESS          =  0,
-  RET_FAILED           = -1,
-  RET_INVALIDPARAM     = -2,
-  RET_OUTOFMEMORY      = -3,
-  RET_NOTSUPPORTED       = -4,
-  RET_UNEXPECTED       = -5,
-  RET_NEEDREINIT		  = -6
-} EResult;
-
-typedef enum {
-  VIDEO_FORMAT_NULL       = 0,   /* invalid format   */
-  /*rgb color formats*/
-  VIDEO_FORMAT_RGB        = 1,   /* rgb 24bits       */
-  VIDEO_FORMAT_RGBA       = 2,   /* rgba             */
-  VIDEO_FORMAT_RGB555     = 3,   /* rgb555           */
-  VIDEO_FORMAT_RGB565     = 4,   /* rgb565           */
-  VIDEO_FORMAT_BGR        = 5,   /* bgr 24bits       */
-  VIDEO_FORMAT_BGRA       = 6,   /* bgr 32bits       */
-  VIDEO_FORMAT_ABGR       = 7,   /* abgr             */
-  VIDEO_FORMAT_ARGB       = 8,   /* argb             */
-
-  /*yuv color formats*/
-  VIDEO_FORMAT_YUY2       = 20,   /* yuy2             */
-  VIDEO_FORMAT_YVYU       = 21,   /* yvyu             */
-  VIDEO_FORMAT_UYVY       = 22,   /* uyvy             */
-  VIDEO_FORMAT_I420       = 23,   /* yuv 4:2:0 planar */
-  VIDEO_FORMAT_YV12       = 24,   /* yuv 4:2:0 planar */
-  VIDEO_FORMAT_INTERNAL   = 25,   /* Only Used for SVC decoder testbed */
-  VIDEO_FORMAT_NV12		= 26,	/* y planar + uv packed */
-  VIDEO_FORMAT_I422       = 27,   /* yuv 4:2:2 planar */
-  VIDEO_FORMAT_I444       = 28,   /* yuv 4:4:4 planar */
-  VIDEO_FORMAT_YUYV       = 20,   /* yuv 4:2:2 packed */
-
-  VIDEO_FORMAT_RGB24      = 1,
-  VIDEO_FORMAT_RGB32      = 2,
-  VIDEO_FORMAT_RGB24_INV  = 5,
-  VIDEO_FORMAT_RGB32_INV  = 6,
-  VIDEO_FORMAT_RGB555_INV = 7,
-  VIDEO_FORMAT_RGB565_INV = 8,
-  VIDEO_FORMAT_YUV2       = 21,
-  VIDEO_FORMAT_420        = 23,
-
-  VIDEO_FORMAT_VFlip      = 0x80000000
-} EVideoFormat;
-
-typedef enum {
-  BUFFER_HOSTMEM  = 0,
-  BUFFER_SURFACE
-} EPixMapBufferProperty;
-
-typedef struct {
-  int iRectTop;
-  int iRectLeft;
-  int iRectWidth;
-  int iRectHeight;
-} SRect;
-
-typedef struct {
-  void*        pPixel[3];
-  int          iSizeInBits;
-  int          iStride[3];
-  SRect        sRect;
-  EVideoFormat eFormat;
-  EPixMapBufferProperty eProperty;//not use? to remove? but how about the size of SPixMap?
-} SPixMap;
-
-typedef enum {
-  METHOD_NULL              = 0,
-  METHOD_COLORSPACE_CONVERT    ,//not support yet
-  METHOD_DENOISE              ,
-  METHOD_SCENE_CHANGE_DETECTION ,
-  METHOD_DOWNSAMPLE			  ,
-  METHOD_VAA_STATISTICS        ,
-  METHOD_BACKGROUND_DETECTION  ,
-  METHOD_ADAPTIVE_QUANT ,
-  METHOD_COMPLEXITY_ANALYSIS   ,
-  METHOD_IMAGE_ROTATE		  ,
-  METHOD_MASK
-} EMethods;
-
-//-----------------------------------------------------------------//
-//  Algorithm parameters define
-//-----------------------------------------------------------------//
-
-typedef struct {
-  int bSceneChangeFlag; // 0:false ; 1:true
-} SSceneChangeResult;
-
-/* Scene-change classification; enumerators take the default values 0, 1, 2. */
-typedef enum {
-  SIMILAR_SCENE,          //similar scene
-  MEDIUM_CHANGED_SCENE,   //medium changed scene
-  LARGE_CHANGED_SCENE     //large changed scene (no trailing comma: C89/C++03 reject it)
-} ESceneChangeIdc;
-
-typedef struct {
-  unsigned char* pCurY;					// Y data of current frame
-  unsigned char* pRefY;					// Y data of pRef frame for diff calc
-  int (*pSad8x8)[4];				// sad of 8x8, every 4 in the same 16x16 get together
-  int* pSsd16x16;					// sum of square difference of 16x16
-  int* pSum16x16;					// sum of 16x16
-  int* pSumOfSquare16x16;					// sum of square of 16x16
-  int	(*pSumOfDiff8x8)[4];
-  unsigned char (*pMad8x8)[4];
-  int iFrameSad;					// sad of frame
-} SVAACalcResult;
-
-typedef struct {
-  int iCalcVar;
-  int iCalcBgd;
-  int iCalcSsd;
-  int iReserved;
-  SVAACalcResult*	pCalcResult;
-} SVAACalcParam;
-
-typedef struct {
-  signed char*		pBackgroundMbFlag;
-  SVAACalcResult*  pCalcRes;
-} SBGDInterface;
-
-/* Adaptive-quantization operating mode; enumerators take the default values 0 and 1. */
-typedef enum {
-  AQ_QUALITY_MODE,   //Quality mode
-  AQ_BITRATE_MODE    //Bitrate mode (no trailing comma: C89/C++03 reject it)
-} EAQModes;
-
-typedef struct {
-  unsigned short    uiMotionIndex;
-  unsigned short    uiTextureIndex;
-} SMotionTextureUnit;
-
-typedef struct {
-  int					iAdaptiveQuantMode; // 0:quality mode, 1:bitrates mode
-  SVAACalcResult*		pCalcResult;
-  SMotionTextureUnit*  pMotionTextureUnit;
-
-  signed char*			pMotionTextureIndexToDeltaQp;
-  double				dAverMotionTextureIndexToDeltaQp;
-} SAdaptiveQuantizationParam;
-
-typedef enum {
-  FRAME_SAD     =  0,
-  GOM_SAD       = -1,
-  GOM_VAR       = -2
-} EComplexityAnalysisMode;
-
-typedef struct {
-  int  iComplexityAnalysisMode;
-  int  iCalcBgd;
-  int  iMbNumInGom;
-  int  iFrameComplexity;
-  int*  pGomComplexity;
-  int*  pGomForegroundBlockNum;
-  signed char*  pBackgroundMbFlag;
-  unsigned int* uiRefMbType;
-  SVAACalcResult*  pCalcResult;
-} SComplexityAnalysisParam;
-
-/////////////////////////////////////////////////////////////////////////////////////////////
-
-/* C-style interface: a table of function pointers plus a context pointer.
- * Every entry takes a void* context as its first parameter and an int iType
- * as its second; all of them report an EResult. */
-typedef struct {
-  void*    pCtx;  /* presumably the value to hand back as the first argument of the entries below -- confirm */
-  EResult (*Init) (void* pCtx, int iType, void* pCfg);
-  EResult (*Uninit) (void* pCtx, int iType);
-  EResult (*Flush) (void* pCtx, int iType);
-  EResult (*Process) (void* pCtx, int iType, SPixMap* pSrc, SPixMap* dst);
-  EResult (*Get) (void* pCtx, int iType, void* pParam);
-  EResult (*Set) (void* pCtx, int iType, void* pParam);
-  EResult (*SpecialFeature) (void* pCtx, int iType, void* pIn, void* pOut);
-} IWelsVPc;
-
-#if defined(__cplusplus) && !defined(CINTERFACE)  /* C++ style interface */
-
-/* Abstract C++ interface. It declares the same seven operations as the
- * IWelsVPc function table, without the explicit context parameter. */
-class IWelsVP {
- public:
-  /* Virtual so that implementations can be destroyed through an IWelsVP pointer. */
-  virtual ~IWelsVP() {}
-
- public:
-  virtual EResult Init (int iType, void* pCfg) = 0;
-  virtual EResult Uninit (int iType) = 0;
-  virtual EResult Flush (int iType) = 0;
-  virtual EResult Process (int iType, SPixMap* pSrc, SPixMap* dst) = 0;
-  virtual EResult Get (int iType, void* pParam) = 0;
-  virtual EResult Set (int iType, void* pParam) = 0;
-  virtual EResult SpecialFeature (int iType, void* pIn, void* pOut) = 0;
-};
-
-/* It is recommended to invoke the interface via these macros for convenience */
-#define IWelsVPFunc_Init(p, a, b)                  (p)->Init(a, b)
-#define IWelsVPFunc_Uninit(p, a)                   (p)->Uninit(a)
-#define IWelsVPFunc_Flush(p, a)                    (p)->Flush(a)
-#define IWelsVPFunc_Process(p, a, b, c)            (p)->Process(a, b, c)
-#define IWelsVPFunc_Get(p, a, b)                   (p)->Get(a, b)
-#define IWelsVPFunc_Set(p, a, b)                   (p)->Set(a, b)
-#define IWelsVPFunc_SpecialFeature(p, a, b, c)     (p)->SpecialFeature(a, b, c)
-
-/* C++ interface version */
-#define WELSVP_INTERFACE_VERION                    (0x8000 + (WELSVP_VERSION & 0x7fff))
-#define WELSVP_EXTERNC_BEGIN                       extern "C" {
-#define WELSVP_EXTERNC_END                         }
-
-#else    /* C style interface */
-
-/* It is recommended to invoke the C interface via these macros for convenience.
- * The context argument is the pCtx member of IWelsVPc (the struct has no
- * member named h); p is parenthesised so that any pointer expression works. */
-#define IWelsVPFunc_Init(p, a, b)                  (p)->Init((p)->pCtx, a, b)
-#define IWelsVPFunc_Uninit(p, a)                   (p)->Uninit((p)->pCtx, a)
-#define IWelsVPFunc_Flush(p, a)                    (p)->Flush((p)->pCtx, a)
-#define IWelsVPFunc_Process(p, a, b, c)            (p)->Process((p)->pCtx, a, b, c)
-#define IWelsVPFunc_Get(p, a, b)                   (p)->Get((p)->pCtx, a, b)
-#define IWelsVPFunc_Set(p, a, b)                   (p)->Set((p)->pCtx, a, b)
-#define IWelsVPFunc_SpecialFeature(p, a, b, c)     (p)->SpecialFeature((p)->pCtx, a, b, c)
-
-/* C interface version */
-#define WELSVP_INTERFACE_VERION                    (0x0001 + (WELSVP_VERSION & 0x7fff))
-#define WELSVP_EXTERNC_BEGIN
-#define WELSVP_EXTERNC_END
-
-#endif
-
-WELSVP_EXTERNC_BEGIN
-EResult WELSAPI CreateVpInterface (void** ppCtx, int iVersion /*= WELSVP_INTERFACE_VERION*/);
-EResult WELSAPI DestroyVpInterface (void* pCtx , int iVersion /*= WELSVP_INTERFACE_VERION*/);
-WELSVP_EXTERNC_END
-
-//////////////////////////////////////////////////////////////////////////////////////////////
-#endif // IWELSVP_H_
-
-
--- a/processing/src/adaptivequantization/AdaptiveQuantization.cpp
+++ /dev/null
@@ -1,256 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include "AdaptiveQuantization.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-
-
-#define AVERAGE_TIME_MOTION                   (0.3) //0.3046875 // 1/4 + 1/16 - 1/128 ~ 0.3
-#define AVERAGE_TIME_TEXTURE_QUALITYMODE  (1.0) //0.5 // 1/2
-#define AVERAGE_TIME_TEXTURE_BITRATEMODE  (0.875) //0.5 // 1/2
-#define MODEL_ALPHA                           (0.9910) //1.5 //1.1102
-#define MODEL_TIME                            (5.8185) //9.0 //5.9842
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Stores the CPU feature flags, tags the strategy as METHOD_ADAPTIVE_QUANT,
-// zeroes the parameter block and picks the variance kernel for iCpuFlag.
-CAdaptiveQuantization::CAdaptiveQuantization (int32_t iCpuFlag)
-  : m_pfVar (NULL),
-    m_CPUFlag (iCpuFlag) {
-  m_eMethod = METHOD_ADAPTIVE_QUANT;
-  WelsMemset (&m_sAdaptiveQuantParam, 0, sizeof (m_sAdaptiveQuantParam));
-  WelsInitVarFunc (m_pfVar, m_CPUFlag);
-}
-
-// Nothing to release: the object does not allocate any of the buffers it uses.
-CAdaptiveQuantization::~CAdaptiveQuantization() {
-}
-
-// Derives a per-macroblock delta QP from luma motion (residual variance
-// against the reference frame) and luma texture (variance of the source),
-// writes it to m_sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp and stores
-// the frame average in m_sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp.
-//
-//   iType       not consulted by this implementation
-//   pSrcPixMap  current frame; plane 0, stride 0 and sRect width/height are read
-//   pRefPixMap  reference frame; plane 0 and stride 0 are read
-//
-// Returns RET_SUCCESS, or RET_INVALIDPARAM when a pixmap, a luma plane, one of
-// the buffers supplied through Set(), or a whole macroblock is missing.
-EResult CAdaptiveQuantization::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
-  if (pSrcPixMap == NULL || pRefPixMap == NULL) {
-    return RET_INVALIDPARAM;
-  }
-
-  uint8_t* pRefFrameY = (uint8_t*)pRefPixMap->pPixel[0];
-  uint8_t* pCurFrameY = (uint8_t*)pSrcPixMap->pPixel[0];
-  const int32_t iRefStride = pRefPixMap->iStride[0];
-  const int32_t iCurStride = pSrcPixMap->iStride[0];
-
-  const int32_t iMbWidth    = pSrcPixMap->sRect.iRectWidth  >> 4;
-  const int32_t iMbHeight   = pSrcPixMap->sRect.iRectHeight >> 4;
-  const int32_t iMbTotalNum = iMbWidth * iMbHeight;
-
-  SMotionTextureUnit* pMotionTexture  = m_sAdaptiveQuantParam.pMotionTextureUnit;
-  SVAACalcResult*     pVaaCalcResults = m_sAdaptiveQuantParam.pCalcResult;
-  signed char*        pDeltaQpMap     = m_sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp;
-
-  // The parameter block is zeroed by the constructor, so these pointers are
-  // NULL until Set() has supplied them; refuse to run instead of crashing.
-  if (pRefFrameY == NULL || pCurFrameY == NULL || pMotionTexture == NULL
-      || pVaaCalcResults == NULL || pDeltaQpMap == NULL) {
-    return RET_INVALIDPARAM;
-  }
-  // With no whole macroblock the averages below would be 0/0.
-  if (iMbWidth <= 0 || iMbHeight <= 0) {
-    return RET_INVALIDPARAM;
-  }
-
-  /////////////////////////////////////// motion //////////////////////////////////
-  //  motion MB residual variance / texture MB variance
-  double_t dAverageMotionIndex  = 0.0;
-  double_t dAverageTextureIndex = 0.0;
-
-  if (pVaaCalcResults->pRefY == pRefFrameY && pVaaCalcResults->pCurY == pCurFrameY) {
-    // The VAA statistics refer to exactly this frame pair: reuse them rather
-    // than touching the pixels again.
-    for (int32_t iMbIndex = 0; iMbIndex < iMbTotalNum; ++iMbIndex) {
-      int32_t iSumDiff = pVaaCalcResults->pSad8x8[iMbIndex][0];
-      iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][1];
-      iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][2];
-      iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][3];
-
-      const int32_t iSQDiff = pVaaCalcResults->pSsd16x16[iMbIndex];
-      int32_t       iSum    = pVaaCalcResults->pSum16x16[iMbIndex];
-      const int32_t iSQSum  = pVaaCalcResults->pSumOfSquare16x16[iMbIndex];
-
-      // variance = mean of squares - square of mean, over the 256 pixels of a MB
-      iSumDiff = iSumDiff >> 8;
-      pMotionTexture->uiMotionIndex = (iSQDiff >> 8) - (iSumDiff * iSumDiff);
-
-      iSum = iSum >> 8;
-      pMotionTexture->uiTextureIndex = (iSQSum >> 8) - (iSum * iSum);
-
-      dAverageMotionIndex  += pMotionTexture->uiMotionIndex;
-      dAverageTextureIndex += pMotionTexture->uiTextureIndex;
-      pMotionTexture++;
-    }
-  } else {
-    // No matching statistics: measure every macroblock with the variance kernel.
-    for (int32_t j = 0; j < iMbHeight; j++) {
-      uint8_t* pRefFrameTmp = pRefFrameY;
-      uint8_t* pCurFrameTmp = pCurFrameY;
-      for (int32_t i = 0; i < iMbWidth; i++) {
-        m_pfVar (pRefFrameTmp, iRefStride, pCurFrameTmp, iCurStride, pMotionTexture);
-        dAverageMotionIndex  += pMotionTexture->uiMotionIndex;
-        dAverageTextureIndex += pMotionTexture->uiTextureIndex;
-        pMotionTexture++;
-        pRefFrameTmp += MB_WIDTH_LUMA;
-        pCurFrameTmp += MB_WIDTH_LUMA;
-      }
-      // advance by 16 pixel rows to the next macroblock row
-      pRefFrameY += (iRefStride) << 4;
-      pCurFrameY += (iCurStride) << 4;
-    }
-  }
-
-  dAverageMotionIndex  = dAverageMotionIndex / iMbTotalNum;
-  dAverageTextureIndex = dAverageTextureIndex / iMbTotalNum;
-  // A (near) zero average is replaced by 1.0 so that it can serve as a divisor below.
-  if ((dAverageMotionIndex <= PESN) && (dAverageMotionIndex >= -PESN)) {
-    dAverageMotionIndex = 1.0;
-  }
-  if ((dAverageTextureIndex <= PESN) && (dAverageTextureIndex >= -PESN)) {
-    dAverageTextureIndex = 1.0;
-  }
-
-  //  motion mb residual map to QP
-  //  texture mb original map to QP
-  dAverageMotionIndex = AVERAGE_TIME_MOTION * dAverageMotionIndex;
-  if (m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_QUALITY_MODE) {
-    dAverageTextureIndex = AVERAGE_TIME_TEXTURE_QUALITYMODE * dAverageTextureIndex;
-  } else {
-    dAverageTextureIndex = AVERAGE_TIME_TEXTURE_BITRATEMODE * dAverageTextureIndex;
-  }
-
-  int32_t iAverMotionTextureIndexToDeltaQp = 0;
-  pMotionTexture = m_sAdaptiveQuantParam.pMotionTextureUnit;
-  // Macroblocks are stored in raster order, so one flat index covers the
-  // whole frame (same element as j * iMbWidth + i).
-  for (int32_t iMbIndex = 0; iMbIndex < iMbTotalNum; ++iMbIndex) {
-    double_t a = pMotionTexture->uiTextureIndex / dAverageTextureIndex;
-    double_t dQStep = (a - 1) / (a + MODEL_ALPHA);
-    const double_t dLumaTextureDeltaQp = MODEL_TIME * dQStep;// range +- 6
-
-    int8_t iMotionTextureIndexToDeltaQp = (int8_t)dLumaTextureDeltaQp;
-
-    a = pMotionTexture->uiMotionIndex / dAverageMotionIndex;
-    dQStep = (a - 1) / (a + MODEL_ALPHA);
-    const double_t dLumaMotionDeltaQp = MODEL_TIME * dQStep;// range +- 6
-
-    // Quality mode only lets motion lower the QP; bitrate mode always applies it.
-    if ((m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_QUALITY_MODE && dLumaMotionDeltaQp < -PESN)
-        || (m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_BITRATE_MODE)) {
-      iMotionTextureIndexToDeltaQp += (int8_t)dLumaMotionDeltaQp;
-    }
-
-    pDeltaQpMap[iMbIndex] = iMotionTextureIndexToDeltaQp;
-    iAverMotionTextureIndexToDeltaQp += iMotionTextureIndexToDeltaQp;
-    pMotionTexture++;
-  }
-  m_sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp = (1.0 * iAverMotionTextureIndexToDeltaQp) / iMbTotalNum;
-
-  return RET_SUCCESS;
-}
-
-
-
-// Copies the caller's SAdaptiveQuantizationParam into the object.
-// iType is not consulted. Returns RET_INVALIDPARAM for a NULL pParam,
-// RET_SUCCESS otherwise.
-EResult CAdaptiveQuantization::Set (int32_t iType, void* pParam) {
-  const SAdaptiveQuantizationParam* pNewParam = static_cast<const SAdaptiveQuantizationParam*> (pParam);
-  if (NULL == pNewParam) {
-    return RET_INVALIDPARAM;
-  }
-  m_sAdaptiveQuantParam = *pNewParam;
-  return RET_SUCCESS;
-}
-
-// Writes the stored frame-average delta QP into the caller's
-// SAdaptiveQuantizationParam; no other field of *pParam is touched.
-// iType is not consulted. Returns RET_INVALIDPARAM for a NULL pParam,
-// RET_SUCCESS otherwise.
-EResult CAdaptiveQuantization::Get (int32_t iType, void* pParam) {
-  if (NULL != pParam) {
-    SAdaptiveQuantizationParam* pOut = static_cast<SAdaptiveQuantizationParam*> (pParam);
-    pOut->dAverMotionTextureIndexToDeltaQp = m_sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp;
-    return RET_SUCCESS;
-  }
-  return RET_INVALIDPARAM;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-
-// Selects the 16x16 variance kernel and stores it in pfVar.
-//   pfVar     receives the chosen function pointer
-//   iCpuFlag  CPU feature bits; only WELS_CPU_SSE2 is tested here
-void CAdaptiveQuantization::WelsInitVarFunc (PVarFunc& pfVar,  int32_t iCpuFlag) {
-  // The plain C implementation is the default.
-  pfVar = SampleVariance16x16_c;
-
-#ifdef X86_ASM
-  // Only in builds with X86_ASM defined: switch to the SSE2 routine when the flag is set.
-  if (iCpuFlag & WELS_CPU_SSE2) {
-    pfVar = SampleVariance16x16_sse2;
-  }
-#endif
-}
-
-// C reference kernel: for one MB_WIDTH_LUMA x MB_WIDTH_LUMA luma block it
-// stores in pMotionTexture
-//   uiMotionIndex  = variance of |ref - src|
-//   uiTextureIndex = variance of src
-// each computed as (sum of squares >> 8) - (sum >> 8)^2.
-//   pRefY / iRefStride  reference block and its row pitch
-//   pSrcY / iSrcStride  source block and its row pitch
-void SampleVariance16x16_c (uint8_t* pRefY, int32_t iRefStride, uint8_t* pSrcY, int32_t iSrcStride,
-                            SMotionTextureUnit* pMotionTexture) {
-  uint32_t uiDiffSqAcc = 0;   // sum of squared absolute differences
-  uint32_t uiSrcSqAcc  = 0;   // sum of squared source pixels
-  uint16_t uiDiffAcc   = 0;   // sum of absolute differences
-  uint16_t uiSrcAcc    = 0;   // sum of source pixels
-
-  const uint8_t* pRefRow = pRefY;
-  const uint8_t* pSrcRow = pSrcY;
-  for (int32_t iRow = 0; iRow < MB_WIDTH_LUMA; ++iRow) {
-    for (int32_t iCol = 0; iCol < MB_WIDTH_LUMA; ++iCol) {
-      const uint32_t uiDiff = WELS_ABS (pRefRow[iCol] - pSrcRow[iCol]);
-      uiDiffAcc   += uiDiff;
-      uiDiffSqAcc += uiDiff * uiDiff;
-
-      uiSrcAcc   += pSrcRow[iCol];
-      uiSrcSqAcc += pSrcRow[iCol] * pSrcRow[iCol];
-    }
-    pRefRow += iRefStride;
-    pSrcRow += iSrcStride;
-  }
-
-  // >> 8 divides by 256
-  const uint16_t uiDiffMean = uiDiffAcc >> 8;
-  const uint16_t uiSrcMean  = uiSrcAcc >> 8;
-  pMotionTexture->uiMotionIndex  = (uiDiffSqAcc >> 8) - (uiDiffMean * uiDiffMean);
-  pMotionTexture->uiTextureIndex = (uiSrcSqAcc >> 8) - (uiSrcMean * uiSrcMean);
-}
-
-WELSVP_NAMESPACE_END
--- a/processing/src/adaptivequantization/AdaptiveQuantization.h
+++ /dev/null
@@ -1,85 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	        :  AdaptiveQuantization.h
- *
- * \brief	    :  adaptive quantization class of wels video processor class
- *
- * \date         :  2011/03/21
- *
- * \description  :  1. rewrite the package code of scene change detection class
- *
- */
-
-#ifndef WELSVP_ADAPTIVEQUANTIZATION_H
-#define WELSVP_ADAPTIVEQUANTIZATION_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-typedef void (VarFunc) (uint8_t* pRefY, int32_t iRefStrideY, uint8_t* pSrc, int32_t iSrcStrideY,
-                        SMotionTextureUnit* pMotionTexture);
-
-typedef VarFunc*   PVarFunc;
-
-VarFunc      SampleVariance16x16_c;
-
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-VarFunc      SampleVariance16x16_sse2;
-WELSVP_EXTERN_C_END
-#endif
-
-
-/* Adaptive-quantization strategy of the video processor; derives from IStrategy. */
-class CAdaptiveQuantization : public IStrategy {
- public:
-  /* iCpuFlag: CPU feature bits, forwarded to WelsInitVarFunc. */
-  CAdaptiveQuantization (int32_t iCpuFlag);
-  ~CAdaptiveQuantization();
-
-  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef);
-  /* pParam is expected to point at an SAdaptiveQuantizationParam -- see the .cpp for the cast. */
-  EResult Set (int32_t iType, void* pParam);
-  EResult Get (int32_t iType, void* pParam);
-
- private:
-  /* Stores the chosen variance function in pfVar. */
-  void WelsInitVarFunc (PVarFunc& pfVar, int32_t iCpuFlag);
-
- private:
-  PVarFunc			                   m_pfVar;                /* variance function chosen by WelsInitVarFunc */
-  int32_t                                  m_CPUFlag;              /* CPU feature bits given to the constructor */
-  SAdaptiveQuantizationParam    m_sAdaptiveQuantParam;  /* parameter block exchanged through Set()/Get() */
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/asm/asm_inc.asm
+++ /dev/null
@@ -1,235 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  asm_inc.asm
-;*
-;*  Abstract
-;*      macro and constant
-;*
-;*  History
-;*      8/5/2009 Created
-;*
-;*
-;*************************************************************************/
-;***********************************************************************
-; Options, for DEBUG: MOVDQ selects movdqa or movdqu; WELSEMMS expands to emms or to nothing
-;***********************************************************************
-
-%if 1
-	%define MOVDQ movdqa
-%else
-	%define MOVDQ movdqu
-%endif
-
-%if 1
-	%define WELSEMMS	emms
-%else
-	%define WELSEMMS
-%endif
-
-BITS 32	; every file that includes this one is assembled as 32-bit code
-
-;***********************************************************************
-; Macros
-;***********************************************************************
-
-%macro WELS_EXTERN 1	; declare symbol %1 global; when PREFIX is defined export it as _%1 and redefine %1 to _%1
-	%ifdef PREFIX
-		global _%1
-		%define %1 _%1
-	%else
-		global %1
-	%endif
-%endmacro
-
-%macro WELS_AbsW 2	; %1 = abs(%1) per signed 16-bit word; %2 = scratch register (clobbered)
-	pxor        %2, %2	; %2 = 0
-    psubw       %2, %1	; %2 = -%1
-    pmaxsw      %1, %2	; %1 = max(%1, -%1)
-%endmacro
-
-%macro MMX_XSwap  4	; %1 = unpack suffix (wd, dq, ...), %2/%3 = sources, %4 = receives the high interleave
-    movq		%4, %2	; keep a copy of %2 before it is overwritten
-    punpckh%1   %4, %3	; %4 = high-half interleave of the original %2 with %3
-    punpckl%1   %2, %3	; %2 = low-half interleave of %2 with %3; %3 is left unchanged
-%endmacro
-
-; 4x4 word transpose; in: %1, %2, %3, %4 (%5 = scratch); out in row order: %1, %4, %5, %3 (i.e. mm1, mm4, mm5, mm3); %2 is clobbered
-%macro MMX_Trans4x4W 5
-    MMX_XSwap wd, %1, %2, %5
-    MMX_XSwap wd, %3, %4, %2
-    MMX_XSwap dq, %1, %3, %4
-    MMX_XSwap dq, %5, %2, %3
-%endmacro
-
-;for TRANSPOSE: %1 = unpack suffix (bw, wd, dq, qdq), %2 = low interleave of %2/%3 on exit, %4 = high interleave, %3 unchanged
-%macro SSE2_XSawp 4
-    movdqa      %4, %2
-    punpckl%1   %2, %3
-    punpckh%1   %4, %3
-%endmacro
-
-; in: xmm1, xmm2, xmm3, xmm4 (%5 = scratch, xmm5 in this example)  pOut:  xmm1, xmm4, xmm5, xmm3 (i.e. %1, %4, %5, %3); %2 is clobbered
-%macro SSE2_Trans4x4D 5
-    SSE2_XSawp dq,  %1, %2, %5
-    SSE2_XSawp dq,  %3, %4, %2
-    SSE2_XSawp qdq, %1, %3, %4
-    SSE2_XSawp qdq, %5, %2, %3
-%endmacro
-
-;in: xmm0, xmm1, xmm2, xmm3 (%5 = scratch, xmm4 in this example)  pOut:  xmm0, xmm1, xmm3, xmm4 (i.e. %1, %2, %4, %5); %3 is clobbered
-%macro SSE2_TransTwo4x4W 5
-    SSE2_XSawp wd,  %1, %2, %5
-    SSE2_XSawp wd,  %3, %4, %2
-    SSE2_XSawp dq,  %1, %3, %4
-    SSE2_XSawp dq,  %5, %2, %3
-    SSE2_XSawp qdq, %1, %5, %2
-    SSE2_XSawp qdq, %4, %3, %5
-%endmacro
-
-;in:  m1, m2, m3, m4, m5, m6, m7, m8 (= %1..%8); %9 = ninth register used as temporary storage
-;pOut: m5, m3, m4, m8, m6, m2, m7, m1 (output order as stated by the original author; not re-derived here)
-%macro SSE2_TransTwo8x8B 9
-	movdqa	%9,	%8	; park %8 in %9 because the next swap overwrites %8
-	SSE2_XSawp bw,  %1, %2, %8
-	SSE2_XSawp bw,  %3, %4, %2
-	SSE2_XSawp bw,  %5, %6, %4
-	movdqa	%6, %9
-	movdqa	%9, %4
-	SSE2_XSawp bw,  %7, %6, %4
-
-	SSE2_XSawp wd,  %1, %3, %6
-	SSE2_XSawp wd,  %8, %2, %3
-	SSE2_XSawp wd,  %5, %7, %2
-	movdqa	%7, %9
-	movdqa	%9, %3
-	SSE2_XSawp wd,  %7, %4, %3
-
-	SSE2_XSawp dq,  %1, %5, %4
-	SSE2_XSawp dq,  %6, %2, %5
-	SSE2_XSawp dq,  %8, %7, %2
-	movdqa	%7, %9
-	movdqa	%9, %5
-	SSE2_XSawp dq,  %7, %3, %5
-
-	SSE2_XSawp qdq,  %1, %8, %3
-	SSE2_XSawp qdq,  %4, %2, %8
-	SSE2_XSawp qdq,  %6, %7, %2
-	movdqa	%7, %9
-	movdqa	%9, %1
-	SSE2_XSawp qdq,  %7, %5, %1
-	movdqa	%5, %9
-%endmacro
-
-;example arguments: xmm0, xmm6, xmm7, [eax], [ecx]  (%1 = result, %2 = scratch, %3 = zero register, %4/%5 = 8-byte memory operands)
-;xmm7 = 0, eax = pix1, ecx = pix2, xmm0 receives the 8 word-sized differences pix1[i] - pix2[i]
-%macro SSE2_LoadDiff8P 5
-    movq         %1, %4
-    punpcklbw    %1, %3
-    movq         %2, %5
-    punpcklbw    %2, %3
-    psubw        %1, %2
-%endmacro
-
-; word-wise butterfly: %2 = %1 + %2, %1 = %1 - (original %2); %3 = scratch register (clobbered)
-%macro SSE2_SumSub 3
-	movdqa  %3, %2
-    paddw   %2, %1
-    psubw   %1, %3
-%endmacro
-
-
-%macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
-	mov %3h, %3l		; duplicate the low byte into the second byte, e.g. ah = al
-	movd %1, e%3x		; i.e., %1 = eax (=b0)
-	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
-	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
-%endmacro
-
-;broadcast the low 16-bit word of %2 into all 8 words of xmm register %1
-%macro  SSE2_Copy8Times 2
-		movd	%1, %2
-		punpcklwd %1, %1	; w0 w0 in the low dword
-		pshufd	%1,	%1,	0	; replicate the low dword four times
-%endmacro
-
-;broadcast a byte value (held in the low word of %2) into all 16 bytes of xmm register %1
-%macro  SSE2_Copy16Times 2
-		movd		%1, %2
-		pshuflw		%1, %1, 0	; low four words = word 0
-		punpcklqdq	%1, %1	; all eight words = word 0
-		packuswb	%1,	%1	; words -> bytes with unsigned saturation, so the word must be in 0..255 to survive unchanged
-%endmacro
-
-
-
-;***********************************************************************
-;preprocessor constants
-;***********************************************************************
-;dw 32,32,32,32,32,32,32,32 for xmm (built in-register, no memory load)
-;dw 32,32,32,32 for mm
-%macro WELS_DW32 1
-	pcmpeqw %1,%1	; all bits set
-	psrlw %1,15	; each word = 1
-	psllw %1,5	; each word = 32
-%endmacro
-
-;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm (built in-register, no memory load)
-;dw 1, 1, 1, 1 for mm
-%macro WELS_DW1 1
-	pcmpeqw %1,%1	; all bits set
-	psrlw %1,15	; each word = 1
-%endmacro
-
-;clear register %1 (all bits 0), for xmm and mm
-%macro	WELS_Zero 1
-	pxor %1, %1
-%endmacro
-
-;dd 1, 1, 1, 1 for xmm (built in-register, no memory load)
-;dd 1, 1 for mm
-%macro WELS_DD1 1
-	pcmpeqw %1,%1	; all bits set
-	psrld %1,31	; each dword = 1
-%endmacro
-
-;db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 (built in-register, no memory load)
-%macro WELS_DB1 1
-	pcmpeqw %1,%1	; all bits set
-	psrlw %1,15	; each word = 1
-	packuswb %1,%1	; pack the words to bytes: each byte = 1
-%endmacro
-
-
-
-
-
-
--- a/processing/src/asm/cpuid.asm
+++ /dev/null
@@ -1,169 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*	cpuid.asm
-;*
-;*  Abstract
-;*		verify cpuid feature support and cpuid detection
-;*
-;*  History
-;*      04/29/2009	Created
-;*
-;*************************************************************************/
-
-bits 32
-
-;******************************************************************************************
-; Macros
-;******************************************************************************************
-
-%macro WELS_EXTERN 1	; declare symbol %1 global; when PREFIX is defined export it as _%1 and redefine %1 to _%1
-	%ifdef PREFIX
-		global _%1
-		%define %1 _%1
-	%else
-		global %1
-	%endif
-%endmacro
-
-;******************************************************************************************
-; Code
-;******************************************************************************************
-
-SECTION .text
-
-; refer to "The IA-32 Intel(R) Architecture Software Developers Manual, Volume 2A A-M"
-; section CPUID - CPU Identification
-
-WELS_EXTERN WelsCPUIdVerify
-ALIGN 16
-;******************************************************************************************
-;   int32_t WelsCPUIdVerify()   ; result in eax: 0 = cpuid not supported, otherwise supported (see NOTE(review) below)
-;******************************************************************************************
-WelsCPUIdVerify:
-    pushfd					; decrease the SP by 4 and load EFLAGS register onto stack, pushfd 32 bit and pushf for 16 bit
-	pushfd					; push EFLAGS twice: one copy is popped for processing, the other is restored by popfd
-    pop     ecx				; ecx = copy of EFLAGS for bit manipulation
-    mov     eax, ecx		; eax = second working copy of EFLAGS
-    xor     eax, 00200000h	; toggle the ID flag (bit 21) in the copy held in eax
-	xor		eax, ecx		; eax = toggled copy XOR original = 00200000h. NOTE(review): the toggled value is never written to EFLAGS and read back, so the result is always non-zero - confirm whether a real ID-flag probe was intended
-    popfd					; store back EFLAGS and keep unchanged for system
-    ret
-
-WELS_EXTERN WelsCPUId
-ALIGN 16
-;****************************************************************************************************
-;   void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
-;****************************************************************************************************
-WelsCPUId:
-	push	ebx				; ebx is overwritten by cpuid, edi is used as the store pointer; both are restored before ret
-	push	edi
-
-	mov     eax, [esp+12]	; operating index (uiIndex): two pushes + return address = 12 bytes above esp
-    cpuid					; NOTE(review): ecx is not initialised before cpuid, so the sub-leaf is whatever ecx held on entry - confirm callers only query leaves that ignore ecx
-
-	; store the four result registers through the caller-supplied pointers
-	mov     edi, [esp+16]	; pFeatureA
-    mov     [edi], eax
-    mov     edi, [esp+20]	; pFeatureB
-    mov     [edi], ebx
-    mov     edi, [esp+24]	; pFeatureC
-    mov     [edi], ecx
-    mov     edi, [esp+28]	; pFeatureD
-    mov     [edi], edx
-
-	pop		edi
-    pop     ebx
-	ret
-
-WELS_EXTERN WelsCPUSupportAVX
-; must be called after cpuid leaf 1 has been executed; pass the eax and ecx values it returned
-ALIGN 16
-;****************************************************************************************************
-;   int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx )   ; returns 1 if AVX is usable, 0 otherwise
-;****************************************************************************************************
-WelsCPUSupportAVX:
-	mov eax, [esp+4]		; first argument; loaded but never read before eax is overwritten
-	mov ecx, [esp+8]		; second argument: feature flags tested below
-
-	; refer to detection of AVX addressed in INTEL AVX manual document
-	and ecx, 018000000H
-	cmp ecx, 018000000H		; check both OSXSAVE and AVX feature flags
-	jne avx_not_supported
-	; processor supports AVX instructions and XGETBV is enabled by OS
-	mov ecx, 0				; specify 0 for XFEATURE_ENABLED_MASK register
-	XGETBV					; result in EDX:EAX
-	and eax, 06H
-	cmp eax, 06H			; check OS has enabled both XMM and YMM state support
-	jne avx_not_supported
-	mov eax, 1
-	ret
-avx_not_supported:
-	mov eax, 0
-	ret
-
-WELS_EXTERN WelsCPUSupportFMA
-; must be called after cpuid leaf 1 has been executed; pass the eax and ecx values it returned
-ALIGN 16
-;****************************************************************************************************
-;   int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx )   ; returns 1 if FMA is usable, 0 otherwise
-;****************************************************************************************************
-WelsCPUSupportFMA:
-	mov eax, [esp+4]		; first argument; loaded but never read before eax is overwritten
-	mov ecx, [esp+8]		; second argument: feature flags tested below
-
-	; refer to detection of FMA addressed in INTEL AVX manual document
-	and ecx, 018001000H
-	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
-	jne fma_not_supported
-	; processor supports AVX,FMA instructions and XGETBV is enabled by OS
-	mov ecx, 0				; specify 0 for XFEATURE_ENABLED_MASK register
-	XGETBV					; result in EDX:EAX
-	and eax, 06H
-	cmp eax, 06H			; check OS has enabled both XMM and YMM state support
-	jne fma_not_supported
-	mov eax, 1
-	ret
-fma_not_supported:
-	mov eax, 0
-	ret
-
-WELS_EXTERN WelsEmms
-ALIGN 16
-;******************************************************************************************
-;   void WelsEmms()   ; C-callable wrapper around the emms instruction
-;******************************************************************************************
-WelsEmms:
-	emms	; empty mmx technology states
-	ret
-
-
-
--- a/processing/src/asm/denoisefilter.asm
+++ /dev/null
@@ -1,263 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2010-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  denoisefilter.asm
-;*
-;*  Abstract
-;*      denoise for SVC2.1
-;*  History
-;*      4/13/2010 Created
-;*      7/30/2010 Modified
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Constant: sse2_32 / sse2_20 = eight 16-bit words holding 32 / 20
-;***********************************************************************
-SECTION .rodata align=16
-
-sse2_32 times 8 dw 32
-sse2_20 times 8 dw 20
-
-
-BITS 32
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-%macro	WEIGHT_LINE	9	; %1,%2,%8 = scratch, %3 = words of 32, %4 = weight accumulator, %5 = weighted-sum accumulator, %6 = center pixels (words), %7 = zero, %9 = memory operand with 8 neighbour pixels
-		movq		%2,	%9
-		punpcklbw	%2,	%7		; widen the 8 neighbour pixels to words
-		movdqa		%8,	%2
-
-		movdqa		%1,	%6
-		psubusb		%1,	%8
-		psubusb		%8,	%6
-		por			%8,	%1		; ABS(curPixel - centerPixel);
-
-		movdqa		%1,	%3
-		psubusb		%1,	%8		; %1 = 32 - diff, saturating at 0
-
-		pmullw		%1,	%1
-		psrlw		%1,	5		; weight = (32 - diff)^2 >> 5
-		pmullw		%2,	%1		; %2 = neighbour pixel * weight
-		paddusw		%4,	%1		; accumulate weight
-		paddusw		%5,	%2		; accumulate weighted pixel
-%endmacro
-
-%macro	WEIGHT_LINE1_UV	4	; %1 = source row (16 bytes), %2 = scratch, %3 = word accumulator, %4 = zero; adds taps at byte offsets 0..4 with weights 1 1 2 1 1
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2		; offset 0, weight 1
-
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2		; offset 1, weight 1
-
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2		; offset 2, weight 2
-
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		paddw		%3,	%2		; offset 3, weight 1
-
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		paddw		%3,	%2		; offset 4, weight 1
-%endmacro
-
-%macro	WEIGHT_LINE2_UV	4	; %1 = source row (16 bytes), %2 = scratch, %3 = word accumulator, %4 = zero; adds taps at byte offsets 0..4 with weights 1 2 4 2 1
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2		; offset 0, weight 1
-
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2		; offset 1, weight 2
-
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2		; offset 2, weight 4
-
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2		; offset 3, weight 2
-
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		paddw		%3,	%2		; offset 4, weight 1
-%endmacro
-
-%macro	WEIGHT_LINE3_UV	4	; %1 = source row (16 bytes), %2 = scratch, %3 = word accumulator, %4 = zero; adds taps at byte offsets 0..4 with weights 2 4 20 4 2
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2		; offset 0, weight 2
-
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2		; offset 1, weight 4
-
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		pmullw		%2,	[sse2_20]
-		paddw		%3,	%2		; offset 2 (center column), weight 20
-
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2		; offset 3, weight 4
-
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2		; offset 4, weight 2
-%endmacro
-
-ALIGN 16
-WELS_EXTERN BilateralLumaFilter8_sse2
-;***********************************************************************
-;  BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);   filters 8 pixels in place using their 3x3 neighbourhoods
-;***********************************************************************
-;	1	2	3
-;	4	0	5
-;	6	7	8
-;	0:	the center point
-%define		pushsize	4
-%define		pixel		esp + pushsize + 4
-%define		stride		esp + pushsize + 8
-BilateralLumaFilter8_sse2:
-		push		ebx
-
-		pxor		xmm7,	xmm7
-		mov			eax,	[pixel]
-		mov			ebx,	eax			; keep the original pointer for the final store
-		movq		xmm6,	[eax]
-		punpcklbw	xmm6,	xmm7		; xmm6 = the 8 center pixels as words
-		movdqa		xmm3,	[sse2_32]
-		pxor		xmm4,	xmm4		; nTotWeight
-		pxor		xmm5,	xmm5		; nSum
-
-		dec			eax					; eax -> left neighbour of the first pixel
-		mov			ecx,	[stride]
-
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 4
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 5
-
-		sub			eax,	ecx			; move to the row above
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 1
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 2
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 3
-
-		lea			eax,	[eax + ecx * 2]	; move to the row below
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 6
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 7
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 8
-
-		pcmpeqw		xmm0,	xmm0
-		psrlw		xmm0,	15
-		psllw		xmm0,	8			; xmm0 = 256 in every word
-		psubusw		xmm0,	xmm4		; remaining weight = 256 - nTotWeight (saturating at 0)
-		pmullw		xmm0,	xmm6		; the center pixel takes the remaining weight
-		paddusw		xmm5,	xmm0
-		psrlw		xmm5,	8			; divide the weighted sum by 256
-		packuswb	xmm5,	xmm5
-		movq		[ebx],	xmm5		; overwrite the 8 source pixels with the result
-
-		pop ebx
-		ret
-
-WELS_EXTERN WaverageChromaFilter8_sse2
-;***********************************************************************
-; void		WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);   filters 8 pixels in place
-;***********************************************************************
-;5x5 filter (weights sum to 64, hence the final shift right by 6):
-;1	1	2	1	1
-;1	2	4	2	1
-;2	4	20	4	2
-;1	2	4	2	1
-;1	1	2	1	1
-
-ALIGN 16
-WaverageChromaFilter8_sse2:
-		mov		edx,	[esp + 4]	; pixels
-		mov		ecx,	[esp + 8]	; stride
-
-		mov		eax,	ecx
-		add		eax,	eax			; eax = 2 * stride
-		sub		edx,	eax			; pixels - 2 * stride
-		sub		edx,	2			; two columns to the left: top-left corner of the 5x5 window
-
-		pxor	xmm0,	xmm0		; zero register for byte-to-word unpacking
-		pxor	xmm3,	xmm3		; weighted-sum accumulator
-
-		movdqu		xmm1,	[edx]			; row -2
-		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
-
-		movdqu		xmm1,	[edx + ecx]		; row -1
-		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
-
-		add		edx,	eax					; edx = pixels - 2 (center row)
-		movdqu		xmm1,	[edx]			; row 0
-		WEIGHT_LINE3_UV	xmm1,	xmm2,	xmm3,	xmm0
-
-		movdqu		xmm1,	[edx + ecx]		; row +1
-		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
-
-		movdqu		xmm1,	[edx + ecx * 2]	; row +2
-		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
-
-		psrlw		xmm3,		6			; divide by 64
-		packuswb	xmm3,		xmm3
-		movq		[edx + 2],		xmm3	; edx + 2 = pixels: overwrite the 8 source pixels
-
-		ret
\ No newline at end of file
--- a/processing/src/asm/downsample_bilinear.asm
+++ /dev/null
@@ -1,1225 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*	downsample_bilinear.asm
-;*
-;*  Abstract
-;*		SIMD for pixel domain down sampling
-;*
-;*  History
-;*		10/22/2009	Created
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-
-;***********************************************************************
-; Some constants
-;***********************************************************************
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Byte-shuffle masks: shufb_mask_low lists the even byte indices, shufb_mask_high the odd ones, each followed by 80h (presumably pshufb controls that zero the upper byte of each word - TODO confirm at the use site)
-;***********************************************************************
-
-ALIGN 16
-shufb_mask_low:
-	db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
-shufb_mask_high:
-	db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
-
-
-ALIGN 16
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_sse(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_sse:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
-
-	sar ebp, $1			; iSrcHeight >> 1
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth kept in ebx
-	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes (the body runs at least once, even when the count is 0)
-.xloops:
-	; 1st part horizontal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movq mm0, [esi]			; 1st pSrc line
-	movq mm1, [esi+8]		; 1st pSrc line + 8
-	movq mm2, [esi+ecx]		; 2nd pSrc line
-	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
-
-	; to handle mm0, mm1, mm2, mm3
-	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm4, mm5		; d c D C b a B A
-	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
-
-	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm5, mm6		; h g H G f e F E
-	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
-
-	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm6, mm7		; l k L K j i J I
-	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
-
-	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm7, mm0 		; p o P O n m N M
-	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
-
-	; to handle mm4, mm5, mm6, mm7
-	movq mm0, mm4		;
-	punpckldq mm0, mm5 	; H G F E D C B A
-	punpckhdq mm4, mm5 	; h g f e d c b a
-
-	movq mm1, mm6
-	punpckldq mm1, mm7 	; P O N M L K J I
-	punpckhdq mm6, mm7 	; p o n m l k j i
-
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, held in mm0 until the 2nd horizontal part is done, then both are written to memory
-
-	; 2nd part horizontal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm1: d D c C b B a A	mm2: h H g G f F e E
-	;2nd Line Src:	mm3: l L k K j J i I   	mm4: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movq mm1, [esi+16]		; 1st pSrc line + 16
-	movq mm2, [esi+24]		; 1st pSrc line + 24
-	movq mm3, [esi+ecx+16]	; 2nd pSrc line + 16
-	movq mm4, [esi+ecx+24]	; 2nd pSrc line + 24
-
-	; to handle mm1, mm2, mm3, mm4
-	pshufw mm5, mm1, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm6, mm5, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm5, mm6		; d c D C b a B A
-	pshufw mm5, mm5, 0d8h  	; d c b a D C B A ; 11011000 B: mm5
-
-	pshufw mm6, mm2, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm7, mm6, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm6, mm7		; h g H G f e F E
-	pshufw mm6, mm6, 0d8h  	; h g f e H G F E ; 11011000 B: mm6
-
-	pshufw mm7, mm3, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm1, mm7, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm7, mm1		; l k L K j i J I
-	pshufw mm7, mm7, 0d8h  	; l k j i L K J I ; 11011000 B: mm7
-
-	pshufw mm1, mm4, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm2, mm1, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm1, mm2 		; p o P O n m N M
-	pshufw mm1, mm1, 0d8h  	; p o n m P O N M ; 11011000 B: mm1
-
-	; to handle mm5, mm6, mm7, mm1
-	movq mm2, mm5
-	punpckldq mm2, mm6 	; H G F E D C B A
-	punpckhdq mm5, mm6 	; h g f e d c b a
-
-	movq mm3, mm7
-	punpckldq mm3, mm1 	; P O N M L K J I
-	punpckhdq mm7, mm1 	; p o n m l k j i
-
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm2, mm5		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm3, mm7		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm2, mm3		; (temp_row1+temp_row2+1)>>1, result of the 2nd horizontal part
-
-	movq [edi  ], mm0
-	movq [edi+8], mm2
-
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; advance two source lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	WELSEMMS
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_sse:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
-
-	sar ebp, $1		; iSrcHeight >> 1
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth kept in ebx
-	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes (the body runs at least once, even when the count is 0)
-.xloops:
-	; horizontal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movq mm0, [esi]			; 1st pSrc line
-	movq mm1, [esi+8]		; 1st pSrc line + 8
-	movq mm2, [esi+ecx]		; 2nd pSrc line
-	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
-
-	; to handle mm0, mm1, mm2, mm3
-	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm4, mm5		; d c D C b a B A
-	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
-
-	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm5, mm6		; h g H G f e F E
-	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
-
-	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm6, mm7		; l k L K j i J I
-	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
-
-	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm7, mm0 		; p o P O n m N M
-	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
-
-	; to handle mm4, mm5, mm6, mm7
-	movq mm0, mm4		;
-	punpckldq mm0, mm5 	; H G F E D C B A
-	punpckhdq mm4, mm5 	; h g f e d c b a
-
-	movq mm1, mm6
-	punpckldq mm1, mm7 	; P O N M L K J I
-	punpckhdq mm6, mm7 	; p o n m l k j i
-
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, final 8 output bytes
-
-	movq [edi  ], mm0
-
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; advance two source lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	WELSEMMS
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx8_sse:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
-
-	sar ebp, $1		; iSrcHeight >> 1
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth kept in ebx
-	sar eax, $2		; (iSrcWidth >> 1) / 4		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 8 bytes (the body runs at least once, even when the count is 0)
-.xloops:
-	; horizontal loop: x8 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A
-	;2nd Line Src:	mm1: h H g G f F e E
-	;=> target:
-	;: H G F E D C B A
-	;: h g f e d c b a
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movq mm0, [esi]			; 1st pSrc line
-	movq mm1, [esi+ecx]		; 2nd pSrc line
-
-	; to handle mm0, mm1
-	pshufw mm2, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm3, mm2, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm2, mm3		; d c D C b a B A
-	pshufw mm2, mm2, 0d8h  	; d c b a D C B A ; 11011000 B: mm2
-
-	pshufw mm4, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm5, mm4, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm4, mm5		; h g H G f e F E
-	pshufw mm4, mm4, 0d8h  	; h g f e H G F E ; 11011000 B: mm4
-
-	; to handle mm2, mm4
-	movq mm0, mm2		;
-	punpckldq mm0, mm4 	; H G F E D C B A
-	punpckhdq mm2, mm4 	; h g f e d c b a
-
-	; avg within the 8 x 2 source block
-	pavgb mm0, mm2		; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
-	pshufw mm1, mm0, 04eh	; 01001110 B: swap the two dwords so temp_row2 lines up with temp_row1
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, the low dword holds the 4 output bytes
-
-	movd [edi],	mm0
-
-	; next unit
-	lea esi, [esi+8]
-	lea edi, [edi+4]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; advance two source lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	WELSEMMS
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-
-
-; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_ssse3(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_ssse3:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
-
-	sar ebp, $1			; iSrcHeight >> 1
-
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
-.xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
-	;				xmm1: p P o O n N m M l L k K j J i I
-	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
-	;				xmm3: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: P O N M L K J I H G F E D C B A
-	;: p o n m l k j i h g f e d c b a
-	;: P ..                          A
-	;: p ..                          a
-
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movdqa xmm0, [esi]			; 1st_src_line
-	movdqa xmm1, [esi+16]		; 1st_src_line + 16
-	movdqa xmm2, [esi+ecx]		; 2nd_src_line
-	movdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16
-
-	; packing & avg
-	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	; another implementation for xmm4 high bits
-;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm4
-
-	movdqa xmm5, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm1
-;	psrlw xmm5, 8
-	pavgb xmm1, xmm5
-
-	movdqa xmm4, xmm2
-	pshufb xmm2, xmm7
-	pshufb xmm4, xmm6
-;	psubb xmm4, xmm2
-;	psrlw xmm4, 8
-	pavgb xmm2, xmm4
-
-	movdqa xmm5, xmm3
-	pshufb xmm3, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm3
-;	psrlw xmm5, 8
-	pavgb xmm3, xmm5
-
-	packuswb xmm0, xmm1
-	packuswb xmm2, xmm3
-	pavgb xmm0, xmm2
-
-	; write pDst
-	movdqa [edi], xmm0
-
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_ssse3:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
-
-	sar ebp, $1		; iSrcHeight >> 1
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
-.xloops:
-	; horizonal loop: x16 bytes by source
-	;               mem  hi<-       ->lo
-	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
-	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movdqa xmm0, [esi]			; 1st_src_line
-	movdqa xmm1, [esi+ecx]		; 2nd_src_line
-
-	; packing & avg
-	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	; another implementation for xmm2 high bits
-;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm2
-
-	movdqa xmm3, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm3, xmm6
-;	psubb xmm3, xmm1
-;	psrlw xmm3, 8
-	pavgb xmm1, xmm3
-
-	pavgb xmm0, xmm1
-	packuswb xmm0, xmm1
-
-	; write pDst
-	movq [edi], xmm0
-
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_sse4(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_sse4:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
-
-	sar ebp, $1			; iSrcHeight >> 1
-
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
-.xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
-	;				xmm1: p P o O n N m M l L k K j J i I
-	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
-	;				xmm3: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: P O N M L K J I H G F E D C B A
-	;: p o n m l k j i h g f e d c b a
-	;: P ..                          A
-	;: p ..                          a
-
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movntdqa xmm0, [esi]			; 1st_src_line
-	movntdqa xmm1, [esi+16]		; 1st_src_line + 16
-	movntdqa xmm2, [esi+ecx]		; 2nd_src_line
-	movntdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16
-
-	; packing & avg
-	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm4
-
-	movdqa xmm5, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm1
-;	psrlw xmm5, 8
-	pavgb xmm1, xmm5
-
-	movdqa xmm4, xmm2
-	pshufb xmm2, xmm7
-	pshufb xmm4, xmm6
-;	psubb xmm4, xmm2
-;	psrlw xmm4, 8
-	pavgb xmm2, xmm4
-
-	movdqa xmm5, xmm3
-	pshufb xmm3, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm3
-;	psrlw xmm5, 8
-	pavgb xmm3, xmm5
-
-	packuswb xmm0, xmm1
-	packuswb xmm2, xmm3
-	pavgb xmm0, xmm2
-
-	; write pDst
-	movdqa [edi], xmm0
-
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_sse4:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
-
-	sar ebp, $1		; iSrcHeight >> 1
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
-.xloops:
-	; horizonal loop: x16 bytes by source
-	;               mem  hi<-       ->lo
-	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
-	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movntdqa xmm0, [esi]			; 1st_src_line
-	movntdqa xmm1, [esi+ecx]		; 2nd_src_line
-
-	; packing & avg
-	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm2
-
-	movdqa xmm3, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm3, xmm6
-;	psubb xmm3, xmm1
-;	psrlw xmm3, 8
-	pavgb xmm1, xmm3
-
-	pavgb xmm0, xmm1
-	packuswb xmm0, xmm1
-
-	; write pDst
-	movq [edi], xmm0
-
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-
-
-
-
-WELS_EXTERN	GeneralBilinearAccurateDownsampler_sse2
-;**************************************************************************************************************
-;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-;							unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
-;                           unsigned int uiScaleX, unsigned int uiScaleY );
-;{
-;**************************************************************************************************************
-
-ALIGN 16
-GeneralBilinearAccurateDownsampler_sse2:
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-%define		pushsize	16
-%define		localsize	28
-%define		pDstData		esp + pushsize + localsize + 4
-%define		dwDstStride		esp + pushsize + localsize + 8
-%define		dwDstWidth		esp + pushsize + localsize + 12
-%define		dwDstHeight		esp + pushsize + localsize + 16
-%define		pSrcData		esp + pushsize + localsize + 20
-%define		dwSrcStride		esp + pushsize + localsize + 24
-%define		dwSrcWidth		esp + pushsize + localsize + 28
-%define		dwSrcHeight		esp + pushsize + localsize + 32
-%define		scale			esp + 0
-%define		uiScaleX			esp + pushsize + localsize + 36
-%define		uiScaleY			esp + pushsize + localsize + 40
-%define		tmpHeight		esp + 12
-%define		yInverse		esp + 16
-%define		xInverse		esp + 20
-%define		dstStep			esp + 24
-	sub		esp,			localsize
-
-	pxor	xmm0,	xmm0
-	mov		edx,	32767
-	mov		eax,	[uiScaleX]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm1,		eax						; uinc(uiScaleX mod 32767)
-	movd	xmm2,		ebx						; -uinc
-	psllq	xmm1,		32
-	por		xmm1,		xmm2					; 0 0  uinc  -uinc   (dword)
-	pshufd	xmm7,		xmm1,	01000100b		; xmm7: uinc -uinc uinc -uinc
-
-	mov		eax,	[uiScaleY]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm6,		eax						; vinc(uiScaleY mod 32767)
-	movd	xmm2,		ebx						; -vinc
-	psllq	xmm6,		32
-	por		xmm6,		xmm2					; 0 0 vinc -vinc (dword)
-	pshufd	xmm6,		xmm6,	01010000b		; xmm6: vinc vinc -vinc -vinc
-
-	mov		edx,		40003fffh
-	movd	xmm5,		edx
-	punpcklwd	xmm5,	xmm0					; 16384 16383
-	pshufd	xmm5,		xmm5,	01000100b		; xmm5: 16384 16383 16384 16383
-
-
-DOWNSAMPLE:
-
-	mov		eax,			[dwDstHeight]
-	mov		edi,			[pDstData]
-	mov		edx,			[dwDstStride]
-	mov		ecx,			[dwDstWidth]
-	sub		edx,			ecx
-	mov		[dstStep],	edx				; stride - width
-	dec		eax
-	mov		[tmpHeight],	eax
-	mov		eax,			16384
-	mov		[yInverse],		eax
-
-	pshufd	xmm4,		xmm5,	01010000b	; initial v to 16384 16384 16383 16383
-
-HEIGHT:
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	mov		ebp,	esi
-	add		ebp,	[dwSrcStride]
-
-	mov		eax,		16384
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	dec		ecx
-
-	movdqa	xmm3,		xmm5			; initial u to 16384 16383 16384 16383
-
-WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		15
-
-	movd	xmm1,		[esi+eax]		; xxxxxxba
-	movd	xmm2,		[ebp+eax]		; xxxxxxdc
-	pxor	xmm0,		xmm0
-	punpcklwd	xmm1,	xmm2			; xxxxdcba
-	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
-	punpcklwd	xmm1,	xmm0			; 000d000c000b000a
-
-	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
-	pmaddwd	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
-	movdqa	xmm0,	xmm2
-	pmuludq	xmm2,	xmm1
-	psrlq	xmm0,	32
-	psrlq	xmm1,	32
-	pmuludq	xmm0,	xmm1
-	paddq	xmm2,	xmm0
-	pshufd	xmm1,	xmm2,	00001110b
-	paddq	xmm2,	xmm1
-	psrlq	xmm2,	29
-
-	movd	eax,	xmm2
-	inc		eax
-	shr		eax,	1
-	mov		[edi],	al
-	inc		edi
-
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-
-	paddw	xmm3,		xmm7			; inc u
-	psllw	xmm3,		1
-	psrlw	xmm3,		1
-
-	loop	WIDTH
-
-WIDTH_END:
-	mov		eax,		[xInverse]
-	shr		eax,		15
-	mov		cl,			[esi+eax]
-	mov		[edi],		cl
-	inc		edi
-
-	mov		eax,		[uiScaleY]
-	add		[yInverse],	eax
-	add		edi,		[dstStep]
-
-	paddw	xmm4,	xmm6				; inc v
-	psllw	xmm4,	1
-	psrlw	xmm4,	1
-
-	dec		dword [tmpHeight]
-	jg		HEIGHT
-
-
-LAST_ROW:
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-
-	mov		eax,		16384
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-
-LAST_ROW_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		15
-
-	mov		al,			[esi+eax]
-	mov		[edi],	al
-	inc		edi
-
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-
-	loop	LAST_ROW_WIDTH
-
-LAST_ROW_END:
-
-	add		esp,			localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		pushsize
-%undef		localsize
-%undef		pSrcData
-%undef		dwSrcWidth
-%undef		dwSrcHeight
-%undef		dwSrcStride
-%undef		pDstData
-%undef		dwDstWidth
-%undef		dwDstHeight
-%undef		dwDstStride
-%undef		scale
-%undef		uiScaleX
-%undef		uiScaleY
-%undef		tmpHeight
-%undef		yInverse
-%undef		xInverse
-%undef		dstStep
-	ret
-
-
-
-
-WELS_EXTERN	GeneralBilinearFastDownsampler_sse2
-;**************************************************************************************************************
-;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-;				unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
-;               unsigned int uiScaleX, unsigned int uiScaleY );
-;{
-;**************************************************************************************************************
-
-ALIGN 16
-GeneralBilinearFastDownsampler_sse2:
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-%define		pushsize	16
-%define		localsize	28
-%define		pDstData		esp + pushsize + localsize + 4
-%define		dwDstStride		esp + pushsize + localsize + 8
-%define		dwDstWidth		esp + pushsize + localsize + 12
-%define		dwDstHeight		esp + pushsize + localsize + 16
-%define		pSrcData		esp + pushsize + localsize + 20
-%define		dwSrcStride		esp + pushsize + localsize + 24
-%define		dwSrcWidth		esp + pushsize + localsize + 28
-%define		dwSrcHeight		esp + pushsize + localsize + 32
-%define		scale			esp + 0
-%define		uiScaleX			esp + pushsize + localsize + 36
-%define		uiScaleY			esp + pushsize + localsize + 40
-%define		tmpHeight		esp + 12
-%define		yInverse		esp + 16
-%define		xInverse		esp + 20
-%define		dstStep			esp + 24
-	sub		esp,			localsize
-
-	pxor	xmm0,	xmm0
-	mov		edx,	65535
-	mov		eax,	[uiScaleX]
-	and		eax,	edx
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	65535
-	movd	xmm1,		eax						; uinc(uiScaleX mod 65536)
-	movd	xmm2,		ebx						; -uinc
-	psllq	xmm1,		32
-	por		xmm1,		xmm2					; 0 uinc 0 -uinc
-	pshuflw	xmm7,		xmm1,	10001000b		; xmm7: uinc -uinc uinc -uinc
-
-	mov		eax,	[uiScaleY]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm6,		eax						; vinc(uiScaleY mod 32767)
-	movd	xmm2,		ebx						; -vinc
-	psllq	xmm6,		32
-	por		xmm6,		xmm2					; 0 vinc 0 -vinc
-	pshuflw	xmm6,		xmm6,	10100000b		; xmm6: vinc vinc -vinc -vinc
-
-	mov		edx,		80007fffh				; 32768 32767
-	movd	xmm5,		edx
-	pshuflw	xmm5,		xmm5,		01000100b	; 32768 32767 32768 32767
-	mov		ebx,		16384
-
-
-FAST_DOWNSAMPLE:
-
-	mov		eax,			[dwDstHeight]
-	mov		edi,			[pDstData]
-	mov		edx,			[dwDstStride]
-	mov		ecx,			[dwDstWidth]
-	sub		edx,			ecx
-	mov		[dstStep],	edx				; stride - width
-	dec		eax
-	mov		[tmpHeight],	eax
-	mov		eax,		16384
-	mov		[yInverse],		eax
-
-	pshuflw	xmm4,		xmm5,	01010000b
-	psrlw	xmm4,		1				; initial v to 16384 16384 16383 16383
-
-FAST_HEIGHT:
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	mov		ebp,	esi
-	add		ebp,	[dwSrcStride]
-
-	mov		eax,		32768
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	dec		ecx
-
-	movdqa	xmm3,		xmm5			; initial u to 32768 32767 32768 32767
-
-FAST_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		16
-
-	movd	xmm1,		[esi+eax]		; xxxxxxba
-	movd	xmm2,		[ebp+eax]		; xxxxxxdc
-	punpcklwd	xmm1,	xmm2			; xxxxdcba
-	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
-
-	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
-	pmulhuw	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
-	pmaddwd		xmm2,	xmm1
-	pshufd	xmm1,	xmm2,	00000001b
-	paddd	xmm2,	xmm1
-	movd	xmm1,	ebx
-	paddd	xmm2,	xmm1
-	psrld	xmm2,	15
-
-	packuswb	xmm2,	xmm0
-	movd	eax,	xmm2
-	mov		[edi],	al
-	inc		edi
-
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-
-	paddw	xmm3,		xmm7			; inc u
-
-	loop	FAST_WIDTH
-
-FAST_WIDTH_END:
-	mov		eax,		[xInverse]
-	shr		eax,		16
-	mov		cl,			[esi+eax]
-	mov		[edi],		cl
-	inc		edi
-
-	mov		eax,		[uiScaleY]
-	add		[yInverse],	eax
-	add		edi,		[dstStep]
-
-	paddw	xmm4,	xmm6				; inc v
-	psllw	xmm4,	1
-	psrlw	xmm4,	1
-
-	dec		dword [tmpHeight]
-	jg		FAST_HEIGHT
-
-
-FAST_LAST_ROW:
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-
-	mov		eax,		32768
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-
-FAST_LAST_ROW_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		16
-
-	mov		al,			[esi+eax]
-	mov		[edi],	al
-	inc		edi
-
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-
-	loop	FAST_LAST_ROW_WIDTH
-
-FAST_LAST_ROW_END:
-
-	add		esp,			localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		pushsize
-%undef		localsize
-%undef		pSrcData
-%undef		dwSrcWidth
-%undef		dwSrcHeight
-%undef		dwSrcStride
-%undef		pDstData
-%undef		dwDstWidth
-%undef		dwDstHeight
-%undef		dwDstStride
-%undef		scale
-%undef		uiScaleX
-%undef		uiScaleY
-%undef		tmpHeight
-%undef		yInverse
-%undef		xInverse
-%undef		dstStep
-	ret
\ No newline at end of file
--- a/processing/src/asm/intra_pred.asm
+++ /dev/null
@@ -1,145 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  intra_pred.asm
-;*
-;*  Abstract
-;*      sse2 function for intra predict operations
-;*
-;*  History
-;*      18/09/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "../../src/asm/asm_inc.asm"
-
-BITS 32
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata data
-%else
-SECTION .rodata align=16
-%endif
-
-
-align 16
-mmx_01bytes:		times 16	db 1
-
-;***********************************************************************
-; macros
-;***********************************************************************
-%macro  COPY_16_TIMES 2
-		movdqa		%2,	[%1-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
-%endmacro
-
-%macro  COPY_16_TIMESS 3
-		movdqa		%2,	[%1+%3-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
-%endmacro
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-;***********************************************************************
-; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-
-%macro SSE2_PRED_H_16X16_TWO_LINE 1
-    lea     eax,	[eax+ecx*2]
-
-    COPY_16_TIMES eax,	xmm0
-    movdqa  [edx+%1],	xmm0
-    COPY_16_TIMESS eax,	xmm0,	ecx
-    movdqa  [edx+%1+0x10],	xmm0
-%endmacro
-
-WELS_EXTERN WelsI16x16LumaPredH_sse2
-WelsI16x16LumaPredH_sse2:
-    mov     edx, [esp+4]    ; pred
-    mov     eax, [esp+8]	; pRef
-    mov     ecx, [esp+12]   ; stride
-
-    COPY_16_TIMES eax,	xmm0
-    movdqa  [edx],		xmm0
-    COPY_16_TIMESS eax,	xmm0,	ecx
-    movdqa  [edx+0x10],	xmm0
-
-	SSE2_PRED_H_16X16_TWO_LINE   0x20
-	SSE2_PRED_H_16X16_TWO_LINE   0x40
-	SSE2_PRED_H_16X16_TWO_LINE   0x60
-	SSE2_PRED_H_16X16_TWO_LINE   0x80
-	SSE2_PRED_H_16X16_TWO_LINE   0xa0
-	SSE2_PRED_H_16X16_TWO_LINE   0xc0
-	SSE2_PRED_H_16X16_TWO_LINE   0xe0
-
-    ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredV_sse2
-WelsI16x16LumaPredV_sse2:
-    mov     edx, [esp+4]    ; pred
-    mov     eax, [esp+8]	; pRef
-    mov     ecx, [esp+12]   ; stride
-
-    sub     eax, ecx
-    movdqa  xmm0, [eax]
-
-    movdqa  [edx], xmm0
-    movdqa  [edx+10h], xmm0
-    movdqa  [edx+20h], xmm0
-    movdqa  [edx+30h], xmm0
-    movdqa  [edx+40h], xmm0
-    movdqa  [edx+50h], xmm0
-    movdqa  [edx+60h], xmm0
-    movdqa  [edx+70h], xmm0
-    movdqa  [edx+80h], xmm0
-    movdqa  [edx+90h], xmm0
-    movdqa  [edx+160], xmm0
-	movdqa  [edx+176], xmm0
-    movdqa  [edx+192], xmm0
-    movdqa  [edx+208], xmm0
-    movdqa  [edx+224], xmm0
-    movdqa  [edx+240], xmm0
-
-    ret
\ No newline at end of file
--- a/processing/src/asm/sad.asm
+++ /dev/null
@@ -1,216 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  pixel_sse2.asm
-;*
-;*  Abstract
-;*      WelsSampleSad8x8_sse21
-;*
-;*  History
-;*      8/5/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-%macro SAD_8x4 0
-	movq   xmm0,   [eax]
-	movq   xmm1,   [eax+ebx]
-	lea    eax,    [eax+2*ebx]
-	movhps xmm0,   [eax]
-	movhps xmm1,   [eax+ebx]
-
-	movq   xmm2,   [ecx]
-	movq   xmm3,   [ecx+edx]
-	lea    ecx,    [ecx+2*edx]
-	movhps xmm2,   [ecx]
-	movhps xmm3,   [ecx+edx]
-	psadbw xmm0,   xmm2
-	psadbw xmm1,   xmm3
-	paddw  xmm6,   xmm0
-	paddw  xmm6,   xmm1
-%endmacro
-
-
-
-%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
-and    %1,  0x1f|(%3>>1)
-cmp    %1,  (32-%2)|(%3>>1)
-%endmacro
-
-
-%macro SSE2_GetSad8x4 0
-	movq   xmm0,   [eax]
-	movq   xmm1,   [eax+ebx]
-	lea    eax,    [eax+2*ebx]
-	movhps xmm0,   [eax]
-	movhps xmm1,   [eax+ebx]
-
-	movq   xmm2,   [ecx]
-	movq   xmm3,   [ecx+edx]
-	lea    ecx,    [ecx+2*edx]
-	movhps xmm2,   [ecx]
-	movhps xmm3,   [ecx+edx]
-	psadbw xmm0,   xmm2
-	psadbw xmm1,   xmm3
-	paddw  xmm6,   xmm0
-	paddw  xmm6,   xmm1
-%endmacro
-
-
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-WELS_EXTERN WelsSampleSad8x8_sse21
-WelsSampleSad8x8_sse21:
-    mov    ecx,    [esp+12]
-	mov    edx,    ecx
-    CACHE_SPLIT_CHECK edx, 8, 64
-	jle    near   .pixel_sad_8x8_nsplit
-	push   ebx
-	push   edi
-	mov    eax,    [esp+12]
-	mov    ebx,    [esp+16]
-
-    pxor   xmm7,   xmm7
-
-    mov    edi,    ecx
-    and    edi,    0x07
-    sub    ecx,    edi
-    mov    edx,    8
-    sub    edx,    edi
-
-    shl    edi,    3
-    shl    edx,    3
-    movd   xmm5,   edi
-    movd   xmm6,   edx
-	mov    edi,    8
-	add    edi,    ecx
-    mov    edx,    [esp+24]
-
-    movq   xmm0,   [eax]
-	movhps xmm0,   [eax+ebx]
-
-	movq   xmm1,   [ecx]
-	movq   xmm2,   [edi]
-	movhps xmm1,   [ecx+edx]
-	movhps xmm2,   [edi+edx]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-	lea    edi,    [edi+2*edx]
-
-    movq   xmm0,   [eax]
-	movhps xmm0,   [eax+ebx]
-
-	movq   xmm1,   [ecx]
-	movq   xmm2,   [edi]
-	movhps xmm1,   [ecx+edx]
-	movhps xmm2,   [edi+edx]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-	lea    edi,    [edi+2*edx]
-
-    movq   xmm0,   [eax]
-	movhps xmm0,   [eax+ebx]
-
-	movq   xmm1,   [ecx]
-	movq   xmm2,   [edi]
-	movhps xmm1,   [ecx+edx]
-	movhps xmm2,   [edi+edx]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-	lea    edi,    [edi+2*edx]
-
-    movq   xmm0,   [eax]
-	movhps xmm0,   [eax+ebx]
-
-	movq   xmm1,   [ecx]
-	movq   xmm2,   [edi]
-	movhps xmm1,   [ecx+edx]
-	movhps xmm2,   [edi+edx]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-    movhlps    xmm0, xmm7
-	paddw      xmm0, xmm7
-	movd       eax,  xmm0
-	pop        edi
-	jmp        .return
-.pixel_sad_8x8_nsplit:
-    push   ebx
-    mov    eax,    [esp+8]
-	mov    ebx,    [esp+12]
-	mov    edx,    [esp+20]
-	pxor   xmm6,   xmm6
-	SSE2_GetSad8x4
-    lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]
-    SSE2_GetSad8x4
-    movhlps    xmm0, xmm6
-	paddw      xmm0, xmm6
-	movd       eax,  xmm0
-.return:
-	pop        ebx
-	ret
\ No newline at end of file
--- a/processing/src/asm/vaa.asm
+++ /dev/null
@@ -1,1589 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2010-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*	vaa.asm
-;*
-;*	Abstract
-;*      sse2 for pVaa routines
-;*
-;*  History
-;*      04/14/2010	Created
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;%macro SUM_SSE2	4	; dst, pSrc, zero, pack1_8x2
-;	movdqa %1, %2
-;	punpcklbw %1, %3
-;	punpckhbw %2, %3
-;	paddw %1, %2
-;	pmaddwd %1, %4
-;	pshufd %2, %1, 04Eh	; 01001110 B
-;	paddd %1, %2
-;	pshufd %2, %1, 0B1h	; 10110001 B
-;	paddd %1, %2
-;%endmacro	; END OF SUM_SSE2
-
-; Horizontal add of the eight 16-bit words held in %1, using SSE2 shuffles only.
-; On exit word 0 of %1 holds the total (mod 2^16); %2 is scratch and is clobbered.
-; Per the original note, this sequence was compared against a phaddw (SSSE3)
-; sequence and found to perform better.
-%macro SUM_WORD_8x2_SSE2	2	; dst(pSrc), tmp
-	; @sum_8x2 begin
-	pshufd %2, %1, 04Eh	; 01001110 B: swap the two 64-bit halves
-	paddw %1, %2		; word i = w[i] + w[i+4]
-	pshuflw %2, %1, 04Eh	; 01001110 B: swap the two dwords of the low half
-	paddw %1, %2
-	pshuflw %2, %1, 0B1h	; 10110001 B: swap adjacent words of the low half
-	paddw %1, %2		; word 0 now holds the sum of all eight words
-	; end of @sum_8x2
-%endmacro	; END of SUM_WORD_8x2_SSE2
-
-; Sum of squares of the 16 unsigned bytes in %2.
-;   %1 = destination; on exit every dword of %1 holds the 32-bit total
-;   %2 = source bytes; clobbered
-;   %3 = register that must contain all zeros (used to widen bytes to words)
-%macro SUM_SQR_SSE2	3	; dst, pSrc, zero
-	movdqa %1, %2
-	punpcklbw %1, %3	; low 8 bytes -> 8 words
-	punpckhbw %2, %3	; high 8 bytes -> 8 words
-	pmaddwd %1, %1		; square each word and add adjacent pairs -> 4 dwords
-	pmaddwd %2, %2
-	paddd %1, %2
-	pshufd %2, %1, 04Eh	; 01001110 B
-	paddd %1, %2
-	pshufd %2, %1, 0B1h	; 10110001 B
-	paddd %1, %2		; all four dwords now equal the full sum
-%endmacro	; END OF SUM_SQR_SSE2
-
-; Averages of the four 4x4 pixel blocks inside a 16-wide, 4-row strip (SSE2).
-; Inputs (set up by the caller):
-;   esi  = address of row 0 (movdqa: must be 16-byte aligned, as must each row)
-;   ecx  = offset of row 1, ebx = offset of row 2, edx = offset of row 3
-;   xmm7 = all zeros
-; Output: the low four words of %1 hold sum/16 for columns 0-3, 4-7, 8-11
-; and 12-15 respectively.  %2..%6 are scratch and are clobbered.
-%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
-	movdqa %1, [esi    ]	; line 0
-	movdqa %2, [esi+ecx]	; line 1
-	movdqa %3, %1
-	punpcklbw %1, xmm7
-	punpckhbw %3, xmm7
-	movdqa %4, %2
-	punpcklbw %4, xmm7
-	punpckhbw %2, xmm7
-	paddw %1, %4		; columns 0-7,  line 0 + line 1
-	paddw %2, %3		; columns 8-15, line 0 + line 1
-	movdqa %3, [esi+ebx]	; line 2
-	movdqa %4, [esi+edx]	; line 3
-	movdqa %5, %3
-	punpcklbw %3, xmm7
-	punpckhbw %5, xmm7
-	movdqa %6, %4
-	punpcklbw %6, xmm7
-	punpckhbw %4, xmm7
-	paddw %3, %6		; columns 0-7,  line 2 + line 3
-	paddw %4, %5		; columns 8-15, line 2 + line 3
-	paddw %1, %3	; block 0, 1
-	paddw %2, %4	; block 2, 3
-	pshufd %3, %1, 0B1h	; swap adjacent dwords
-	pshufd %4, %2, 0B1h
-	paddw %1, %3
-	paddw %2, %4
-	movdqa %3, %1
-	movdqa %4, %2
-	pshuflw %5, %1, 0B1h	; swap adjacent words, low half only
-	pshufhw %6, %3, 0B1h	; swap adjacent words, high half only
-	paddw %1, %5		; low words of %1: sum of columns 0-3
-	paddw %3, %6		; high words of %3: sum of columns 4-7
-	pshuflw %5, %2, 0B1h
-	pshufhw %6, %4, 0B1h
-	paddw %2, %5		; low words of %2: sum of columns 8-11
-	paddw %4, %6		; high words of %4: sum of columns 12-15
-	punpcklwd %1, %2
-	punpckhwd %3, %4
-	punpcklwd %1, %3	; low 4 words: block 0, block 1, block 2, block 3
-	psraw %1, $4		; divide each 16-pixel sum by 16
-%endmacro
-
-; SSSE3 variant of VAA_AVG_BLOCK_SSE2: same inputs (esi row base; ecx, ebx, edx
-; offsets of rows 1..3; xmm7 = zero) and same result in the low four words of %1
-; (sum/16 of each 4x4 block); the upper four words of %1 are zero on exit.
-; %2..%6 are scratch.  The horizontal reduction uses phaddw instead of shuffles.
-%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
-	movdqa %1, [esi    ]	; line 0
-	movdqa %2, [esi+ecx]	; line 1
-	movdqa %3, %1
-	punpcklbw %1, xmm7
-	punpckhbw %3, xmm7
-	movdqa %4, %2
-	punpcklbw %4, xmm7
-	punpckhbw %2, xmm7
-	paddw %1, %4		; columns 0-7,  line 0 + line 1
-	paddw %2, %3		; columns 8-15, line 0 + line 1
-	movdqa %3, [esi+ebx]	; line 2
-	movdqa %4, [esi+edx]	; line 3
-	movdqa %5, %3
-	punpcklbw %3, xmm7
-	punpckhbw %5, xmm7
-	movdqa %6, %4
-	punpcklbw %6, xmm7
-	punpckhbw %4, xmm7
-	paddw %3, %6		; columns 0-7,  line 2 + line 3
-	paddw %4, %5		; columns 8-15, line 2 + line 3
-	paddw %1, %3	; block 0, 1
-	paddw %2, %4	; block 2, 3
-	phaddw %1, %2	; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
-	phaddw %1, xmm7	; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
-	psraw %1, $4		; divide each 16-pixel sum by 16
-%endmacro
-
-; SAD of two 16-byte rows.
-;   esi, edi = the two row pointers being compared (movdqa: 16-byte aligned)
-;   ebx      = stride applied to both pointers
-;   xmm6     = accumulator: dword 0 += SAD of bytes 0-7, dword 2 += SAD of bytes 8-15
-; Clobbers xmm1-xmm4 and advances esi/edi by two rows.
-%macro WELS_SAD_16x2_SSE2  0
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	movdqa	xmm3,	[esi+ebx]
-	movdqa	xmm4,	[edi+ebx]
-	psadbw	xmm1,	xmm2
-	psadbw	xmm3,	xmm4
-	paddd	xmm6,	xmm1
-	paddd	xmm6,	xmm3
-	lea		esi,	[esi+ebx*2]
-	lea		edi,	[edi+ebx*2]
-%endmacro
-
-; One 16-byte row: accumulate SAD, pixel sum and squared-pixel sum.
-;   esi  = row whose sum / squared sum are taken; edi = row it is compared with
-;   ebx  = stride, xmm0 = all zeros
-;   xmm6 += SAD(esi row, edi row)        (dword 0: bytes 0-7, dword 2: bytes 8-15)
-;   xmm5 += sum of the esi row's bytes   (same two-dword layout)
-;   xmm4 += squares of the esi row's bytes (four dword partial sums)
-; Clobbers xmm1-xmm3 and advances esi/edi by one row.
-%macro	WELS_SAD_SUM_SQSUM_16x1_SSE2 0
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	movdqa	xmm3,	xmm1
-	psadbw	xmm3,	xmm2
-	paddd	xmm6,	xmm3		; sad
-
-	movdqa	xmm3,	xmm1
-	psadbw	xmm3,	xmm0		; SAD against zero = plain byte sum
-	paddd	xmm5,	xmm3		; sum
-
-	movdqa		xmm2,	xmm1
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm2,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm2,	xmm2
-	paddd		xmm4,	xmm1
-	paddd		xmm4,	xmm2	; sqsum
-
-	add		esi,	ebx
-	add		edi,	ebx
-%endmacro
-
-; One 16-byte row: accumulate SAD, pixel sum, squared-pixel sum and squared difference.
-;   esi  = row whose sum / squared sum are taken; edi = row it is compared with
-;   ebx  = stride, xmm0 = all zeros
-;   xmm7 += SAD                            (dword 0: bytes 0-7, dword 2: bytes 8-15)
-;   xmm6 += sum of the esi row's bytes     (same two-dword layout)
-;   xmm5 += squares of the esi row's bytes (four dword partial sums)
-;   xmm4 += squares of |esi row - edi row| (four dword partial sums)
-; Clobbers xmm1-xmm3 and advances esi/edi by one row.
-%macro	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 0
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	movdqa	xmm3,	xmm1
-	psadbw	xmm3,	xmm2
-	paddd	xmm7,	xmm3	; sad
-
-	movdqa	xmm3,	xmm1
-	pmaxub	xmm3,	xmm2
-	pminub	xmm2,	xmm1
-	psubb	xmm3,	xmm2	; diff
-
-	movdqa	xmm2,	xmm1
-	psadbw	xmm2,	xmm0
-	paddd	xmm6,	xmm2	; sum
-
-	movdqa		xmm2,	xmm1
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm2,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm2,	xmm2
-	paddd		xmm5,	xmm1
-	paddd		xmm5,	xmm2	; sqsum
-
-	movdqa		xmm1,	xmm3
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm3,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm3,	xmm3
-	paddd		xmm4,	xmm1
-	paddd		xmm4,	xmm3	; sqdiff
-
-	add		esi,	ebx
-	add		edi,	ebx
-%endmacro
-
-; One 16-byte row: accumulate SAD, the byte sums of both rows, and the running
-; per-byte maximum absolute difference.
-;   %1 sad_reg     += SAD(esi row, edi row)    (dword 0: bytes 0-7, dword 2: bytes 8-15)
-;   %2 sum_cur_reg += byte sum of the esi row  (same layout)
-;   %3 sum_ref_reg += byte sum of the edi row  (same layout)
-;   %4 mad_reg      = bytewise max(mad_reg, |esi row - edi row|)
-; Requires xmm0 = all zeros and ebx = stride; clobbers xmm1-xmm3 and advances
-; esi/edi by one row.  The %define aliases below are not %undef'd by the macro.
-%macro	WELS_SAD_SD_MAD_16x1_SSE2	4
-%define sad_reg			%1
-%define	sum_cur_reg		%2
-%define sum_ref_reg		%3
-%define	mad_reg			%4
-	movdqa	xmm1,		[esi]
-	movdqa	xmm2,		[edi]
-	movdqa	xmm3,		xmm1
-	psadbw	xmm3,		xmm0
-	paddd	sum_cur_reg,			xmm3	; sum_cur
-	movdqa	xmm3,		xmm2
-	psadbw	xmm3,		xmm0
-	paddd	sum_ref_reg,			xmm3	; sum_ref
-
-	movdqa	xmm3,		xmm1
-	pmaxub	xmm3,		xmm2
-	pminub	xmm2,		xmm1
-	psubb	xmm3,		xmm2	; abs diff
-	pmaxub	mad_reg,	xmm3	; max abs diff
-
-	psadbw	xmm3,		xmm0	; summing the abs diffs gives the SAD
-	paddd	sad_reg,	xmm3	; sad
-
-	add			esi,		ebx
-	add			edi,		ebx
-%endmacro
-
-
-; Horizontal unsigned-byte maximum within each 64-bit half of %1.
-; On exit byte 0 of %1 = max(bytes 0..7) and byte 8 = max(bytes 8..15); the
-; other bytes hold partial maxima.  Only xmm1 is actually used as scratch here.
-%macro	WELS_MAX_REG_SSE2	1	; xmm1, xmm2, xmm3 can be used
-%define max_reg  %1
-	movdqa	xmm1,		max_reg
-	psrldq	xmm1,		4	; byte i = max(b[i], b[i+4])
-	pmaxub	max_reg,	xmm1
-	movdqa	xmm1,		max_reg
-	psrldq	xmm1,		2	; fold in b[i+2], b[i+6]
-	pmaxub	max_reg,	xmm1
-	movdqa	xmm1,		max_reg
-	psrldq	xmm1,		1	; fold in the odd neighbours
-	pmaxub	max_reg,	xmm1
-%endmacro
-
-; One 16-byte row: accumulate SAD, squared sum, both byte sums, max abs diff and
-; squared difference, packing two quantities into each of sad_reg and sum_reg.
-;   %1 sad_reg    : dwords 0/2 += SAD (bytes 0-7 / 8-15);
-;                   dwords 1/3 += squared-sum partials of the esi row
-;   %2 sum_reg    : dwords 0/2 += byte sum of the esi row;
-;                   dwords 1/3 += byte sum of the edi row
-;   %3 mad_reg    : bytewise max(mad_reg, |esi row - edi row|)
-;   %4 sqdiff_reg : += squares of |esi row - edi row| (four dword partial sums)
-; Requires xmm0 = all zeros and ebx = stride; clobbers xmm1-xmm3 and advances
-; esi/edi by one row.  The %define aliases below are not %undef'd by the macro.
-%macro	WELS_SAD_BGD_SQDIFF_16x1_SSE2	4
-%define sad_reg		%1
-%define	sum_reg		%2
-%define mad_reg		%3
-%define sqdiff_reg	%4
-	movdqa		xmm1,		[esi]
-	movdqa		xmm2,		xmm1
-	movdqa		xmm3,		xmm1
-	punpcklbw	xmm2,		xmm0
-	punpckhbw	xmm3,		xmm0
-	pmaddwd		xmm2,		xmm2
-	pmaddwd		xmm3,		xmm3
-	paddd		xmm2,		xmm3
-	movdqa		xmm3,		xmm2
-	psllq		xmm2,		32	; move each qword's low dword into its high dword
-	psrlq		xmm3,		32
-	psllq		xmm3,		32	; keep only each qword's high dword
-	paddd		xmm2,		xmm3	; dwords 1 and 3 hold the sums, dwords 0 and 2 are zero
-	paddd		sad_reg,	xmm2		; sqsum
-
-	movdqa	xmm2,		[edi]
-	movdqa	xmm3,		xmm1
-	psadbw	xmm3,		xmm0
-	paddd	sum_reg,			xmm3	; sum_cur
-	movdqa	xmm3,		xmm2
-	psadbw	xmm3,		xmm0
-	pslldq	xmm3,		4		; shift into dwords 1 and 3
-	paddd	sum_reg,			xmm3	; sum_ref
-
-	movdqa	xmm3,		xmm1
-	pmaxub	xmm3,		xmm2
-	pminub	xmm2,		xmm1
-	psubb	xmm3,		xmm2	; abs diff
-	pmaxub	mad_reg,	xmm3	; max abs diff
-
-	movdqa	xmm1,		xmm3
-	psadbw	xmm3,		xmm0
-	paddd	sad_reg,	xmm3	; sad
-
-	movdqa		xmm3,	xmm1
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm3,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm3,	xmm3
-	paddd		sqdiff_reg,	xmm1
-	paddd		sqdiff_reg,	xmm3	; sqdiff
-
-	add		esi,	ebx
-	add		edi,	ebx
-%endmacro
-
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-;SECTION .rodata align=16
-
-;ALIGN 16
-;pack1_8x2:
-;	dw 1, 1, 1, 1, 1, 1, 1, 1
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN rc_sad_frame_sse2
-;***********************************************************************
-;	uint32_t rc_sad_frame_sse2(	uint8_t *ref_orig, uint8_t *cur_orig, const int mb_width, const int iPicHeight, const int iPicStride );
-;
-;	Returns the sum of absolute differences between the two planes.
-;	Each of the iPicHeight rows is processed as mb_width chunks of 16 bytes;
-;	rows are iPicStride bytes apart.  Rows are read with movdqa, so every
-;	row start must be 16-byte aligned.  mb_width and iPicHeight must be
-;	non-zero: both loops are do-while style.
-;***********************************************************************
-ALIGN 16
-rc_sad_frame_sse2:
-	push esi
-	push edi
-	push ebp
-	push ebx
-	push edx
-
-	mov esi, [esp+24]	; ref_orig (5 pushes + return address = 24)
-	mov edi, [esp+28]	; cur_orig
-	mov ebx, [esp+32]	; mb_width
-	mov ecx, [esp+36]	; iPicHeight (row counter)
-	mov edx, [esp+40]	; iPicStride
-	pxor xmm0, xmm0		; running SAD total in dword 0
-.hloop:
-	mov eax, ebx		; 16-byte chunks left in this row
-	mov ebp, $0		; byte offset within the row
-.wloop:
-	movdqa xmm1, [esi+ebp]
-	movdqa xmm2, [edi+ebp]
-	psadbw xmm1, xmm2
-	pshufd xmm2, xmm1, 0f6h	; 11110110 B ; movhlps for float
-	paddd xmm1, xmm2	; dword 0 = low-half SAD + high-half SAD
-	paddd xmm0, xmm1
-	add ebp, 010h
-	dec eax
-	jnz near .wloop
-	lea esi, [esi+edx]
-	lea edi, [edi+edx]
-	dec ecx
-	jnz near .hloop
-
-	movd eax, xmm0		; return value
-	pop edx
-	pop ebx
-	pop ebp
-	pop edi
-	pop esi
-	ret
-
-
-WELS_EXTERN SampleVariance16x16_sse2
-;***********************************************************************
-;   void SampleVariance16x16_sse2(	uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
-;
-;   Over a 16x16 block accumulates
-;     SUM     = sum |y_ref - y_src|        SQR     = sum |y_ref - y_src|^2
-;     SUM_CUR = sum y_src                  SQR_CUR = sum y_src^2
-;   and stores two 16-bit results through pMotionTexture:
-;     [pMotionTexture]   = (SQR >> 8)     - (SUM >> 8)^2       (uiMotionIndex)
-;     [pMotionTexture+2] = (SQR_CUR >> 8) - (SUM_CUR >> 8)^2   (uiTextureIndex)
-;   Rows are read with movdqa, so both row pointers must stay 16-byte aligned.
-;***********************************************************************
-ALIGN 16
-SampleVariance16x16_sse2:
-	push esi
-	push edi
-	push ebx
-
-	sub esp, 16		; four dword accumulators
-	%define SUM			[esp]
-	%define SUM_CUR		[esp+4]
-	%define SQR			[esp+8]
-	%define SQR_CUR		[esp+12]
-	%define PUSH_SIZE	28	; 12 + 16
-
-	mov edi, [esp+PUSH_SIZE+4]	; y_ref
-	mov edx, [esp+PUSH_SIZE+8]	; y_ref_stride
-	mov esi, [esp+PUSH_SIZE+12]	; y_src
-	mov eax, [esp+PUSH_SIZE+16]	; y_src_stride
-	mov ecx, 010h				; height = 16
-
-	pxor xmm7, xmm7
-	movdqu SUM, xmm7	; one 16-byte store zeroes all four accumulators
-
-.hloops:
-	movdqa xmm0, [edi]		; y_ref
-	movdqa xmm1, [esi]		; y_src
-	movdqa xmm2, xmm0		; store first for future process
-	movdqa xmm3, xmm1
-	; sum += diff;
-	movdqa xmm4, xmm0
-	psadbw xmm4, xmm1		; 2 parts, [0,..,15], [64,..,79]
-	; to be continued for sum
-	pshufd xmm5, xmm4, 0C6h	; 11000110 B: swap dwords 0 and 2
-	paddw xmm4, xmm5
-	movd ebx, xmm4
-	add SUM, ebx
-
-	; sqr += diff * diff;
-	pmaxub xmm0, xmm1
-	pminub xmm1, xmm2
-	psubb xmm0, xmm1				; diff
-	SUM_SQR_SSE2 xmm1, xmm0, xmm7	; dst, pSrc, zero
-	movd ebx, xmm1
-	add SQR, ebx
-
-	; sum_cur += y_src[x];
-	movdqa xmm0, xmm3		; cur_orig
-	movdqa xmm1, xmm0
-	punpcklbw xmm0, xmm7
-	punpckhbw xmm1, xmm7
-	paddw xmm0, xmm1		; 8x2
-	SUM_WORD_8x2_SSE2 xmm0, xmm1
-	movd ebx, xmm0
-	and ebx, 0ffffh		; only word 0 carries the row sum
-	add SUM_CUR, ebx
-
-	; sqr_cur += y_src[x] * y_src[x];
-	SUM_SQR_SSE2 xmm0, xmm3, xmm7	; dst, pSrc, zero
-	movd ebx, xmm0
-	add SQR_CUR, ebx
-
-	lea edi, [edi+edx]
-	lea esi, [esi+eax]
-	dec ecx
-	jnz near .hloops
-
-	mov ebx, 0
-	mov bx, word SUM	; only the low 16 bits of SUM are used
-	sar ebx, 8		; / 256 pixels
-	imul ebx, ebx
-	mov ecx, SQR
-	sar ecx, 8
-	sub ecx, ebx
-	mov edi, [esp+PUSH_SIZE+20]	; pMotionTexture
-	mov [edi], cx				; to store uiMotionIndex
-	mov ebx, 0
-	mov bx, word SUM_CUR	; only the low 16 bits of SUM_CUR are used
-	sar ebx, 8
-	imul ebx, ebx
-	mov ecx, SQR_CUR
-	sar ecx, 8
-	sub ecx, ebx
-	mov [edi+2], cx				; to store uiTextureIndex
-
-	%undef SUM
-	%undef SUM_CUR
-	%undef SQR
-	%undef SQR_CUR
-	%undef PUSH_SIZE
-
-	add esp, 16
-	pop ebx
-	pop edi
-	pop esi
-
-	ret
-
-; , 6/7/2010
-
-%ifndef NO_DYNAMIC_VP
-WELS_EXTERN AnalysisVaaInfoIntra_sse2
-;***********************************************************************
-;	int32_t AnalysisVaaInfoIntra_sse2(	uint8_t *pDataY, const int32_t linesize );
-;
-;	Splits the 16x16 block at pDataY into sixteen 4x4 blocks, takes the
-;	average of each (VAA_AVG_BLOCK_SSE2), and returns
-;	    sum(avg^2) - ((sum(avg))^2 >> 4)
-;	Rows are read with movdqa, so every row must be 16-byte aligned.
-;***********************************************************************
-ALIGN 16
-AnalysisVaaInfoIntra_sse2:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov ebp, esp
-	and ebp, 0fh		; ebp = misalignment of esp, kept to undo it at exit
-	sub esp, ebp		; esp is now 16-byte aligned
-	sub esp, 32		; scratch for the sixteen 16-bit block averages
-	%define PUSH_SIZE	52	; 20 + 32
-
-	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
-	mov ecx, [esp+ebp+PUSH_SIZE+8]	; linesize
-
-	mov ebx, ecx
-	sal ebx, $1			; linesize x 2 [ebx]
-	mov edx, ebx
-	add edx, ecx		; linesize x 3 [edx]
-	mov eax, ebx
-	sal eax, $1			; linesize x 4 [eax]
-
-	pxor xmm7, xmm7
-
-	; loops
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0	; averages of rows 0-3
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+8], xmm0	; averages of rows 4-7
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0	; averages of rows 8-11
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+24], xmm0	; averages of rows 12-15
-
-	movdqa xmm0, [esp]		; block 0~7
-	movdqa xmm1, [esp+16]	; block 8~15
-	movdqa xmm2, xmm0
-	paddw xmm0, xmm1
-	SUM_WORD_8x2_SSE2 xmm0, xmm3	; word 0 of xmm0 = sum of the 16 averages
-
-	pmullw xmm1, xmm1	; square each average (low 16 bits of the product)
-	pmullw xmm2, xmm2
-	movdqa xmm3, xmm1
-	movdqa xmm4, xmm2
-	punpcklwd xmm1, xmm7	; zero-extend the squares to dwords
-	punpckhwd xmm3, xmm7
-	punpcklwd xmm2, xmm7
-	punpckhwd xmm4, xmm7
-	paddd xmm1, xmm2
-	paddd xmm3, xmm4
-	paddd xmm1, xmm3
-	pshufd xmm2, xmm1, 01Bh	; reverse dword order
-	paddd xmm1, xmm2
-	pshufd xmm2, xmm1, 0B1h	; swap adjacent dwords
-	paddd xmm1, xmm2	; dword 0 = sum of the 16 squared averages
-
-	movd ebx, xmm0
-	and ebx, 0ffffh		; effective low word truncated
-	mov ecx, ebx
-	imul ebx, ecx
-	sar ebx, $4		; (sum of averages)^2 / 16
-	movd eax, xmm1
-	sub eax, ebx		; return value
-
-	%undef PUSH_SIZE
-	add esp, 32
-	add esp, ebp		; undo the alignment adjustment
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-WELS_EXTERN AnalysisVaaInfoIntra_ssse3
-;***********************************************************************
-;	int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t linesize );
-;
-;	SSSE3 variant: the per-strip 4x4 averages come from VAA_AVG_BLOCK_SSSE3;
-;	the final reduction is the same as in AnalysisVaaInfoIntra_sse2.  Returns
-;	    sum(avg^2) - ((sum(avg))^2 >> 4)
-;	over the sixteen 4x4 block averages of the 16x16 block at pDataY.
-;	Rows are read with movdqa, so every row must be 16-byte aligned.
-;***********************************************************************
-ALIGN 16
-AnalysisVaaInfoIntra_ssse3:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov ebp, esp
-	and ebp, 0fh		; ebp = misalignment of esp, kept to undo it at exit
-	sub esp, ebp		; esp is now 16-byte aligned
-	sub esp, 32		; scratch for the sixteen 16-bit block averages
-	%define PUSH_SIZE	52	; 20 + 32
-
-	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
-	mov ecx, [esp+ebp+PUSH_SIZE+8]	; linesize
-
-	mov ebx, ecx
-	sal ebx, $1			; linesize x 2 [ebx]
-	mov edx, ebx
-	add edx, ecx		; linesize x 3 [edx]
-	mov eax, ebx
-	sal eax, $1			; linesize x 4 [eax]
-
-	pxor xmm7, xmm7
-
-	; loops
-	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0	; averages of rows 0-3
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
-	movq [esp+8], xmm1	; averages of rows 4-7
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0	; averages of rows 8-11
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
-	movq [esp+24], xmm1	; averages of rows 12-15
-
-	movdqa xmm0, [esp]		; block 0~7
-	movdqa xmm1, [esp+16]	; block 8~15
-	movdqa xmm2, xmm0
-	paddw xmm0, xmm1
-	SUM_WORD_8x2_SSE2 xmm0, xmm3	; better performance than that of phaddw sets
-
-	pmullw xmm1, xmm1	; square each average (low 16 bits of the product)
-	pmullw xmm2, xmm2
-	movdqa xmm3, xmm1
-	movdqa xmm4, xmm2
-	punpcklwd xmm1, xmm7	; zero-extend the squares to dwords
-	punpckhwd xmm3, xmm7
-	punpcklwd xmm2, xmm7
-	punpckhwd xmm4, xmm7
-	paddd xmm1, xmm2
-	paddd xmm3, xmm4
-	paddd xmm1, xmm3
-	pshufd xmm2, xmm1, 01Bh	; reverse dword order
-	paddd xmm1, xmm2
-	pshufd xmm2, xmm1, 0B1h	; swap adjacent dwords
-	paddd xmm1, xmm2	; dword 0 = sum of the 16 squared averages
-
-	movd ebx, xmm0
-	and ebx, 0ffffh		; effective low word truncated
-	mov ecx, ebx
-	imul ebx, ecx
-	sar ebx, $4		; (sum of averages)^2 / 16
-	movd eax, xmm1
-	sub eax, ebx		; return value
-
-	%undef PUSH_SIZE
-	add esp, 32
-	add esp, ebp		; undo the alignment adjustment
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-%endif
-
-
-
-WELS_EXTERN abs_difference_mbrow_sse2
-;*************************************************************************************************************
-;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride,
-;								 int32_t gom_pixel_num, int32_t *pSum)
-;
-;	Adds to *pSum the SAD between ref_orig and cur_orig over 16 rows of
-;	gom_pixel_num bytes each (rows are iPicStride bytes apart).  Each row is
-;	consumed in 16-byte movdqa chunks, so row starts must be 16-byte aligned;
-;	at least one chunk per row is always processed (do-while loop).
-;*************************************************************************************************************
-ALIGN 16
-abs_difference_mbrow_sse2:
-%define		ref_orig			esp + pushsize + 4
-%define		cur_orig			esp + pushsize + 8
-%define		iPicStride			esp + pushsize + 12
-%define		gom_pixel_num		esp + pushsize + 16
-%define		pSum				esp + pushsize + 20
-%define		pushsize	12
-	push	esi
-	push	edi
-	push	ebx
-	mov		esi,	[ref_orig]
-	mov		edi,	[cur_orig]
-	mov		ebx,	[iPicStride]
-	mov		eax,	[gom_pixel_num]
-	mov		ecx,	16					;MB_WIDTH_LUMA
-	pxor	xmm0,	xmm0				; SAD accumulator (dwords 0 and 2)
-mb_width_loop_p:
-	mov		edx,	esi
-	add		edx,	eax			; end address
-gom_row_loop_p:
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	psadbw	xmm1,	xmm2
-	paddd	xmm0,	xmm1
-	add		esi,	16
-	add		edi,	16
-	cmp		esi,	edx
-	jb		gom_row_loop_p		; addresses are unsigned: use jb, not jl
-
-	sub		esi,	eax			; back to the start of the row ...
-	sub		edi,	eax
-	add		esi,	ebx			; ... then down one row
-	add		edi,	ebx
-	loop	mb_width_loop_p
-
-	movdqa	xmm1,	xmm0
-	psrldq	xmm1,	8
-	paddd	xmm1,	xmm0			; dword 0 = low-half SAD + high-half SAD
-	movd	eax,	xmm1
-	mov		edx,	[pSum]	; pSum
-	add		[edx],	eax
-
-%undef		ref_orig
-%undef		cur_orig
-%undef		iPicStride
-%undef		gom_pixel_num
-%undef		pSum
-%undef		pushsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	ret
-
-
-
-
-WELS_EXTERN sum_sqrsum_mbrow_sse2
-;*************************************************************************************************************
-;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride,
-;							 int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum)
-;
-;	Over 16 rows of gom_pixel_num bytes each (rows are iPicStride bytes apart)
-;	adds the byte sum to *pSum and the sum of squared bytes to *pSqrSum.
-;	Each row is consumed in 16-byte movdqa chunks, so row starts must be
-;	16-byte aligned; at least one chunk per row is always processed.
-;*************************************************************************************************************
-ALIGN 16
-sum_sqrsum_mbrow_sse2:
-%define		cur_orig			esp + pushsize + 4
-%define		iPicStride			esp + pushsize + 8
-%define		gom_pixel_num		esp + pushsize + 12
-%define		pSum				esp + pushsize + 16
-%define		pSqrSum				esp + pushsize + 20
-%define		pushsize			8
-	push		esi
-	push		ebx
-	mov			esi,	[cur_orig]
-	mov			eax,	[gom_pixel_num]
-	mov			ebx,	[iPicStride]
-	mov			ecx,	16					;MB_WIDTH_LUMA
-	pxor		xmm0,	xmm0				; zero
-	pxor		xmm1,	xmm1				; sum
-	pxor		xmm2,	xmm2				; sqr sum
-mb_width_loop_i:
-	mov			edx,	esi
-	add			edx,	eax			; end address
-gom_row_loop_i:
-	movdqa		xmm3,	[esi]
-	movdqa		xmm4,	xmm3
-	psadbw		xmm4,	xmm0			; SAD against zero = byte sum
-	paddd		xmm1,	xmm4
-	movdqa		xmm4,	xmm3
-	punpcklbw	xmm4,	xmm0
-	punpckhbw	xmm3,	xmm0
-	pmaddwd		xmm4,	xmm4
-	pmaddwd		xmm3,	xmm3
-	paddd		xmm2,	xmm3
-	paddd		xmm2,	xmm4
-	add			esi,	16
-	cmp			esi,	edx
-	jb			gom_row_loop_i		; addresses are unsigned: use jb, not jl
-
-	sub			esi,	eax			; back to the start of the row ...
-	add			esi,	ebx			; ... then down one row
-	loop		mb_width_loop_i
-
-	movdqa		xmm3,	xmm1
-	psrldq		xmm3,	8
-	paddd		xmm1,	xmm3			; dword 0 = total byte sum
-	movd		eax,	xmm1
-	mov			edx,	[pSum]
-	add			[edx],	eax
-
-	movdqa		xmm3,	xmm2
-	psrldq		xmm3,	8
-	paddd		xmm2,	xmm3
-	movdqa		xmm3,	xmm2
-	psrldq		xmm3,	4
-	paddd		xmm2,	xmm3			; dword 0 = total of the four partial sums
-	movd		eax,	xmm2
-	mov			edx,	[pSqrSum]
-	add			[edx],	eax
-
-
-%undef		cur_orig
-%undef		iPicStride
-%undef		gom_pixel_num
-%undef		pSum
-%undef		pSqrSum
-%undef		pushsize
-	pop			ebx
-	pop			esi
-	ret
-
-
-
-WELS_EXTERN VAACalcSad_sse2
-;*************************************************************************************************************
-;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-;								int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
-;
-;	Walks the picture in 16x16 macroblocks (iPicWidth/16 by iPicHeight/16).
-;	For each macroblock four dwords are written to psad8x8: the SADs of the
-;	top-left, top-right, bottom-left and bottom-right 8x8 quadrants.
-;	*psadframe receives the sum of all of them.
-;	The iPicWidth / iPicHeight stack slots are modified in place.
-;	Rows are read with movdqa, so every row must be 16-byte aligned.
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSad_sse2:
-%define		cur_data			esp + pushsize + 4
-%define		ref_data			esp + pushsize + 8
-%define		iPicWidth			esp + pushsize + 12
-%define		iPicHeight			esp + pushsize + 16
-%define		iPicStride			esp + pushsize + 20
-%define		psadframe			esp + pushsize + 24
-%define		psad8x8				esp + pushsize + 28
-%define		pushsize	12
-	push	esi
-	push	edi
-	push	ebx
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		edx,	[psad8x8]
-	mov		eax,	ebx
-
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4								; iPicStride*16
-	pxor	xmm0,	xmm0		; not read anywhere in this routine
-	pxor	xmm7,	xmm7		; iFrameSad
-height_loop:
-	mov		ecx,	dword [iPicWidth]
-	push	esi			; remember the start of this macroblock row
-	push	edi			; (the two pushes cancel out before the stack args are used again)
-width_loop:
-	pxor	xmm6,	xmm6		;
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx],		xmm6	; top-left 8x8 SAD
-	psrldq	xmm6,		8
-	movd	[edx+4],	xmm6	; top-right 8x8 SAD
-
-	pxor	xmm6,	xmm6
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx+8],	xmm6	; bottom-left 8x8 SAD
-	psrldq	xmm6,		8
-	movd	[edx+12],	xmm6	; bottom-right 8x8 SAD
-
-	add		edx,	16
-	sub		esi,	eax		; back up the 16 rows just consumed ...
-	sub		edi,	eax
-	add		esi,	16		; ... and step right to the next macroblock
-	add		edi,	16
-
-	dec		ecx
-	jnz		width_loop
-
-	pop		edi
-	pop		esi
-	add		esi,	eax		; down one macroblock row
-	add		edi,	eax
-
-	dec	dword [iPicHeight]
-	jnz		height_loop
-
-	mov		edx,	[psadframe]
-	movdqa	xmm5,	xmm7
-	psrldq	xmm7,	8
-	paddd	xmm7,	xmm5		; dword 0 = low-half total + high-half total
-	movd	[edx],	xmm7
-
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		pushsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	ret
-
-
-WELS_EXTERN VAACalcSadVar_sse2
-;*************************************************************************************************************
-;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-;		int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
-;
-;	Walks the picture in 16x16 macroblocks (iPicWidth/16 by iPicHeight/16).
-;	Per macroblock it writes
-;	  psad8x8[0..3]  : SAD of the top-left, top-right, bottom-left, bottom-right 8x8
-;	  psum16x16[0]   : sum of the cur_data pixels
-;	  psqsum16x16[0] : sum of the squared cur_data pixels
-;	and finally stores the total SAD in *psadframe.
-;	The iPicWidth, iPicHeight, psum16x16 and psqsum16x16 stack slots are
-;	modified in place.  Rows are read with movdqa (16-byte alignment required).
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadVar_sse2:
-%define		localsize		8
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		psum16x16			esp + pushsize + localsize + 32
-%define		psqsum16x16			esp + pushsize + localsize + 36
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		edx,	[psad8x8]
-	mov		eax,	ebx
-
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	pxor	xmm0,	xmm0		; zero, required by the row macro
-	pxor	xmm7,	xmm7		; iFrameSad
-var_height_loop:
-	mov		ecx,	dword [iPicWidth]
-	mov		[tmp_esi],	esi	; remember the start of this macroblock row
-	mov		[tmp_edi],	edi
-var_width_loop:
-	pxor	xmm6,	xmm6		; hiQuad_loQuad pSad8x8
-	pxor	xmm5,	xmm5		; pSum16x16
-	pxor	xmm4,	xmm4		; sqsum_16x16
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx],		xmm6	; top-left 8x8 SAD
-	psrldq	xmm6,		8
-	movd	[edx+4],	xmm6	; top-right 8x8 SAD
-
-	pxor	xmm6,	xmm6
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx+8],	xmm6	; bottom-left 8x8 SAD
-	psrldq	xmm6,		8
-	movd	[edx+12],	xmm6	; bottom-right 8x8 SAD
-
-	mov		ebp,	[psum16x16]
-	movdqa	xmm1,	xmm5
-	psrldq	xmm1,	8
-	paddd	xmm5,	xmm1		; dword 0 = 16x16 pixel sum
-	movd	[ebp],	xmm5
-	add		dword [psum16x16], 4	; advance the output pointer kept in the stack slot
-
-	movdqa	xmm5,	xmm4
-	psrldq	xmm5,	8
-	paddd	xmm4,	xmm5
-	movdqa	xmm3,	xmm4
-	psrldq	xmm3,	4
-	paddd	xmm4,	xmm3		; dword 0 = 16x16 squared-pixel sum
-
-	mov		ebp,	[psqsum16x16]
-	movd	[ebp],	xmm4
-	add		dword [psqsum16x16], 4
-
-	add		edx,	16
-	sub		esi,	eax		; back up the 16 rows just consumed ...
-	sub		edi,	eax
-	add		esi,	16		; ... and step right to the next macroblock
-	add		edi,	16
-
-	dec		ecx
-	jnz		var_width_loop
-
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax		; down one macroblock row
-	add		edi,	eax
-
-	dec	dword [iPicHeight]
-	jnz		var_height_loop
-
-	mov		edx,	[psadframe]
-	movdqa	xmm5,	xmm7
-	psrldq	xmm7,	8
-	paddd	xmm7,	xmm5		; dword 0 = low-half total + high-half total
-	movd	[edx],	xmm7
-
-	add		esp,	localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		psum16x16
-%undef		psqsum16x16
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		pushsize
-%undef		localsize
-	ret
-
-
-
-WELS_EXTERN VAACalcSadSsd_sse2
-;*************************************************************************************************************
-;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-;	int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
-;
-;	Walks the picture in 16x16 macroblocks (iPicWidth/16 by iPicHeight/16).
-;	Per macroblock it writes
-;	  psad8x8[0..3]   : SAD of the top-left, top-right, bottom-left, bottom-right 8x8
-;	  psum16x16[0]    : sum of the cur_data pixels
-;	  psqsum16x16[0]  : sum of the squared cur_data pixels
-;	  psqdiff16x16[0] : sum of the squared |cur - ref| differences
-;	and finally stores the total SAD in *psadframe.
-;	The iPicWidth, iPicHeight and per-macroblock output pointer stack slots are
-;	modified in place.  Rows are read with movdqa (16-byte alignment required).
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadSsd_sse2:
-%define		localsize		12
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		psum16x16			esp + pushsize + localsize + 32
-%define		psqsum16x16			esp + pushsize + localsize + 36
-%define		psqdiff16x16		esp + pushsize + localsize + 40
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		tmp_sadframe		esp + 8
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		edx,	[psad8x8]
-	mov		eax,	ebx
-
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	pxor	xmm0,	xmm0		; zero, required by the row macro
-	movd	[tmp_sadframe],	xmm0	; frame SAD accumulator lives on the stack
-sqdiff_height_loop:
-	mov		ecx,	dword [iPicWidth]
-	mov		[tmp_esi],	esi	; remember the start of this macroblock row
-	mov		[tmp_edi],	edi
-sqdiff_width_loop:
-	pxor	xmm7,	xmm7		; hiQuad_loQuad pSad8x8
-	pxor	xmm6,	xmm6		; pSum16x16
-	pxor	xmm5,	xmm5		; sqsum_16x16  four dword
-	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	movdqa	xmm1,		xmm7
-	movd	[edx],		xmm7	; top-left 8x8 SAD
-	psrldq	xmm7,		8
-	paddd	xmm1,		xmm7
-	movd	[edx+4],	xmm7	; top-right 8x8 SAD
-	movd	ebp,		xmm1	; left + right
-	add		[tmp_sadframe],	ebp
-
-	pxor	xmm7,	xmm7
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	movdqa	xmm1,		xmm7
-	movd	[edx+8],	xmm7	; bottom-left 8x8 SAD
-	psrldq	xmm7,		8
-	paddd	xmm1,		xmm7
-	movd	[edx+12],	xmm7	; bottom-right 8x8 SAD
-	movd	ebp,		xmm1
-	add		[tmp_sadframe],	ebp
-
-	mov		ebp,	[psum16x16]
-	movdqa	xmm1,	xmm6
-	psrldq	xmm1,	8
-	paddd	xmm6,	xmm1		; dword 0 = 16x16 pixel sum
-	movd	[ebp],	xmm6
-	add		dword [psum16x16], 4	; advance the output pointer kept in the stack slot
-
-	mov		ebp,	[psqsum16x16]
-	pshufd	xmm6,	xmm5,	14 ;00001110
-	paddd	xmm6,	xmm5
-	pshufd	xmm5,	xmm6,	1  ;00000001
-	paddd	xmm5,	xmm6		; dword 0 = 16x16 squared-pixel sum
-	movd	[ebp],	xmm5
-	add		dword [psqsum16x16], 4
-
-	mov		ebp,	[psqdiff16x16]
-	pshufd	xmm5,	xmm4,	14	; 00001110
-	paddd	xmm5,	xmm4
-	pshufd	xmm4,	xmm5,	1	; 00000001
-	paddd	xmm4,	xmm5		; dword 0 = 16x16 squared-difference sum
-	movd	[ebp],	xmm4
-	add		dword	[psqdiff16x16],	4
-
-	add		edx,	16
-	sub		esi,	eax		; back up the 16 rows just consumed ...
-	sub		edi,	eax
-	add		esi,	16		; ... and step right to the next macroblock
-	add		edi,	16
-
-	dec		ecx
-	jnz		sqdiff_width_loop
-
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax		; down one macroblock row
-	add		edi,	eax
-
-	dec	dword [iPicHeight]
-	jnz		sqdiff_height_loop
-
-	mov		ebx,	[tmp_sadframe]
-	mov		eax,	[psadframe]
-	mov		[eax],	ebx
-
-	add		esp,	localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		psum16x16
-%undef		psqsum16x16
-%undef		psqdiff16x16
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		tmp_sadframe
-%undef		pushsize
-%undef		localsize
-	ret
-
-
-
-
-
-WELS_EXTERN VAACalcSadBgd_sse2
-;*************************************************************************************************************
-;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-;				int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;
-;	Walks the picture in 16x16 macroblocks (iPicWidth/16 by iPicHeight/16).
-;	Per macroblock, for its four 8x8 quadrants (top-left, top-right,
-;	bottom-left, bottom-right) it writes
-;	  psad8x8[0..3]  : SAD between cur and ref
-;	  p_sd8x8[0..3]  : sum(cur) - sum(ref)
-;	  p_mad8x8[0..3] : maximum absolute difference (one byte each)
-;	and finally stores the total SAD in *psadframe.
-;	psad8x8 and p_sd8x8 are written with movdqa and must be 16-byte aligned,
-;	as must every picture row.  The iPicWidth, iPicHeight, psad8x8, p_sd8x8
-;	and p_mad8x8 stack slots are modified in place.
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadBgd_sse2:
-%define		localsize		12
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		p_sd8x8				esp + pushsize + localsize + 32
-%define		p_mad8x8			esp + pushsize + localsize + 36
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		tmp_ecx				esp + 8
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		eax,	ebx
-
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	xor		ebp,	ebp			; frame SAD accumulator
-	pxor	xmm0,	xmm0		; zero, required by the row macro
-bgd_height_loop:
-	mov		ecx,	dword [iPicWidth]
-	mov		[tmp_esi],	esi	; remember the start of this macroblock row
-	mov		[tmp_edi],	edi
-bgd_width_loop:
-	pxor	xmm7,	xmm7		; pSad8x8
-	pxor	xmm6,	xmm6		; sum_cur_8x8
-	pxor	xmm5,	xmm5		; sum_ref_8x8
-	pxor	xmm4,	xmm4		; pMad8x8
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-
-
-	mov			edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm4	; byte 0 = max of left 8x8, byte 8 = max of right 8x8
-
-	;movdqa		xmm1,	xmm4
-	;punpcklbw	xmm1,	xmm0
-	;punpcklwd	xmm1,	xmm0
-	;movd		[edx],	xmm1
-	;punpckhbw	xmm4,	xmm0
-	;punpcklwd	xmm4,	xmm0
-	;movd		[edx+4],	xmm4
-	;add			edx,		8
-	;mov			[p_mad8x8],	edx
-	mov			[tmp_ecx],	ecx	; ecx is borrowed as a byte-store temporary below
-	movhlps		xmm1,	xmm4
-	movd		ecx,	xmm4
-	mov			[edx],	cl		; MAD of the top-left 8x8
-	movd		ecx,	xmm1
-	mov			[edx+1],cl		; MAD of the top-right 8x8
-	add			edx,	2
-	mov			[p_mad8x8],	edx
-
-
-	pslldq		xmm7,	4		; park the top-half results in dwords 1 and 3
-	pslldq		xmm6,	4
-	pslldq		xmm5,	4
-
-
-	pxor	xmm4,	xmm4		; pMad8x8
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-
-	mov			edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm4
-
-	;movdqa		xmm1,	xmm4
-	;punpcklbw	xmm1,	xmm0
-	;punpcklwd	xmm1,	xmm0
-	;movd		[edx],	xmm1
-	;punpckhbw	xmm4,	xmm0
-	;punpcklwd	xmm4,	xmm0
-	;movd		[edx+4],	xmm4
-	;add			edx,		8
-	;mov			[p_mad8x8],	edx
-	movhlps		xmm1,	xmm4
-	movd		ecx,	xmm4
-	mov			[edx],	cl		; MAD of the bottom-left 8x8
-	movd		ecx,	xmm1
-	mov			[edx+1],cl		; MAD of the bottom-right 8x8
-	add			edx,	2
-	mov			[p_mad8x8],	edx
-
-	; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
-
-	mov		edx,	[psad8x8]
-	pshufd	xmm1,	xmm7,	10001101b		; D3 D2 D1 D0
-	movdqa	[edx],	xmm1
-	add		edx,	16
-	mov		[psad8x8],	edx					; sad8x8
-
-	paddd	xmm1,	xmm7					; D1+3 D3+2 D0+1 D2+0
-	pshufd	xmm2,	xmm1,	00000011b
-	paddd	xmm1,	xmm2					; dword 0 = D0+D1+D2+D3
-	movd	edx,	xmm1
-	add		ebp,	edx						; sad frame
-
-	mov		edx,	[p_sd8x8]
-	psubd	xmm6,	xmm5					; sum_cur - sum_ref per 8x8
-	pshufd	xmm1,	xmm6,	10001101b
-	movdqa	[edx],	xmm1
-	add		edx,	16
-	mov		[p_sd8x8],	edx
-
-
-	sub		esi,	eax		; back up the 16 rows just consumed ...
-	sub		edi,	eax
-	add		esi,	16		; ... and step right to the next macroblock
-	add		edi,	16
-
-	mov		ecx,	[tmp_ecx]	; restore the width counter
-	dec		ecx
-	jnz		bgd_width_loop
-
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax		; down one macroblock row
-	add		edi,	eax
-
-	dec		dword [iPicHeight]
-	jnz		bgd_height_loop
-
-	mov		edx,	[psadframe]
-	mov		[edx],	ebp
-
-	add		esp,	localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		p_sd8x8
-%undef		p_mad8x8
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		tmp_ecx
-%undef		pushsize
-%undef		localsize
-	ret
-
-
-
-WELS_EXTERN VAACalcSadSsdBgd_sse2
-;*************************************************************************************************************
-;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-;		 int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
-;			int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;
-; 32-bit routine: every argument is read from the stack relative to esp.
-; The picture is walked in 16x16 blocks (iPicWidth>>4 blocks per row, iPicHeight>>4 rows).
-; Each block is handled as two halves of eight WELS_SAD_BGD_SQDIFF_16x1_SSE2 invocations.
-; NOTE(review): that macro is defined outside this routine; it presumably consumes one
-; 16-pixel row from esi/edi and steps both pointers by the stride kept in ebx - confirm.
-;
-; Values written (meaning taken from the accumulator layout comments in the loop body):
-;   per half-block : 2 dwords to psad8x8, 2 dwords to p_sd8x8 (cur sum - ref sum), 2 bytes to p_mad8x8
-;   per 16x16 block: 1 dword each to psum16x16, psqsum16x16 and psqdiff16x16
-;   once at the end: *psadframe = running total of all 8x8 SAD values
-;
-; The iPicWidth / iPicHeight stack slots are modified in place (shifted, then used as counters).
-; Neither loop checks for a zero block count before its dec/jnz, so both dimensions must be >= 16.
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadSsdBgd_sse2:
-; Stack layout: [esp .. esp+localsize) locals, then the four pushed registers, then the
-; return address (+4) and the cdecl-style argument slots.
-%define		localsize		16
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		psum16x16			esp + pushsize + localsize + 32
-%define		psqsum16x16			esp + pushsize + localsize + 36
-%define		psqdiff16x16		esp + pushsize + localsize + 40
-%define		p_sd8x8				esp + pushsize + localsize + 44
-%define		p_mad8x8			esp + pushsize + localsize + 48
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		tmp_sadframe		esp + 8
-%define		tmp_ecx				esp + 12
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		eax,	ebx
-
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	pxor	xmm0,	xmm0
-	movd	[tmp_sadframe],	xmm0				; frame SAD accumulator starts at 0
-sqdiff_bgd_height_loop:
-	mov		ecx,	dword [iPicWidth]			; ecx = 16x16 blocks left in this block row
-	mov		[tmp_esi],	esi						; remember the start of the block row
-	mov		[tmp_edi],	edi
-sqdiff_bgd_width_loop:
-	pxor	xmm7,	xmm7		; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
-	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-	pxor	xmm5,	xmm5		; pMad8x8
-	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
-	; ---- first half of the block: eight invocations ----
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-
-	; store dwords 0 and 2 of xmm7 (sad0, sad1) and advance the output pointer
-	mov		edx,		[psad8x8]
-	movdqa	xmm2,		xmm7
-	pshufd	xmm1,		xmm2,		00001110b
-	movd	[edx],		xmm2
-	movd	[edx+4],	xmm1
-	add		edx,		8
-	mov		[psad8x8],	edx			; sad8x8
-
-	paddd	xmm1,				xmm2		; low dword = sad0 + sad1
-	movd	edx,				xmm1
-	add		[tmp_sadframe],		edx			; iFrameSad
-
-	; low dword after the fold = Scur0 + Scur1; stored now, completed by the second half
-	mov		edx,		[psum16x16]
-	movdqa	xmm1,		xmm6
-	pshufd	xmm2,		xmm1,		00001110b
-	paddd	xmm1,		xmm2
-	movd	[edx],		xmm1				; sum
-
-	mov		edx,		[p_sd8x8]
-	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
-	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
-	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
-	movq	[edx],		xmm1
-	add		edx,		8
-	mov		[p_sd8x8],	edx
-
-	mov			edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm5		; NOTE(review): macro not visible here; presumably reduces each qword of xmm5 to its max byte
-	mov			[tmp_ecx],	ecx		; ecx is used as scratch below, so park the width counter
-	movhlps		xmm1,	xmm5
-	movd		ecx,	xmm5
-	mov			[edx],	cl			; low byte of the low qword
-	movd		ecx,	xmm1
-	mov			[edx+1],cl			; low byte of the high qword
-	add			edx,	2
-	mov			[p_mad8x8],	edx
-
-	; ---- second half of the block: keep sqsum and sqdiff, restart sad / sums / mad ----
-	psrlq	xmm7,	32
-	psllq	xmm7,	32			; clear sad
-	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-	pxor	xmm5,	xmm5		; pMad8x8
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-
-	mov		edx,		[psad8x8]
-	movdqa	xmm2,		xmm7
-	pshufd	xmm1,		xmm2,		00001110b
-	movd	[edx],		xmm2
-	movd	[edx+4],	xmm1
-	add		edx,		8
-	mov		[psad8x8],	edx			; sad8x8
-
-	paddd	xmm1,				xmm2
-	movd	edx,				xmm1
-	add		[tmp_sadframe],		edx			; iFrameSad
-
-	; add this half's sum onto the value stored by the first half, then advance
-	; (ebp is free as scratch because the frame SAD lives in tmp_sadframe)
-	mov		edx,			[psum16x16]
-	movdqa	xmm1,			xmm6
-	pshufd	xmm2,			xmm1,		00001110b
-	paddd	xmm1,			xmm2
-	movd	ebp,			xmm1				; sum
-	add		[edx],			ebp
-	add		edx,			4
-	mov		[psum16x16],	edx
-
-	; bring the sqsum dwords down to the low half of each qword and add the two together
-	mov		edx,			[psqsum16x16]
-	psrlq	xmm7,			32
-	pshufd	xmm2,			xmm7,		00001110b
-	paddd	xmm2,			xmm7
-	movd	[edx],			xmm2				; sqsum
-	add		edx,			4
-	mov		[psqsum16x16],	edx
-
-	mov		edx,		[p_sd8x8]
-	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
-	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
-	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
-	movq	[edx],		xmm1
-	add		edx,		8
-	mov		[p_sd8x8],	edx
-
-	mov		edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm5
-	movhlps		xmm1,	xmm5
-	movd		ecx,	xmm5		; width counter is still parked in tmp_ecx
-	mov			[edx],	cl
-	movd		ecx,	xmm1
-	mov			[edx+1],cl
-	add			edx,	2
-	mov			[p_mad8x8],	edx
-
-	; horizontal add of the four sqdiff dwords in xmm4
-	mov		edx,		[psqdiff16x16]
-	pshufd	xmm1,		xmm4,		00001110b
-	paddd	xmm4,		xmm1
-	pshufd	xmm1,		xmm4,		00000001b
-	paddd	xmm4,		xmm1
-	movd	[edx],		xmm4
-	add		edx,		4
-	mov		[psqdiff16x16],	edx
-
-	; NOTE(review): edx is reloaded before its next visible use, so this add looks like a
-	; leftover - confirm the 16x1 macro does not read edx before removing it.
-	add		edx,	16
-	sub		esi,	eax			; back up by iPicStride*16 ...
-	sub		edi,	eax
-	add		esi,	16			; ... and step 16 pixels right to the next block
-	add		edi,	16
-
-	mov		ecx,	[tmp_ecx]
-	dec		ecx
-	jnz		sqdiff_bgd_width_loop
-
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax			; next block row = row start + iPicStride*16
-	add		edi,	eax
-
-	dec	dword [iPicHeight]
-	jnz		sqdiff_bgd_height_loop
-
-	mov		edx,	[psadframe]
-	mov		ebp,	[tmp_sadframe]
-	mov		[edx],	ebp
-
-	add		esp,	localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		psum16x16
-%undef		psqsum16x16
-%undef		psqdiff16x16
-%undef		p_sd8x8
-%undef		p_mad8x8
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		tmp_sadframe
-%undef		tmp_ecx
-%undef		pushsize
-%undef		localsize
-	ret
--- a/processing/src/backgounddetection/BackgroundDetection.cpp
+++ /dev/null
@@ -1,389 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "BackgroundDetection.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-// Geometry and thresholds for the "OU", the square block unit this detector classifies.
-// log2 of the OU edge; the edge itself (BGD_OU_SIZE) is 16.
-#define LOG2_BGD_OU_SIZE    (4)
-// log2 of the OU edge in the chroma planes (half the edge above).
-#define LOG2_BGD_OU_SIZE_UV (LOG2_BGD_OU_SIZE-1)
-#define BGD_OU_SIZE         (1<<LOG2_BGD_OU_SIZE)
-#define BGD_OU_SIZE_UV      (BGD_OU_SIZE>>1)
-// SAD threshold used by the coarse division: 2 * 16 * 16 = 512.
-#define BGD_THD_SAD         (2*BGD_OU_SIZE*BGD_OU_SIZE)
-// Threshold on the value returned by CalculateAsdChromaEdge(): 4 * 8 = 32.
-#define	BGD_THD_ASD_UV      (4*BGD_OU_SIZE_UV)
-// log2 of the macroblock edge; with OU edge == MB edge, OU_SIZE_IN_MB evaluates to 1.
-#define LOG2_MB_SIZE        (4)
-#define OU_SIZE_IN_MB       (BGD_OU_SIZE >> 4)
-// Multiplied by BGD_OU_SIZE to form the "small SAD / small SD spread" threshold (16 * 8 = 128).
-#define Q_FACTOR            (8)
-// NOTE(review): not referenced by any function in this file.
-#define BGD_DELTA_QP_THD    (3)
-
-// Bit positions of the neighbour mask consumed by ForegroundDilation23Chroma().
-#define OU_LEFT		(0x01)
-#define OU_RIGHT	(0x02)
-#define OU_TOP		(0x04)
-#define OU_BOTTOM	(0x08)
-
-// Creates a detector with zeroed parameters and no OU buffer; Process() allocates the
-// buffer on first use. iCpuFlag is accepted but not consulted here.
-CBackgroundDetection::CBackgroundDetection (int32_t iCpuFlag) {
-  m_iLargestFrameSize = 0;
-  WelsMemset (&m_BgdParam, 0, sizeof (m_BgdParam));
-  m_eMethod = METHOD_BACKGROUND_DETECTION;
-}
-
-// Releases the OU buffer owned by m_BgdParam.
-CBackgroundDetection::~CBackgroundDetection() {
-  FreeOUArrayMemory();
-}
-
-// Runs background detection for one frame.
-//
-// pSrcPixMap supplies the current planes, the picture size and the strides; pRefPixMap
-// supplies the reference planes only (the source strides are used for both pictures).
-// iType is not used. Set() must have been called beforehand: the detection reads
-// m_BgdParam.pCalcRes and writes through m_BgdParam.pBackgroundMbFlag.
-//
-// Returns RET_SUCCESS after the detection ran, or RET_INVALIDPARAM when a pix map is
-// NULL, when Set() has not provided both buffers, or when the OU buffer cannot be allocated.
-EResult CBackgroundDetection::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
-  EResult eReturn = RET_INVALIDPARAM;
-
-  if (pSrcPixMap == NULL || pRefPixMap == NULL)
-    return eReturn;
-
-  // Both pointers start out NULL (the constructor zeroes m_BgdParam) and are dereferenced
-  // unconditionally by BackgroundDetection(), so reject the call instead of crashing.
-  if (m_BgdParam.pCalcRes == NULL || m_BgdParam.pBackgroundMbFlag == NULL)
-    return eReturn;
-
-  m_BgdParam.pCur[0] = (uint8_t*)pSrcPixMap->pPixel[0];
-  m_BgdParam.pCur[1] = (uint8_t*)pSrcPixMap->pPixel[1];
-  m_BgdParam.pCur[2] = (uint8_t*)pSrcPixMap->pPixel[2];
-  m_BgdParam.pRef[0] = (uint8_t*)pRefPixMap->pPixel[0];
-  m_BgdParam.pRef[1] = (uint8_t*)pRefPixMap->pPixel[1];
-  m_BgdParam.pRef[2] = (uint8_t*)pRefPixMap->pPixel[2];
-  m_BgdParam.iBgdWidth = pSrcPixMap->sRect.iRectWidth;
-  m_BgdParam.iBgdHeight = pSrcPixMap->sRect.iRectHeight;
-  m_BgdParam.iStride[0] = pSrcPixMap->iStride[0];
-  m_BgdParam.iStride[1] = pSrcPixMap->iStride[1];
-  m_BgdParam.iStride[2] = pSrcPixMap->iStride[2];
-
-  // The OU buffer is kept across calls and only re-allocated when the frame area grows.
-  int32_t iCurFrameSize = m_BgdParam.iBgdWidth * m_BgdParam.iBgdHeight;
-  if (m_BgdParam.pOU_array == NULL || iCurFrameSize > m_iLargestFrameSize) {
-    FreeOUArrayMemory();
-    m_BgdParam.pOU_array = AllocateOUArrayMemory (m_BgdParam.iBgdWidth, m_BgdParam.iBgdHeight);
-    // Only record a size that is actually backed by memory.
-    m_iLargestFrameSize = (m_BgdParam.pOU_array != NULL) ? iCurFrameSize : 0;
-  }
-
-  if (m_BgdParam.pOU_array == NULL)
-    return eReturn;
-
-  BackgroundDetection (&m_BgdParam);
-
-  return RET_SUCCESS;
-}
-
-// Stores the caller's result buffers for later Process() calls.
-// pParam is interpreted as an SBGDInterface; iType is not used.
-// Returns RET_INVALIDPARAM for a NULL pParam, RET_SUCCESS otherwise.
-EResult CBackgroundDetection::Set (int32_t iType, void* pParam) {
-  SBGDInterface* pBgdInterface = static_cast<SBGDInterface*> (pParam);
-  if (pBgdInterface == NULL)
-    return RET_INVALIDPARAM;
-
-  m_BgdParam.pCalcRes = pBgdInterface->pCalcRes;
-  m_BgdParam.pBackgroundMbFlag = (int8_t*)pBgdInterface->pBackgroundMbFlag;
-  return RET_SUCCESS;
-}
-
-// Allocates one SBackgroundOU per OU of an iWidth x iHeight picture, with both
-// dimensions rounded up to a whole number of OUs. Returns whatever WelsMalloc returns.
-inline SBackgroundOU* CBackgroundDetection::AllocateOUArrayMemory (int32_t iWidth, int32_t iHeight) {
-  const int32_t kiOUCols = ((BGD_OU_SIZE - 1) + iWidth) >> LOG2_BGD_OU_SIZE;
-  const int32_t kiOURows = ((BGD_OU_SIZE - 1) + iHeight) >> LOG2_BGD_OU_SIZE;
-  return (SBackgroundOU*)WelsMalloc (kiOUCols * kiOURows * sizeof (SBackgroundOU));
-}
-
-// Hands the OU buffer to _SafeFree.
-// NOTE(review): _SafeFree is defined elsewhere; presumably it frees and NULLs the pointer - confirm.
-inline void CBackgroundDetection::FreeOUArrayMemory() {
-  _SafeFree (m_BgdParam.pOU_array);
-}
-
-// Fills pBgdOU from the four per-8x8 entries stored at row iMbIndex of the VAA results:
-//   iSAD          = sum of the four 8x8 SADs
-//   iSD           = absolute value of the sum of the four 8x8 sums-of-differences
-//   iMAD          = largest of the four 8x8 MAD values
-//   iMinSubMad    = smallest of the four 8x8 MAD values
-//   iMaxDiffSubSd = largest minus smallest of the four (signed) 8x8 sums-of-differences
-// iMbWidth is not used.
-void CBackgroundDetection::GetOUParameters (SVAACalcResult* sVaaCalcInfo, int32_t iMbIndex, int32_t iMbWidth,
-    SBackgroundOU* pBgdOU) {
-  const int32_t* kpSubSad = sVaaCalcInfo->pSad8x8[iMbIndex];
-  const int32_t* kpSubSd  = sVaaCalcInfo->pSumOfDiff8x8[iMbIndex];
-  const uint8_t* kpSubMad = sVaaCalcInfo->pMad8x8[iMbIndex];
-
-  int32_t iSadTotal = 0;
-  int32_t iSdTotal  = 0;
-  int32_t iSdMax    = kpSubSd[0];
-  int32_t iSdMin    = kpSubSd[0];
-  int32_t iMadMax   = kpSubMad[0];
-  int32_t iMadMin   = kpSubMad[0];
-
-  for (int32_t k = 0; k < 4; k++) {
-    const int32_t kiSd  = kpSubSd[k];
-    const int32_t kiMad = kpSubMad[k];
-
-    iSadTotal += kpSubSad[k];
-    iSdTotal  += kiSd;
-
-    iSdMax  = WELS_MAX (iSdMax, kiSd);
-    iSdMin  = WELS_MIN (iSdMin, kiSd);
-    iMadMax = WELS_MAX (iMadMax, kiMad);
-    iMadMin = WELS_MIN (iMadMin, kiMad);
-  }
-
-  pBgdOU->iSAD          = iSadTotal;
-  pBgdOU->iSD           = WELS_ABS (iSdTotal);
-  pBgdOU->iMAD          = iMadMax;
-  pBgdOU->iMinSubMad    = iMadMin;
-  pBgdOU->iMaxDiffSubSd = iSdMax - iSdMin;
-}
-
-// First pass: fills every OU of pOU_array from the VAA results and gives it an initial
-// background flag. Only whole OUs are visited (width and height are rounded down).
-// An OU stays foreground (flag 0) unless its MAD is at most 63, its SD spread is small
-// relative to its SAD (or small in absolute terms), and its SAD is below 2 * BGD_THD_SAD.
-void CBackgroundDetection::ForegroundBackgroundDivision (vBGDParam* pBgdParam) {
-  const int32_t kiOUCols   = pBgdParam->iBgdWidth  >> LOG2_BGD_OU_SIZE;
-  const int32_t kiOURows   = pBgdParam->iBgdHeight >> LOG2_BGD_OU_SIZE;
-  const int32_t kiMbCols   = (15 + pBgdParam->iBgdWidth) >> 4;
-  const int32_t kiSmallThd = BGD_OU_SIZE * Q_FACTOR;
-
-  SBackgroundOU* pOU = pBgdParam->pOU_array;
-
-  for (int32_t iRow = 0; iRow < kiOURows; iRow++) {
-    for (int32_t iCol = 0; iCol < kiOUCols; iCol++, pOU++) {
-      const int32_t kiMbIndex = (iRow * kiMbCols + iCol) << (LOG2_BGD_OU_SIZE - LOG2_MB_SIZE);
-      GetOUParameters (pBgdParam->pCalcRes, kiMbIndex, kiMbCols, pOU);
-
-      pOU->iBackgroundFlag = 0;
-
-      // A large maximum absolute difference rules the OU out immediately.
-      if (pOU->iMAD > 63)
-        continue;
-
-      const bool kbSmallSdSpread = (pOU->iMaxDiffSubSd <= (pOU->iSAD >> 3))
-                                   || (pOU->iMaxDiffSubSd <= kiSmallThd);
-      if (!kbSmallSdSpread || pOU->iSAD >= (BGD_THD_SAD << 1))
-        continue;
-
-      if (pOU->iSAD <= kiSmallThd) {
-        pOU->iBackgroundFlag = 1;
-      } else if (pOU->iSAD < BGD_THD_SAD) {
-        pOU->iBackgroundFlag = (pOU->iSD < ((pOU->iSAD * 3) >> 2));
-      } else {
-        pOU->iBackgroundFlag = ((pOU->iSD << 1) < pOU->iSAD);
-      }
-    }
-  }
-}
-// Returns the absolute value of the summed (cur - ref) differences over BGD_OU_SIZE_UV
-// samples, starting at the given pointers and stepping both by iStride each time.
-inline int32_t CBackgroundDetection::CalculateAsdChromaEdge (uint8_t* pOriRef, uint8_t* pOriCur, int32_t iStride) {
-  int32_t iSignedSum = 0;
-  for (int32_t k = 0, iOffset = 0; k < BGD_OU_SIZE_UV; k++, iOffset += iStride) {
-    iSignedSum += pOriCur[iOffset] - pOriRef[iOffset];
-  }
-  return WELS_ABS (iSignedSum);
-}
-
-// Luma test used when an OU has a mix of background and foreground neighbours.
-// pOUNeighbours holds four OU pointers. Returns 0 at once unless the OU's MAD exceeds
-// twice its smallest sub-block MAD. Otherwise returns non-zero when either
-//   - the largest MAD among foreground neighbours exceeds 4 * iMinSubMad, or
-//   - the OU's MAD is above twice the largest background-neighbour MAD and at most
-//     1.5 times the largest foreground-neighbour MAD.
-inline bool_t CBackgroundDetection::ForegroundDilation23Luma (SBackgroundOU* pBackgroundOU,
-    SBackgroundOU* pOUNeighbours[]) {
-  if (! (pBackgroundOU->iMAD > pBackgroundOU->iMinSubMad << 1))
-    return 0;
-
-  int32_t iMaxFgMad = 0;
-  int32_t iMaxBgMad = 0;
-
-  for (int32_t k = 0; k < 4; k++) {
-    const SBackgroundOU* kpNbr = pOUNeighbours[k];
-    // (flag - 1) is all ones for flag 0 and zero for flag 1, so each mask keeps the MAD of
-    // one kind of neighbour and zeroes the other.
-    const int32_t kiFgMad = (kpNbr->iBackgroundFlag - 1) & kpNbr->iMAD;
-    const int32_t kiBgMad = ((!kpNbr->iBackgroundFlag) - 1) & kpNbr->iMAD;
-
-    if (k == 0 || kiFgMad > iMaxFgMad)
-      iMaxFgMad = kiFgMad;
-    if (k == 0 || kiBgMad > iMaxBgMad)
-      iMaxBgMad = kiBgMad;
-  }
-
-  const bool kbStrongFgNeighbour = iMaxFgMad > (pBackgroundOU->iMinSubMad << 2);
-  const bool kbBetweenBgAndFg    = (pBackgroundOU->iMAD > (iMaxBgMad << 1))
-                                   && (pBackgroundOU->iMAD <= ((iMaxFgMad * 3) >> 1));
-  return kbStrongFgNeighbour || kbBetweenBgAndFg;
-}
-
-// Chroma test: for every neighbour side flagged in iNeighbourForegroundFlags (OU_LEFT /
-// OU_RIGHT / OU_TOP / OU_BOTTOM bits), compares the current and reference chroma samples
-// along that edge of the OU that starts at iStartSamplePos. Returns 1 as soon as one edge
-// yields a CalculateAsdChromaEdge() result above BGD_THD_ASD_UV, otherwise 0.
-inline bool_t CBackgroundDetection::ForegroundDilation23Chroma (int8_t iNeighbourForegroundFlags,
-    int32_t iStartSamplePos, int32_t iPicStrideUV, vBGDParam* pBgdParam) {
-  static const int8_t kaEdgeMask[4] = {OU_LEFT, OU_RIGHT, OU_TOP, OU_BOTTOM};
-  // Plane 2 (V) is tested on all edges before plane 1 (U): V carries red, which weighs
-  // more for skin tones, so it is the likelier plane to trip the threshold.
-  static const int32_t kaPlaneOrder[2] = {2, 1};
-
-  // First sample of each edge relative to the OU origin, and the step along that edge.
-  const int32_t kaEdgeStart[4] = {0, BGD_OU_SIZE_UV - 1, 0, iPicStrideUV* (BGD_OU_SIZE_UV - 1)};
-  const int32_t kaEdgeStep[4]  = {iPicStrideUV, iPicStrideUV, 1, 1};
-
-  for (int32_t p = 0; p < 2; p++) {
-    const int32_t kiPlane = kaPlaneOrder[p];
-    for (int32_t e = 0; e < 4; e++) {
-      if (! (iNeighbourForegroundFlags & kaEdgeMask[e]))
-        continue;
-
-      uint8_t* pRefEdge = pBgdParam->pRef[kiPlane] + iStartSamplePos + kaEdgeStart[e];
-      uint8_t* pCurEdge = pBgdParam->pCur[kiPlane] + iStartSamplePos + kaEdgeStart[e];
-      if (CalculateAsdChromaEdge (pRefEdge, pCurEdge, kaEdgeStep[e]) > BGD_THD_ASD_UV)
-        return 1;
-    }
-  }
-
-  return 0;
-}
-
-// Re-examines an OU; the only caller in this file invokes it for OUs currently flagged
-// background. Nothing changes unless the OU's SAD exceeds BGD_OU_SIZE * Q_FACTOR.
-// With 0 or 1 background neighbours the OU becomes foreground; with 2 or 3 it stays
-// background only if both the luma and the chroma dilation tests report nothing;
-// with any other count it is left untouched.
-inline void CBackgroundDetection::ForegroundDilation (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[],
-    vBGDParam* pBgdParam, int32_t iChromaSampleStartPos) {
-  if (pBackgroundOU->iSAD <= BGD_OU_SIZE * Q_FACTOR)
-    return;
-
-  const int32_t kiBgNeighbours = pOUNeighbours[0]->iBackgroundFlag + pOUNeighbours[1]->iBackgroundFlag +
-                                 pOUNeighbours[2]->iBackgroundFlag + pOUNeighbours[3]->iBackgroundFlag;
-
-  if (kiBgNeighbours == 0 || kiBgNeighbours == 1) {
-    pBackgroundOU->iBackgroundFlag = 0;
-    return;
-  }
-  if (kiBgNeighbours != 2 && kiBgNeighbours != 3)
-    return;
-
-  pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Luma (pBackgroundOU, pOUNeighbours);
-  if (pBackgroundOU->iBackgroundFlag != 1)
-    return;
-
-  // Luma found nothing: check chroma along the sides that touch a foreground neighbour.
-  // Bit k of the mask is set when neighbour k is foreground (0 left, 1 right, 2 top, 3 bottom).
-  const int8_t kiFgNeighbourMask = !pOUNeighbours[0]->iBackgroundFlag
-                                   | ((!pOUNeighbours[1]->iBackgroundFlag) << 1)
-                                   | ((!pOUNeighbours[2]->iBackgroundFlag) << 2)
-                                   | ((!pOUNeighbours[3]->iBackgroundFlag) << 3);
-  pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Chroma (kiFgNeighbourMask, iChromaSampleStartPos,
-                                   pBgdParam->iStride[1], pBgdParam);
-}
-// Re-examines an OU; the only caller in this file invokes it for OUs currently flagged
-// foreground. The flag can change only when the OU's SD spread is at most
-// BGD_OU_SIZE * Q_FACTOR and its SAD, scaled by the number of background neighbours, is
-// at most 1.5 times the summed SAD of those neighbours. Then: four background neighbours
-// make it background; a background left+right or top+bottom pair defers to the luma test.
-inline void CBackgroundDetection::BackgroundErosion (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[]) {
-  if (pBackgroundOU->iMaxDiffSubSd > (BGD_OU_SIZE * Q_FACTOR))
-    return;
-
-  int32_t iBgNeighbours = 0;
-  int32_t iBgNeighbourSad = 0;
-  for (int32_t k = 0; k < 4; k++) {
-    const SBackgroundOU* kpNbr = pOUNeighbours[k];
-    iBgNeighbours += kpNbr->iBackgroundFlag;
-    // -flag is all ones for flag 1 and zero for flag 0: only background SADs are counted.
-    iBgNeighbourSad += kpNbr->iSAD & (-kpNbr->iBackgroundFlag);
-  }
-
-  if (pBackgroundOU->iSAD * iBgNeighbours > ((3 * iBgNeighbourSad) >> 1))
-    return;
-
-  if (iBgNeighbours == 4) {
-    pBackgroundOU->iBackgroundFlag = 1;
-    return;
-  }
-
-  const bool kbOppositePairIsBg = (pOUNeighbours[0]->iBackgroundFlag & pOUNeighbours[1]->iBackgroundFlag)
-                                  || (pOUNeighbours[2]->iBackgroundFlag & pOUNeighbours[3]->iBackgroundFlag);
-  if (kbOppositePairIsBg) {
-    pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Luma (pBackgroundOU, pOUNeighbours);
-  }
-}
-
-// Writes iBackgroundMbFlag to the single flag byte at pBackgroundMbFlag.
-// iPicWidthInMb is not used (one OU covers exactly one flag while OU_SIZE_IN_MB is 1).
-inline void CBackgroundDetection::SetBackgroundMbFlag (int8_t* pBackgroundMbFlag, int32_t iPicWidthInMb,
-    int32_t iBackgroundMbFlag) {
-  *pBackgroundMbFlag = iBackgroundMbFlag;
-}
-
-// Second look at an OU in the row above the one being processed: if its SAD exceeds
-// BGD_OU_SIZE * Q_FACTOR and at most one of its four neighbours is background, both the
-// OU and its flag byte are reset to foreground.
-// The neighbours are addressed as pCurOU -/+ 1 and pCurOU -/+ iPicWidthInOU, so the caller
-// must only pass OUs that have all four neighbours inside the array.
-inline void CBackgroundDetection::UpperOUForegroundCheck (SBackgroundOU* pCurOU, int8_t* pBackgroundMbFlag,
-    int32_t iPicWidthInOU, int32_t iPicWidthInMb) {
-  if (pCurOU->iSAD <= BGD_OU_SIZE * Q_FACTOR)
-    return;
-
-  const int32_t kiBgNeighbours = (pCurOU - 1)->iBackgroundFlag + (pCurOU + 1)->iBackgroundFlag +
-                                 (pCurOU - iPicWidthInOU)->iBackgroundFlag + (pCurOU + iPicWidthInOU)->iBackgroundFlag;
-  if (kiBgNeighbours > 1)
-    return;
-
-  pCurOU->iBackgroundFlag = 0;
-  SetBackgroundMbFlag (pBackgroundMbFlag, iPicWidthInMb, 0);
-}
-
-// Second pass: walks the OU array in raster order, refines each OU's flag using its four
-// neighbours, and writes the resulting flag to the per-MB flag buffer.
-// At a picture border the missing neighbour is replaced by the OU itself.
-// Flags are updated in place, so the left and top neighbours already carry refined values
-// while the right and bottom ones still carry the first-pass values.
-void CBackgroundDetection::ForegroundDilationAndBackgroundErosion (vBGDParam* pBgdParam) {
-  int32_t iPicStrideUV		= pBgdParam->iStride[1];
-  int32_t iPicWidthInOU	= pBgdParam->iBgdWidth  >> LOG2_BGD_OU_SIZE;
-  int32_t iPicHeightInOU	= pBgdParam->iBgdHeight >> LOG2_BGD_OU_SIZE;
-  // chroma samples between the first lines of two vertically adjacent OUs
-  int32_t iOUStrideUV		= iPicStrideUV << (LOG2_BGD_OU_SIZE - 1);
-  int32_t iPicWidthInMb	= (15 + pBgdParam->iBgdWidth) >> 4;
-
-  SBackgroundOU* pBackgroundOU = pBgdParam->pOU_array;
-  int8_t*	pVaaBackgroundMbFlag   = (int8_t*)pBgdParam->pBackgroundMbFlag;
-  SBackgroundOU*	pOUNeighbours[4];//0: left; 1: right; 2: top; 3: bottom
-
-  pBackgroundOU	= pBgdParam->pOU_array;
-  // In the first row the top pointer advances in step with the current OU, i.e. top == self.
-  pOUNeighbours[2]	= pBackgroundOU;//top OU
-  for (int32_t j = 0; j < iPicHeightInOU; j ++) {
-    int8_t* pRowSkipFlag = pVaaBackgroundMbFlag;
-    // first column: left neighbour is the OU itself
-    pOUNeighbours[0]	= pBackgroundOU;//left OU
-    // the mask is 0 on the last row (bottom == self) and all ones otherwise (one row down)
-    pOUNeighbours[3]	= pBackgroundOU + (iPicWidthInOU & ((j == iPicHeightInOU - 1) - 1)); //bottom OU
-    for (int32_t i = 0; i < iPicWidthInOU; i++) {
-      // last column: right neighbour is the OU itself
-      pOUNeighbours[1] = pBackgroundOU + (i < iPicWidthInOU - 1); //right OU
-
-      if (pBackgroundOU->iBackgroundFlag)
-        ForegroundDilation (pBackgroundOU, pOUNeighbours, pBgdParam, j * iOUStrideUV + (i << LOG2_BGD_OU_SIZE_UV));
-      else
-        BackgroundErosion (pBackgroundOU, pOUNeighbours);
-
-      // check the up OU
-      // The conditions keep all four neighbours of the upper OU inside the array:
-      // j > 1 puts it in row 1 or below, and the i range excludes the first and last column.
-      if (j > 1 && i > 0 && i < iPicWidthInOU - 1 && pOUNeighbours[2]->iBackgroundFlag == 1) {
-        UpperOUForegroundCheck (pOUNeighbours[2], pRowSkipFlag - OU_SIZE_IN_MB * iPicWidthInMb, iPicWidthInOU, iPicWidthInMb);
-      }
-
-      SetBackgroundMbFlag (pRowSkipFlag, iPicWidthInMb, pBackgroundOU->iBackgroundFlag);
-
-      // preparation for the next OU
-      pRowSkipFlag += OU_SIZE_IN_MB;
-      pOUNeighbours[0] = pBackgroundOU;
-      pOUNeighbours[2]++;
-      pOUNeighbours[3]++;
-      pBackgroundOU++;
-    }
-    // pBackgroundOU now points at the next row; its top neighbours start one row back
-    pOUNeighbours[2]	= pBackgroundOU - iPicWidthInOU;
-    pVaaBackgroundMbFlag += OU_SIZE_IN_MB * iPicWidthInMb;
-  }
-}
-
-// Runs the two detection passes in order on pBgdParam. The second pass reads the flags
-// and OU statistics written by the first.
-void CBackgroundDetection::BackgroundDetection (vBGDParam* pBgdParam) {
-  // 1st step: foreground/background coarse division
-  ForegroundBackgroundDivision (pBgdParam);
-
-  // 2nd step: foreground dilation and background erosion
-  ForegroundDilationAndBackgroundErosion (pBgdParam);
-}
-
-WELSVP_NAMESPACE_END
--- a/processing/src/backgounddetection/BackgroundDetection.h
+++ /dev/null
@@ -1,106 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	       :  BackgroundDetection.h
- *
- * \brief	     :  background detection class of wels video processor class
- *
- * \date        :  2011/03/17
- *
- * \description :  1. rewrite the package code of background detection class
- *
- */
-
-#ifndef WELSVP_BACKGROUNDDETECTION_H
-#define WELSVP_BACKGROUNDDETECTION_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-// Per-OU statistics and decision used by CBackgroundDetection.
-// The fields are filled by GetOUParameters() from four 8x8 sub-block results.
-typedef struct {
-  // 1 = background, 0 = foreground
-  int32_t	iBackgroundFlag;
-  // sum of the four sub-block SAD values
-  int32_t	iSAD;
-  // absolute value of the sum of the four sub-block sums-of-differences
-  int32_t	iSD;
-  // largest sub-block MAD value
-  int32_t	iMAD;
-  // smallest sub-block MAD value
-  int32_t	iMinSubMad;
-  // largest minus smallest sub-block sum-of-differences
-  int32_t	iMaxDiffSubSd;
-} SBackgroundOU;
-
-// Background-detection strategy: classifies each OU of a frame as background or
-// foreground and writes one flag per OU into a caller-supplied buffer.
-// Usage as implemented: Set() supplies the result buffers, then Process() is called per frame.
-class CBackgroundDetection : public IStrategy {
- public:
-  // iCpuFlag is accepted but not used by the implementation.
-  CBackgroundDetection (int32_t iCpuFlag);
-  ~CBackgroundDetection();
-
-  // Runs detection on pSrc against pRef; iType is ignored.
-  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef);
-  // pParam must point to an SBGDInterface; iType is ignored.
-  EResult Set (int32_t iType, void* pParam);
-
- private:
-  // Working set for one Process() call plus the buffers that persist across calls.
-  struct vBGDParam {
-    // current-frame planes, taken from the source pix map
-    uint8_t*   pCur[3];
-    // reference-frame planes, taken from the reference pix map
-    uint8_t*   pRef[3];
-    int32_t	   iBgdWidth;
-    int32_t	   iBgdHeight;
-    // plane strides, taken from the source pix map
-    int32_t    iStride[3];
-    // owned OU buffer, allocated lazily and freed in the destructor
-    SBackgroundOU*  	pOU_array;
-    // output flags, supplied through Set()
-    int8_t*  	pBackgroundMbFlag;
-    // input statistics, supplied through Set()
-    SVAACalcResult*  pCalcRes;
-  } m_BgdParam;
-
-  // width * height of the frame the current OU buffer was allocated for
-  int32_t     m_iLargestFrameSize;
-
- private:
-  inline SBackgroundOU* AllocateOUArrayMemory (int32_t iWidth, int32_t iHeight);
-  inline void     FreeOUArrayMemory();
-  inline int32_t  CalculateAsdChromaEdge (uint8_t* pOriRef, uint8_t* pOriCur, int32_t iStride);
-  inline bool_t   ForegroundDilation23Luma (SBackgroundOU* pBackgroundOU,
-      SBackgroundOU* pOUNeighbours[]); //Foreground_Dilation_2_3_Luma
-  inline bool_t   ForegroundDilation23Chroma (int8_t iNeighbourForegroundFlags, int32_t iStartSamplePos,
-      int32_t iPicStrideUV, vBGDParam* pBgdParam);//Foreground_Dilation_2_3_Chroma
-  inline void     ForegroundDilation (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[], vBGDParam* pBgdParam,
-                                      int32_t	iChromaSampleStartPos);
-  inline void     BackgroundErosion (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[]);
-  inline void     SetBackgroundMbFlag (int8_t* pBackgroundMbFlag, int32_t iPicWidthInMb, int32_t iBackgroundMbFlag);
-  inline void     UpperOUForegroundCheck (SBackgroundOU* pCurOU, int8_t* pBackgroundMbFlag, int32_t iPicWidthInOU,
-                                          int32_t iPicWidthInMb);
-
-  // Fills one SBackgroundOU from the VAA results at iMbIndex.
-  void    GetOUParameters (SVAACalcResult* sVaaCalcInfo, int32_t iMbIndex, int32_t iMbWidth,
-                           SBackgroundOU* pBackgroundOU);
-  // Pass 1: initial per-OU classification.
-  void    ForegroundBackgroundDivision (vBGDParam* pBgdParam);
-  // Pass 2: neighbour-based refinement and flag output.
-  void    ForegroundDilationAndBackgroundErosion (vBGDParam* pBgdParam);
-  // Runs pass 1 followed by pass 2.
-  void    BackgroundDetection (vBGDParam* pBgdParam);
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/common/WelsFrameWork.cpp
+++ /dev/null
@@ -1,301 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "WelsFrameWork.h"
-#include "cpu.h"
-#include "../denoise/denoise.h"
-#include "../downsample/downsample.h"
-#include "../scenechangedetection/SceneChangeDetection.h"
-#include "../vaacalc/vaacalculation.h"
-#include "../backgounddetection/BackgroundDetection.h"
-#include "../adaptivequantization/AdaptiveQuantization.h"
-#include "../complexityanalysis/ComplexityAnalysis.h"
-#include "../imagerotate/imagerotate.h"
-
-
-/* interface API implement */
-
-EResult WELSAPI CreateVpInterface (void** ppCtx, int iVersion) {
-  if (iVersion & 0x8000)
-    return nsWelsVP::CreateSpecificVpInterface ((IWelsVP**)ppCtx);
-  else if (iVersion & 0x7fff)
-    return nsWelsVP::CreateSpecificVpInterface ((IWelsVPc**)ppCtx);
-  else
-    return RET_INVALIDPARAM;
-}
-
-EResult WELSAPI DestroyVpInterface (void* pCtx, int iVersion) {
-  if (iVersion & 0x8000)
-    return nsWelsVP::DestroySpecificVpInterface ((IWelsVP*)pCtx);
-  else if (iVersion & 0x7fff)
-    return nsWelsVP::DestroySpecificVpInterface ((IWelsVPc*)pCtx);
-  else
-    return RET_INVALIDPARAM;
-}
-
-WELSVP_NAMESPACE_BEGIN
-
-///////////////////////////////////////////////////////////////////////
-
-EResult CreateSpecificVpInterface (IWelsVP** ppCtx) {
-  EResult  eReturn = RET_FAILED;
-
-  CVpFrameWork* pFr = new CVpFrameWork (1, eReturn);
-  if (pFr) {
-    *ppCtx  = (IWelsVP*)pFr;
-    eReturn = RET_SUCCESS;
-  }
-
-  return eReturn;
-}
-
-EResult DestroySpecificVpInterface (IWelsVP* pCtx) {
-  _SafeDelete (pCtx);
-
-  return RET_SUCCESS;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-CVpFrameWork::CVpFrameWork (uint32_t uiThreadsNum, EResult& eReturn) {
-  int32_t iCoreNum = 1;
-#ifndef X86_ASM
-  uint32_t uiCPUFlag = 0;
-#else
-  uint32_t uiCPUFlag = WelsCPUFeatureDetect (&iCoreNum);
-#endif
-
-  for (int32_t i = 0; i < MAX_STRATEGY_NUM; i++) {
-    IStrategy* pStrategy = m_pStgChain[i];
-    pStrategy = CreateStrategy (WelsStaticCast (EMethods, i + 1), uiCPUFlag);
-    m_pStgChain[i] = pStrategy;
-  }
-
-  WelsMutexInit (&m_mutes);
-
-  eReturn = RET_SUCCESS;
-}
-
-CVpFrameWork::~CVpFrameWork() {
-  for (int32_t i = 0; i < MAX_STRATEGY_NUM; i++) {
-    if (m_pStgChain[i]) {
-      Uninit (m_pStgChain[i]->m_eMethod);
-      _SafeDelete (m_pStgChain[i]);
-    }
-  }
-
-  WelsMutexDestroy (&m_mutes);
-}
-
-EResult CVpFrameWork::Init (int32_t iType, void* pCfg) {
-  EResult eReturn   = RET_SUCCESS;
-  int32_t iCurIdx    = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
-
-  Uninit (iType);
-
-  WelsMutexLock (&m_mutes);
-
-  IStrategy* pStrategy = m_pStgChain[iCurIdx];
-  if (pStrategy)
-    eReturn = pStrategy->Init (0, pCfg);
-
-  WelsMutexUnlock (&m_mutes);
-
-  return eReturn;
-}
-
-EResult CVpFrameWork::Uninit (int32_t iType) {
-  EResult eReturn        = RET_SUCCESS;
-  int32_t iCurIdx    = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
-
-  WelsMutexLock (&m_mutes);
-
-  IStrategy* pStrategy = m_pStgChain[iCurIdx];
-  if (pStrategy)
-    eReturn = pStrategy->Uninit (0);
-
-  WelsMutexUnlock (&m_mutes);
-
-  return eReturn;
-}
-
-EResult CVpFrameWork::Flush (int32_t iType) {
-  EResult eReturn        = RET_SUCCESS;
-
-  return eReturn;
-}
-
-EResult CVpFrameWork::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDstPixMap) {
-  EResult eReturn        = RET_NOTSUPPORTED;
-  EMethods eMethod    = WelsVpGetValidMethod (iType);
-  int32_t iCurIdx    = WelsStaticCast (int32_t, eMethod) - 1;
-  SPixMap sSrcPic;
-  SPixMap sDstPic;
-  memset (&sSrcPic, 0, sizeof (sSrcPic)); // confirmed_safe_unsafe_usage
-  memset (&sDstPic, 0, sizeof (sDstPic)); // confirmed_safe_unsafe_usage
-
-  if (pSrcPixMap) sSrcPic = *pSrcPixMap;
-  if (pDstPixMap) sDstPic = *pDstPixMap;
-  if (!CheckValid (eMethod, sSrcPic, sDstPic))
-    return RET_INVALIDPARAM;
-
-  WelsMutexLock (&m_mutes);
-
-  IStrategy* pStrategy = m_pStgChain[iCurIdx];
-  if (pStrategy)
-    eReturn = pStrategy->Process (0, &sSrcPic, &sDstPic);
-
-  WelsMutexUnlock (&m_mutes);
-
-  return eReturn;
-}
-
-EResult CVpFrameWork::Get (int32_t iType, void* pParam) {
-  EResult eReturn        = RET_SUCCESS;
-  int32_t iCurIdx    = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
-
-  if (!pParam)
-    return RET_INVALIDPARAM;
-
-  WelsMutexLock (&m_mutes);
-
-  IStrategy* pStrategy = m_pStgChain[iCurIdx];
-  if (pStrategy)
-    eReturn = pStrategy->Get (0, pParam);
-
-  WelsMutexUnlock (&m_mutes);
-
-  return eReturn;
-}
-
-EResult CVpFrameWork::Set (int32_t iType, void* pParam) {
-  EResult eReturn        = RET_SUCCESS;
-  int32_t iCurIdx    = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
-
-  if (!pParam)
-    return RET_INVALIDPARAM;
-
-  WelsMutexLock (&m_mutes);
-
-  IStrategy* pStrategy = m_pStgChain[iCurIdx];
-  if (pStrategy)
-    eReturn = pStrategy->Set (0, pParam);
-
-  WelsMutexUnlock (&m_mutes);
-
-  return eReturn;
-}
-
-EResult CVpFrameWork::SpecialFeature (int32_t iType, void* pIn, void* pOut) {
-  EResult eReturn        = RET_SUCCESS;
-
-  return eReturn;
-}
-
-bool_t  CVpFrameWork::CheckValid (EMethods eMethod, SPixMap& pSrcPixMap, SPixMap& pDstPixMap) {
-  bool_t eReturn = FALSE;
-
-  if (eMethod == METHOD_NULL)
-    goto exit;
-
-  if (eMethod != METHOD_COLORSPACE_CONVERT) {
-    if (pSrcPixMap.pPixel[0]) {
-      if (pSrcPixMap.eFormat != VIDEO_FORMAT_I420 && pSrcPixMap.eFormat != VIDEO_FORMAT_YV12)
-        goto exit;
-    }
-    if (pSrcPixMap.pPixel[0] && pDstPixMap.pPixel[0]) {
-      if (pDstPixMap.eFormat != pSrcPixMap.eFormat)
-        goto exit;
-    }
-  }
-
-  if (pSrcPixMap.pPixel[0]) {
-    if (pSrcPixMap.sRect.iRectWidth <= 0 || pSrcPixMap.sRect.iRectWidth > MAX_WIDTH || pSrcPixMap.sRect.iRectHeight <= 0
-        || pSrcPixMap.sRect.iRectHeight > MAX_HEIGHT)
-      goto exit;
-    if (pSrcPixMap.sRect.iRectTop >= pSrcPixMap.sRect.iRectHeight
-        || pSrcPixMap.sRect.iRectLeft >= pSrcPixMap.sRect.iRectWidth || pSrcPixMap.sRect.iRectWidth > pSrcPixMap.iStride[0])
-      goto exit;
-  }
-  if (pDstPixMap.pPixel[0]) {
-    if (pDstPixMap.sRect.iRectWidth <= 0 || pDstPixMap.sRect.iRectWidth > MAX_WIDTH || pDstPixMap.sRect.iRectHeight <= 0
-        || pDstPixMap.sRect.iRectHeight > MAX_HEIGHT)
-      goto exit;
-    if (pDstPixMap.sRect.iRectTop >= pDstPixMap.sRect.iRectHeight
-        || pDstPixMap.sRect.iRectLeft >= pDstPixMap.sRect.iRectWidth || pDstPixMap.sRect.iRectWidth > pDstPixMap.iStride[0])
-      goto exit;
-  }
-  eReturn = TRUE;
-
-exit:
-  return eReturn;
-}
-
-IStrategy* CVpFrameWork::CreateStrategy (EMethods m_eMethod, int32_t iCpuFlag) {
-  IStrategy* pStrategy = NULL;
-
-  switch (m_eMethod) {
-  case METHOD_COLORSPACE_CONVERT:
-    //not support yet
-    break;
-  case METHOD_DENOISE:
-    pStrategy = WelsDynamicCast (IStrategy*, new CDenoiser (iCpuFlag));
-    break;
-  case METHOD_SCENE_CHANGE_DETECTION:
-    pStrategy = WelsDynamicCast (IStrategy*, new CSceneChangeDetection (iCpuFlag));
-    break;
-  case METHOD_DOWNSAMPLE:
-    pStrategy = WelsDynamicCast (IStrategy*, new CDownsampling (iCpuFlag));
-    break;
-  case METHOD_VAA_STATISTICS:
-    pStrategy = WelsDynamicCast (IStrategy*, new CVAACalculation (iCpuFlag));
-    break;
-  case METHOD_BACKGROUND_DETECTION:
-    pStrategy = WelsDynamicCast (IStrategy*, new CBackgroundDetection (iCpuFlag));
-    break;
-  case METHOD_ADAPTIVE_QUANT:
-    pStrategy = WelsDynamicCast (IStrategy*, new CAdaptiveQuantization (iCpuFlag));
-    break;
-  case METHOD_COMPLEXITY_ANALYSIS:
-    pStrategy = WelsDynamicCast (IStrategy*, new CComplexityAnalysis (iCpuFlag));
-    break;
-  case METHOD_IMAGE_ROTATE:
-    pStrategy = WelsDynamicCast (IStrategy*, new CImageRotating (iCpuFlag));
-    break;
-  default:
-    break;
-  }
-
-  return pStrategy;
-}
-
-WELSVP_NAMESPACE_END
--- a/processing/src/common/WelsFrameWork.h
+++ /dev/null
@@ -1,130 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	    :  WelsFrameWork.h
- *
- * \brief	    :  framework of wels video processor class
- *
- * \date        :  2011/01/04
- *
- * \description :
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_WELSFRAMEWORK_H
-#define WELSVP_WELSFRAMEWORK_H
-
-#include "../../interface/IWelsVP.h"
-#include "util.h"
-#include "thread.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-EResult CreateSpecificVpInterface (IWelsVP** ppCtx);
-EResult DestroySpecificVpInterface (IWelsVP* pCtx);
-
-EResult CreateSpecificVpInterface (IWelsVPc** ppCtx);
-EResult DestroySpecificVpInterface (IWelsVPc* pCtx);
-
-#define MAX_STRATEGY_NUM (METHOD_MASK - 1)
-
-class IStrategy : public IWelsVP {
- public:
-  IStrategy() {
-    m_eMethod  = METHOD_NULL;
-    m_eFormat  = VIDEO_FORMAT_I420;
-    m_iIndex   = 0;
-    m_bInit    = FALSE;
-  };
-
-  virtual ~IStrategy() {}
-
- public:
-  virtual EResult Init (int32_t iType, void* pCfg)  {
-    return RET_SUCCESS;
-  }
-  virtual EResult Uninit (int32_t iType)              {
-    return RET_SUCCESS;
-  }
-  virtual EResult Flush (int32_t iType)               {
-    return RET_SUCCESS;
-  }
-  virtual EResult Get (int32_t iType, void* pParam) {
-    return RET_SUCCESS;
-  }
-  virtual EResult Set (int32_t iType, void* pParam) {
-    return RET_SUCCESS;
-  }
-  virtual EResult SpecialFeature (int32_t iType, void* pIn, void* pOut) {
-    return RET_SUCCESS;
-  }
-  virtual EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst) = 0;
-
- public:
-  EMethods       m_eMethod;
-  EVideoFormat m_eFormat;
-  int32_t           m_iIndex;
-  bool_t            m_bInit;
-};
-
-class CVpFrameWork : public IWelsVP {
- public:
-  CVpFrameWork (uint32_t uiThreadsNum, EResult& ret);
-  ~CVpFrameWork();
-
- public:
-  EResult Init (int32_t iType, void* pCfg);
-
-  EResult Uninit (int32_t iType);
-
-  EResult Flush (int32_t iType);
-
-  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst);
-
-  EResult Get (int32_t iType, void* pParam);
-
-  EResult Set (int32_t iType, void* pParam);
-
-  EResult SpecialFeature (int32_t iType, void* pIn, void* pOut);
-
- private:
-  bool_t  CheckValid (EMethods eMethod, SPixMap& sSrc, SPixMap& sDst);
-  IStrategy* CreateStrategy (EMethods eMethod, int32_t iCpuFlag);
-
- private:
-  IStrategy* m_pStgChain[MAX_STRATEGY_NUM];
-
-  WELS_MUTEX m_mutes;
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/common/WelsFrameWorkEx.cpp
+++ /dev/null
@@ -1,96 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "WelsFrameWork.h"
-
-///////////////////////////////////////////////////////////////////////
-
-WELSVP_NAMESPACE_BEGIN
-
-EResult Init (void* pCtx, int32_t iType, void* pCfg) {
-  return pCtx ? WelsStaticCast (IWelsVP*, pCtx)->Init (iType, pCfg) : RET_INVALIDPARAM;
-}
-EResult Uninit (void* pCtx, int32_t iType) {
-  return pCtx ? WelsStaticCast (IWelsVP*, pCtx)->Uninit (iType) : RET_INVALIDPARAM;
-}
-EResult Flush (void* pCtx, int32_t iType) {
-  return pCtx ? WelsStaticCast (IWelsVP*, pCtx)->Flush (iType) : RET_INVALIDPARAM;
-}
-EResult Process (void* pCtx, int32_t iType, SPixMap* pSrc, SPixMap* dst) {
-  return pCtx ? WelsStaticCast (IWelsVP*, pCtx)->Process (iType, pSrc, dst) : RET_INVALIDPARAM;
-}
-EResult Get (void* pCtx, int32_t iType, void* pParam) {
-  return pCtx ? WelsStaticCast (IWelsVP*, pCtx)->Get (iType, pParam) : RET_INVALIDPARAM;
-}
-EResult Set (void* pCtx, int32_t iType, void* pParam) {
-  return pCtx ? WelsStaticCast (IWelsVP*, pCtx)->Set (iType, pParam) : RET_INVALIDPARAM;
-}
-EResult SpecialFeature (void* pCtx, int32_t iType, void* pIn, void* pOut) {
-  return pCtx ? WelsStaticCast (IWelsVP*, pCtx)->SpecialFeature (iType, pIn, pOut) : RET_INVALIDPARAM;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-EResult CreateSpecificVpInterface (IWelsVPc** pCtx) {
-  EResult  ret     = RET_FAILED;
-  IWelsVP* pWelsVP = NULL;
-
-  ret = CreateSpecificVpInterface (&pWelsVP);
-  if (ret == RET_SUCCESS) {
-    IWelsVPc* pVPc = new IWelsVPc;
-    if (pVPc) {
-      pVPc->Init    = Init;
-      pVPc->Uninit  = Uninit;
-      pVPc->Flush   = Flush;
-      pVPc->Process = Process;
-      pVPc->Get     = Get;
-      pVPc->Set     = Set;
-      pVPc->SpecialFeature = SpecialFeature;
-      pVPc->pCtx       = WelsStaticCast (void*, pWelsVP);
-      *pCtx            = pVPc;
-    } else
-      ret = RET_OUTOFMEMORY;
-  }
-
-  return ret;
-}
-
-EResult DestroySpecificVpInterface (IWelsVPc* pCtx) {
-  if (pCtx) {
-    DestroySpecificVpInterface (WelsStaticCast (IWelsVP*, pCtx->pCtx));
-    _SafeDelete (pCtx);
-  }
-
-  return RET_SUCCESS;
-}
-
-WELSVP_NAMESPACE_END
binary files a/processing/src/common/WelsVP.aps /dev/null differ
--- a/processing/src/common/WelsVP.def
+++ /dev/null
@@ -1,36 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2011-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-
-LIBRARY		    welsvp.dll
-EXPORTS
-                CreateVpInterface    PRIVATE
-                DestroyVpInterface   PRIVATE
\ No newline at end of file
--- a/processing/src/common/WelsVP.rc
+++ /dev/null
@@ -1,115 +1,0 @@
-// Microsoft Visual C++ generated resource script.
-//
-#include "resource.h"
-
-#define APSTUDIO_READONLY_SYMBOLS
-/////////////////////////////////////////////////////////////////////////////
-//
-// Generated from the TEXTINCLUDE 2 resource.
-//
-#include "windows.h"
-
-/////////////////////////////////////////////////////////////////////////////
-#undef APSTUDIO_READONLY_SYMBOLS
-
-/////////////////////////////////////////////////////////////////////////////
-// Chinese (P.R.C.) resources
-
-#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_CHS)
-#ifdef _WIN32
-LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED
-#pragma code_page(936)
-#endif //_WIN32
-
-#ifdef APSTUDIO_INVOKED
-/////////////////////////////////////////////////////////////////////////////
-//
-// TEXTINCLUDE
-//
-
-1 TEXTINCLUDE
-BEGIN
-    "resource.h\0"
-END
-
-2 TEXTINCLUDE
-BEGIN
-    "#include ""windows.h""\r\n"
-    "\0"
-END
-
-3 TEXTINCLUDE
-BEGIN
-    "\r\n"
-    "\0"
-END
-
-#endif    // APSTUDIO_INVOKED
-
-#endif    // Chinese (P.R.C.) resources
-/////////////////////////////////////////////////////////////////////////////
-
-
-/////////////////////////////////////////////////////////////////////////////
-// English (U.S.) resources
-
-#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU)
-#ifdef _WIN32
-LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US
-#pragma code_page(1252)
-#endif //_WIN32
-
-/////////////////////////////////////////////////////////////////////////////
-//
-// Version
-//
-
-VS_VERSION_INFO VERSIONINFO
- FILEVERSION 0,0,0,0
- PRODUCTVERSION 0,0,0,0
- FILEFLAGSMASK 0x3fL
-#ifdef _DEBUG
- FILEFLAGS 0x1L
-#else
- FILEFLAGS 0x0L
-#endif
- FILEOS 0x40004L
- FILETYPE 0x2L
- FILESUBTYPE 0x0L
-BEGIN
-    BLOCK "StringFileInfo"
-    BEGIN
-        BLOCK "040904b0"
-        BEGIN
-            VALUE "Comments", "Cisco OpenH264  video preprocessing"
-            VALUE "CompanyName", "Cisco Systems"
-            VALUE "FileDescription", "Cisco OpenH264  video preprocessing"
-            VALUE "FileVersion", "0, 0, 0, 0"
-            VALUE "InternalName", "welsvp.dll"
-            VALUE "LegalCopyright", "� 2011-2015 Cisco and/or its affiliates. All rights reserved."
-            VALUE "OriginalFilename", "welsvp.dll"
-            VALUE "ProductName", "Cisco OpenH264 video preprocessing"
-            VALUE "ProductVersion", "0, 0, 0, 0"
-        END
-    END
-    BLOCK "VarFileInfo"
-    BEGIN
-        VALUE "Translation", 0x409, 1200
-    END
-END
-
-#endif    // English (U.S.) resources
-/////////////////////////////////////////////////////////////////////////////
-
-
-
-#ifndef APSTUDIO_INVOKED
-/////////////////////////////////////////////////////////////////////////////
-//
-// Generated from the TEXTINCLUDE 3 resource.
-//
-
-
-/////////////////////////////////////////////////////////////////////////////
-#endif    // not APSTUDIO_INVOKED
-
--- a/processing/src/common/cpu.cpp
+++ /dev/null
@@ -1,196 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	cpu.c
- *
- * \brief	CPU compatibility detection
- *
- * \date	04/29/2009 Created
- *
- *************************************************************************************
- */
-
-#include "util.h"
-#include "cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-#define    CPU_Vender_AMD    "AuthenticAMD"
-#define    CPU_Vender_INTEL  "GenuineIntel"
-#define    CPU_Vender_CYRIX  "CyrixInstead"
-
-
-#if defined(X86_ASM)
-
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
-  uint32_t uiCPU = 0;
-  uint32_t uiFeatureA = 0, uiFeatureB = 0, uiFeatureC = 0, uiFeatureD = 0;
-  int32_t  CacheLineSize = 0;
-  int8_t   chVenderName[16] = { 0 };
-
-  if (!WelsCPUIdVerify()) {
-    /* cpuid is not supported in cpu */
-    return 0;
-  }
-
-  WelsCPUId (0, &uiFeatureA, (uint32_t*)&chVenderName[0], (uint32_t*)&chVenderName[8], (uint32_t*)&chVenderName[4]);
-  if (uiFeatureA == 0) {
-    /* maximum input value for basic cpuid information */
-    return 0;
-  }
-
-  WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-  if ((uiFeatureD & 0x00800000) == 0) {
-    /* Basic MMX technology is not support in cpu, mean nothing for us so return here */
-    return 0;
-  }
-
-  uiCPU = WELS_CPU_MMX;
-  if (uiFeatureD & 0x02000000) {
-    /* SSE technology is identical to AMD MMX extensions */
-    uiCPU |= WELS_CPU_MMXEXT | WELS_CPU_SSE;
-  }
-  if (uiFeatureD & 0x04000000) {
-    /* SSE2 support here */
-    uiCPU |= WELS_CPU_SSE2;
-  }
-  if (uiFeatureD & 0x00000001) {
-    /* x87 FPU on-chip checking */
-    uiCPU |= WELS_CPU_FPU;
-  }
-  if (uiFeatureD & 0x00008000) {
-    /* CMOV instruction checking */
-    uiCPU |= WELS_CPU_CMOV;
-  }
-  if (!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL)) {	// confirmed_safe_unsafe_usage
-    if (uiFeatureD & 0x10000000) {
-      /* Multi-Threading checking: contains of multiple logic processors */
-      uiCPU |= WELS_CPU_HTT;
-    }
-  }
-
-  if (uiFeatureC & 0x00000001) {
-    /* SSE3 support here */
-    uiCPU |= WELS_CPU_SSE3;
-  }
-  if (uiFeatureC & 0x00000200) {
-    /* SSSE3 support here */
-    uiCPU |= WELS_CPU_SSSE3;
-  }
-  if (uiFeatureC & 0x00080000) {
-    /* SSE4.1 support here, 45nm Penryn processor */
-    uiCPU |= WELS_CPU_SSE41;
-  }
-  if (uiFeatureC & 0x00100000) {
-    /* SSE4.2 support here, next generation Nehalem processor */
-    uiCPU |= WELS_CPU_SSE42;
-  }
-  if (WelsCPUSupportAVX (uiFeatureA, uiFeatureC)) {	//
-    /* AVX supported */
-    uiCPU |= WELS_CPU_AVX;
-  }
-  if (WelsCPUSupportFMA (uiFeatureA, uiFeatureC)) {	//
-    /* AVX FMA supported */
-    uiCPU |= WELS_CPU_FMA;
-  }
-  if (uiFeatureC & 0x02000000) {
-    /* AES checking */
-    uiCPU |= WELS_CPU_AES;
-  }
-  if (uiFeatureC & 0x00400000) {
-    /* MOVBE checking */
-    uiCPU |= WELS_CPU_MOVBE;
-  }
-
-  if (pNumberOfLogicProcessors != NULL) {
-    // HTT enabled on chip
-    *pNumberOfLogicProcessors = (uiFeatureB & 0x00ff0000) >> 16; // feature bits: 23-16 on returned EBX
-  }
-
-  WelsCPUId (0x80000000, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-
-  if ((!strcmp ((const str_t*)chVenderName, CPU_Vender_AMD))
-      && (uiFeatureA >= 0x80000001)) {	// confirmed_safe_unsafe_usage
-    WelsCPUId (0x80000001, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-    if (uiFeatureD & 0x00400000) {
-      uiCPU |= WELS_CPU_MMXEXT;
-    }
-    if (uiFeatureD & 0x80000000) {
-      uiCPU |= WELS_CPU_3DNOW;
-    }
-  }
-
-  if (!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL)) {	// confirmed_safe_unsafe_usage
-    int32_t  family, model;
-
-    WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-    family = ((uiFeatureA >> 8) & 0xf) + ((uiFeatureA >> 20) & 0xff);
-    model  = ((uiFeatureA >> 4) & 0xf) + ((uiFeatureA >> 12) & 0xf0);
-
-    if ((family == 6) && (model == 9 || model == 13 || model == 14)) {
-      uiCPU &= ~ (WELS_CPU_SSE2 | WELS_CPU_SSE3);
-    }
-  }
-
-  // get cache line size
-  if ((!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL))
-      || ! (strcmp ((const str_t*)chVenderName, CPU_Vender_CYRIX))) {	// confirmed_safe_unsafe_usage
-    WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-
-    CacheLineSize = (uiFeatureB & 0xff00) >>
-                    5;	// ((clflush_line_size >> 8) << 3), CLFLUSH_line_size * 8 = CacheLineSize_in_byte
-
-    if (CacheLineSize == 128) {
-      uiCPU |= WELS_CPU_CACHELINE_128;
-    } else if (CacheLineSize == 64) {
-      uiCPU |= WELS_CPU_CACHELINE_64;
-    } else if (CacheLineSize == 32) {
-      uiCPU |= WELS_CPU_CACHELINE_32;
-    } else if (CacheLineSize == 16) {
-      uiCPU |= WELS_CPU_CACHELINE_16;
-    }
-  }
-
-  return uiCPU;
-}
-
-
-void WelsCPURestore (const uint32_t kuiCPU) {
-  if (kuiCPU & (WELS_CPU_MMX | WELS_CPU_MMXEXT | WELS_CPU_3DNOW | WELS_CPU_3DNOWEXT)) {
-    WelsEmms();
-  }
-}
-
-#endif
-
-
-WELSVP_NAMESPACE_END
-
-
--- a/processing/src/common/cpu.h
+++ /dev/null
@@ -1,102 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	cpu.h
- *
- * \brief	CPU feature compatibility detection
- *
- * \date	04/29/2009 Created
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_CPU_H
-#define WELSVP_CPU_H
-
-#include "typedef.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-/*
- *	WELS CPU feature flags
- */
-#define WELS_CPU_MMX        0x00000001    /* mmx */
-#define WELS_CPU_MMXEXT     0x00000002    /* mmx-ext*/
-#define WELS_CPU_SSE        0x00000004    /* sse */
-#define WELS_CPU_SSE2       0x00000008    /* sse 2 */
-#define WELS_CPU_SSE3       0x00000010    /* sse 3 */
-#define WELS_CPU_SSE41      0x00000020    /* sse 4.1 */
-#define WELS_CPU_3DNOW      0x00000040    /* 3dnow! */
-#define WELS_CPU_3DNOWEXT   0x00000080    /* 3dnow! ext */
-#define WELS_CPU_ALTIVEC    0x00000100    /* altivec */
-#define WELS_CPU_SSSE3      0x00000200    /* ssse3 */
-#define WELS_CPU_SSE42      0x00000400    /* sse 4.2 */
-
-/* CPU features application extensive */
-#define WELS_CPU_AVX		0x00000800	/* Advanced Vector eXtentions */
-#define WELS_CPU_FPU		0x00001000	/* x87-FPU on chip */
-#define WELS_CPU_HTT		0x00002000	/* Hyper-Threading Technology (HTT), Multi-threading enabled feature: 
-										   physical processor package is capable of supporting more than one logic processor
-										*/
-#define WELS_CPU_CMOV		0x00004000	/* Conditional Move Instructions,
-										   also if x87-FPU is present at indicated by the CPUID.FPU feature bit, then FCOMI and FCMOV are supported
-										*/
-#define WELS_CPU_MOVBE		0x00008000	/* MOVBE instruction */
-#define WELS_CPU_AES		0x00010000	/* AES instruction extensions */
-#define WELS_CPU_FMA		0x00020000	/* AVX VEX FMA instruction sets */
-
-#define WELS_CPU_CACHELINE_16    0x10000000    /* CacheLine Size 16 */
-#define WELS_CPU_CACHELINE_32    0x20000000    /* CacheLine Size 32 */
-#define WELS_CPU_CACHELINE_64    0x40000000    /* CacheLine Size 64 */
-#define WELS_CPU_CACHELINE_128   0x80000000    /* CacheLine Size 128 */
-
-/*
- *	Interfaces for CPU core feature detection as below
- */
-
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-
-int32_t WelsCPUIdVerify();
-
-void  WelsCPUId (uint32_t uiIndex, uint32_t* pFeatureA, uint32_t* pFeatureB, uint32_t* pFeatureC, uint32_t* pFeatureD);
-int32_t WelsCPUSupportAVX (uint32_t eax, uint32_t ecx);
-int32_t WelsCPUSupportFMA (uint32_t eax, uint32_t ecx);
-
-void  WelsEmms();
-
-WELSVP_EXTERN_C_END
-#endif
-
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors);
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/common/memory.cpp
+++ /dev/null
@@ -1,117 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "memory.h"
-
-WELSVP_NAMESPACE_BEGIN
-/////////////////////////////////////////////////////////////////////////////////
-
-// Allocates a zero-filled block of kuiSize bytes whose start address is a
-// multiple of ALIGNBYTES.
-//
-// Layout of the underlying ::malloc block:
-//   [ padding | int32_t requested size | void* raw malloc pointer | aligned user data ... ]
-// The raw pointer sits immediately below the returned address and is what
-// WelsFree hands to ::free; the stored size is read back by InternalReallocate.
-//
-// kuiSize : number of usable bytes requested.
-// pTag    : not referenced by this implementation.
-// Returns the aligned pointer, or NULL if the allocation fails or if the
-// request plus bookkeeping overhead does not fit in 32 bits.
-void* WelsMalloc (const uint32_t kuiSize, str_t* pTag) {
-  const int32_t kiSizeVoidPointer	= sizeof (void**);
-  const int32_t kiSizeInt32		= sizeof (int32_t);
-  const int32_t kiAlignedBytes	= ALIGNBYTES - 1;
-
-  // Computed once in unsigned 32-bit arithmetic so the wrap-around check,
-  // the allocation and the zero fill all agree on the same value.
-  const uint32_t kuiTotalSize = kuiSize + kiAlignedBytes + kiSizeVoidPointer + kiSizeInt32;
-  if (kuiTotalSize < kuiSize)	// the sum wrapped: the block would be too small for kuiSize bytes
-    return NULL;
-
-  uint8_t* pBuf = (uint8_t*) ::malloc (kuiTotalSize);
-  if (NULL == pBuf)
-    return NULL;
-
-  // to fill zero values
-  WelsMemset (pBuf, 0, kuiTotalSize);
-
-  // Skip past the worst-case overhead, then round down to the alignment boundary.
-  uint8_t* pAlignedBuf = pBuf + kiAlignedBytes + kiSizeVoidPointer + kiSizeInt32;
-  pAlignedBuf -= WelsCastFromPointer (pAlignedBuf) & kiAlignedBytes;
-  * ((void**) (pAlignedBuf - kiSizeVoidPointer)) = pBuf;
-  * ((int32_t*) (pAlignedBuf - (kiSizeVoidPointer + kiSizeInt32))) = kuiSize;
-
-  return (pAlignedBuf);
-}
-
-/////////////////////////////////////////////////////////////////////////////
-
-// Releases a block obtained from WelsMalloc. The pointer that ::malloc
-// originally returned is stored in the void* slot directly below the aligned
-// address, so that slot is what gets passed to ::free. A NULL pPointer is
-// ignored; pTag is not referenced.
-void WelsFree (void* pPointer, str_t* pTag) {
-  if (NULL == pPointer)
-    return;
-
-  void** ppSlotAboveRaw = (void**) pPointer;
-  ::free (ppSlotAboveRaw[-1]);
-}
-
-/////////////////////////////////////////////////////////////////////////////
-
-// Moves a WelsMalloc block into a freshly allocated block of kuiSize bytes.
-//
-// pPointer : block previously returned by WelsMalloc, or NULL (then this is
-//            simply WelsMalloc (kuiSize, pTag)).
-// kuiSize  : size of the new block in bytes.
-// pTag     : forwarded to WelsMalloc / WelsFree.
-//
-// Returns the new block on success (the old block is freed after its
-// contents, up to the smaller of the two sizes, have been copied).
-// If the new allocation fails, the old block is returned unchanged when it is
-// already at least kuiSize bytes, otherwise NULL is returned.
-// If either the recorded old size or kuiSize is zero, NULL is returned.
-// Whenever NULL is returned for a non-NULL pPointer, the old block is left
-// untouched and is still owned by the caller.
-void* InternalReallocate (void* pPointer, const uint32_t kuiSize, str_t* pTag) {
-  if (NULL == pPointer)
-    return WelsMalloc (kuiSize, pTag);
-
-  // WelsMalloc records the requested size in the int32_t just below the stored raw pointer.
-  const uint32_t kuiOldSize = * ((int32_t*) ((uint8_t*) pPointer - sizeof (void**) - sizeof (int32_t)));
-
-  uint8_t* pNew = (uint8_t*)WelsMalloc (kuiSize, pTag);
-  if (NULL == pNew) {
-    if (kuiOldSize > 0 && kuiSize > 0 && kuiOldSize >= kuiSize)
-      return (pPointer);
-    return NULL;
-  }
-
-  if (0 == kuiOldSize || 0 == kuiSize) {
-    // Same NULL result as before, but the freshly allocated block is released
-    // instead of being leaked.
-    WelsFree (pNew, pTag);
-    return NULL;
-  }
-
-  memcpy (pNew, pPointer, (kuiOldSize < kuiSize) ? kuiOldSize : kuiSize);
-
-  WelsFree (pPointer, pTag);
-  return (pNew);
-}
-
-/////////////////////////////////////////////////////////////////////////////
-
-// Grows a WelsMalloc block when the caller's recorded capacity is too small.
-//
-// pPointer  : current block (may be NULL).
-// pRealSize : in: capacity the caller currently records for pPointer;
-//             out: updated to the new capacity when a reallocation succeeds.
-// kuiSize   : number of bytes now required.
-// pTag      : forwarded to InternalReallocate.
-//
-// Returns pPointer unchanged when *pRealSize >= kuiSize. Otherwise requests
-// kuiSize rounded up to a multiple of 16 plus 32 extra bytes and returns the
-// new block, or NULL on failure (in which case *pRealSize is not modified).
-// A NULL pRealSize is rejected with NULL.
-void* WelsRealloc (void* pPointer, uint32_t* pRealSize, const uint32_t kuiSize, str_t* pTag) {
-  if (NULL == pRealSize)
-    return NULL;
-
-  if (*pRealSize >= kuiSize)	// large enough of original block, so do nothing
-    return (pPointer);
-
-  // new request: round up to a 16-byte multiple, then add 32 bytes of headroom
-  uint32_t uiNewSize = kuiSize + 15;
-  uiNewSize -= (uiNewSize & 15);
-  uiNewSize += 32;
-
-  void* pLocalPointer = InternalReallocate (pPointer, uiNewSize, pTag);
-  if (NULL != pLocalPointer)
-    *pRealSize	= uiNewSize;
-
-  return (pLocalPointer);
-}
-
-WELSVP_NAMESPACE_END
--- a/processing/src/common/memory.h
+++ /dev/null
@@ -1,110 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	    :  memory.h
- *
- * \brief	    :  memory definition for wels video processor class
- *
- * \date        :  2011/02/22
- *
- * \description :
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_MEMORY_H
-#define WELSVP_MEMORY_H
-
-#include "util.h"
-#include "typedef.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-inline_t void* WelsMemset (void* pPointer, int32_t iValue, uint32_t uiSize) {  // forwards directly to ::memset
-  return ::memset (pPointer, iValue, uiSize);
-}
-// Forwards directly to ::memcpy and returns its result.
-inline_t void* WelsMemcpy (void* pDst, const void* kpSrc, uint32_t uiSize) {
-  return ::memcpy (pDst, kpSrc, uiSize);
-}
-// Forwards directly to ::memcmp and returns its result.
-inline_t int32_t WelsMemcmp (const void* kpBuf1, const void* kpBuf2, uint32_t uiSize) {
-  return ::memcmp (kpBuf1, kpBuf2, uiSize);
-}
-
-/*!
-*************************************************************************************
-* \brief	malloc with zero filled utilization in Wels
-*
-* \param 	kuiSize	size in bytes of the memory block required
-*
-* \return	allocated memory pointer exactly, failed in case of NULL return
-*
-* \note	N/A
-*************************************************************************************
-*/
-void* WelsMalloc (const uint32_t kuiSize, str_t* pTag = NULL);
-
-/*!
-*************************************************************************************
-* \brief	free utilization in Wels
-*
-* \param 	pPointer	data pointer to be freed, as previously returned by WelsMalloc;
-*			a NULL pointer is ignored.
-*
-* \return	NONE
-*
-* \note	N/A
-*************************************************************************************
-*/
-void WelsFree (void* pPointer, str_t* pTag = NULL);
-
-/*!
-*************************************************************************************
-* \brief	reallocation in Wels. Do nothing and continue using old block
-*		in case the block is large enough currently
-*
-* \param 	pPointer	memory block obtained from an earlier allocation
-* \param	kuiSize	new size in bytes requested for the memory block
-* \param	pRealSize	pointer to the current size of the memory block; updated when the block is reallocated
-*
-* \return	reallocated memory pointer exactly, failed in case of NULL return
-*
-* \note	N/A
-*************************************************************************************
-*/
-void* WelsRealloc (void*  pPointer, uint32_t* pRealSize, const uint32_t kuiSize, str_t* pTag = NULL);
-
-//////////////////////////////////////////////////////////////////////////////////////
-WELSVP_NAMESPACE_END
-
-#endif
-
-
--- a/processing/src/common/resource.h
+++ /dev/null
@@ -1,15 +1,0 @@
-//{{NO_DEPENDENCIES}}
-// Microsoft Visual C++ generated include file.
-// Used by WelsVP.rc
-//
-
-// Next default values for new objects
-//
-#ifdef APSTUDIO_INVOKED
-#ifndef APSTUDIO_READONLY_SYMBOLS
-#define _APS_NEXT_RESOURCE_VALUE        101
-#define _APS_NEXT_COMMAND_VALUE         40001
-#define _APS_NEXT_CONTROL_VALUE         1000
-#define _APS_NEXT_SYMED_VALUE           101
-#endif
-#endif
--- a/processing/src/common/thread.cpp
+++ /dev/null
@@ -1,93 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	thread.cpp
- *
- * \brief	Interfaces introduced in thread programming
- *
- * \date	11/17/2009 Created
- *
- *************************************************************************************
- */
-
-#include "thread.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-#if defined(_WIN32)
-
-WELS_THREAD_ERROR_CODE    WelsMutexInit (WELS_MUTEX*    mutex) {  // Win32 branch: wraps InitializeCriticalSection
-  InitializeCriticalSection (mutex);
-
-  return WELS_THREAD_ERROR_OK;
-}
-// Win32 branch: enters the critical section and always reports WELS_THREAD_ERROR_OK.
-WELS_THREAD_ERROR_CODE    WelsMutexLock (WELS_MUTEX*    mutex) {
-  EnterCriticalSection (mutex);
-
-  return WELS_THREAD_ERROR_OK;
-}
-// Win32 branch: leaves the critical section and always reports WELS_THREAD_ERROR_OK.
-WELS_THREAD_ERROR_CODE    WelsMutexUnlock (WELS_MUTEX* mutex) {
-  LeaveCriticalSection (mutex);
-
-  return WELS_THREAD_ERROR_OK;
-}
-// Win32 branch: deletes the critical section and always reports WELS_THREAD_ERROR_OK.
-WELS_THREAD_ERROR_CODE    WelsMutexDestroy (WELS_MUTEX* mutex) {
-  DeleteCriticalSection (mutex);
-
-  return WELS_THREAD_ERROR_OK;
-}
-
-#elif  defined(__GNUC__)
-// GCC branch: each wrapper returns the result of the matching pthread_mutex_* call unchanged.
-WELS_THREAD_ERROR_CODE    WelsMutexInit (WELS_MUTEX*    mutex) {
-  return pthread_mutex_init (mutex, NULL);  // NULL attribute argument
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexLock (WELS_MUTEX*    mutex) {
-  return pthread_mutex_lock (mutex);
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexUnlock (WELS_MUTEX* mutex) {
-  return pthread_mutex_unlock (mutex);
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexDestroy (WELS_MUTEX* mutex) {
-  return pthread_mutex_destroy (mutex);
-}
-
-#endif
-
-WELSVP_NAMESPACE_END
-
-
-
--- a/processing/src/common/thread.h
+++ /dev/null
@@ -1,89 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	thread.h
- *
- * \brief	Interfaces introduced in thread programming
- *
- * \date	11/17/2009 Created
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_THREAD_H
-#define WELSVP_THREAD_H
-
-#include "typedef.h"
-
-#if defined(_WIN32)
-
-#include <windows.h>
-
-#elif defined(__GNUC__)
-
-#include <stdlib.h>
-#include <unistd.h>
-#include <pthread.h>
-#include <semaphore.h>
-#include <signal.h>
-#include <errno.h>
-
-#endif//WIN32
-
-WELSVP_NAMESPACE_BEGIN
-
-#if defined(_WIN32)
-
-typedef  HANDLE            WELS_THREAD_HANDLE;
-typedef  CRITICAL_SECTION  WELS_MUTEX;
-
-#elif defined(__GNUC__)
-
-typedef   pthread_t         WELS_THREAD_HANDLE;
-typedef   pthread_mutex_t   WELS_MUTEX;
-
-#endif
-
-typedef long_t WELS_THREAD_ERROR_CODE;
-
-#define   WELS_THREAD_ERROR_OK					0
-#define   WELS_THREAD_ERROR_GENERIAL			((unsigned long)(-1))
-#define   WELS_THREAD_ERROR_WAIT_OBJECT_0		0
-#define	  WELS_THREAD_ERROR_WAIT_TIMEOUT		((unsigned long)0x00000102L)
-#define	  WELS_THREAD_ERROR_WAIT_FAILED		    WELS_THREAD_ERROR_GENERIAL
-
-WELS_THREAD_ERROR_CODE   WelsMutexInit (WELS_MUTEX*    mutex);
-WELS_THREAD_ERROR_CODE   WelsMutexLock (WELS_MUTEX*    mutex);
-WELS_THREAD_ERROR_CODE   WelsMutexUnlock (WELS_MUTEX* mutex);
-WELS_THREAD_ERROR_CODE   WelsMutexDestroy (WELS_MUTEX* mutex);
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/common/typedef.h
+++ /dev/null
@@ -1,102 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	    :  typedef.h
- *
- * \brief	    :  basic type definition
- *
- * \date        :  2011/01/04
- *
- * \description :  1. Define basic type with platform-independent;
- *                 2. Define specific namespace to avoid name pollution;
- *                 3. C++ ONLY;
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_TYPEDEF_H
-#define WELSVP_TYPEDEF_H
-
-#define WELSVP_EXTERN_C_BEGIN       extern "C" {
-#define WELSVP_EXTERN_C_END         }
-
-#define WELSVP_NAMESPACE_BEGIN      namespace nsWelsVP {
-#define WELSVP_NAMESPACE_END        }
-
-WELSVP_NAMESPACE_BEGIN
-
-// Fixed-width integer aliases and the inline_t keyword macro.
-// The first branch is taken for MSVC on Windows, the second for every other
-// compiler. (The previous condition tested defined(_WIN32) twice; the
-// duplicate operand has been dropped without changing which branch is taken.)
-#if defined(_WIN32) && defined(_MSC_VER)
-
-typedef char               int8_t   ;
-typedef unsigned char      uint8_t  ;
-typedef short              int16_t  ;
-typedef unsigned short     uint16_t ;
-typedef int                int32_t  ;
-typedef unsigned int       uint32_t ;
-typedef __int64            int64_t  ;
-typedef unsigned __int64   uint64_t ;
-#define inline_t           _inline
-
-#else	// GCC
-
-// [comment]: some compilers may identify the type "char" as "unsigned char" as default, so declare it explicit
-typedef signed char        int8_t   ;
-typedef unsigned char      uint8_t  ;
-typedef signed short       int16_t  ;
-typedef unsigned short     uint16_t ;
-typedef signed int         int32_t  ;
-typedef unsigned int       uint32_t ;
-typedef long long          int64_t  ;
-typedef unsigned long long uint64_t ;
-#define inline_t           inline
-
-#endif
-
-typedef char    str_t    ; // [comment]: specific use plain char only for character parameters
-typedef long    long_t   ;
-typedef int32_t bool_t   ;
-
-#if defined(_WIN32) || defined(_MACH_PLATFORM) || defined(__GNUC__)
-typedef float   float_t  ;
-typedef double  double_t ;
-#endif
-
-#ifndef NULL
-#define NULL    0
-#endif
-
-enum {
-  FALSE = 0,
-  TRUE  = !FALSE
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/common/util.cpp
+++ /dev/null
@@ -1,45 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "util.h"
-
-WELSVP_NAMESPACE_BEGIN
-/////////////////////////////////////////////////////////////////////////////////
-
-
-int32_t  WelsStrCmp (const str_t* kpStr1, const str_t* kpStr2) {  // forwards to ::strcmp and returns its result unchanged
-  return ::strcmp (kpStr1, kpStr2);
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////
-WELSVP_NAMESPACE_END
--- a/processing/src/common/util.h
+++ /dev/null
@@ -1,107 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	    :  util.h
- *
- * \brief	    :  utils for wels video processor class
- *
- * \date        :  2011/01/04
- *
- * \description :
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_UTIL_H
-#define WELSVP_UTIL_H
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdarg.h>
-#include <assert.h>
-
-#include "typedef.h"
-#include "memory.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-#define MAX_WIDTH      (4096)
-#define MAX_HEIGHT     (2304)//MAX_FS_LEVEL51 (36864); MAX_FS_LEVEL51*256/4096 = 2304
-#define MB_WIDTH_LUMA  (16)
-#define PESN		   (1e-6)	// desired float precision
-
-#define MB_TYPE_INTRA4x4		0x00000001
-#define MB_TYPE_INTRA16x16	0x00000002
-#define MB_TYPE_INTRA_PCM		0x00000004
-#define MB_TYPE_INTRA			  (MB_TYPE_INTRA4x4 | MB_TYPE_INTRA16x16 | MB_TYPE_INTRA_PCM)
-#define IS_INTRA(type) ((type)&MB_TYPE_INTRA)
-
-#define WELS_MAX(x, y)	((x) > (y) ? (x) : (y))
-#define WELS_MIN(x, y)	((x) < (y) ? (x) : (y))
-#define WELS_SIGN(a)	((long_t)(a) >> 31)
-#define WELS_ABS(a)		((WELS_SIGN(a) ^ (long_t)(a)) - WELS_SIGN(a))
-#define WELS_CLAMP(x, minv, maxv)  WELS_MIN(WELS_MAX(x, minv), maxv)
-
-#define ALIGNBYTES         (16)       /* Worst case is requiring alignment to an 16 byte boundary */
-/* Round iInput up to the next multiple of ALIGNBYTES / 2 / 4 / 8.
-   WELS_ALIGN previously referred to ALIGNMENT, which this header does not define;
-   it now uses ALIGNBYTES. Arguments are parenthesized so that expressions such as
-   WELS_ALIGN8(a | b) are rounded as a whole. */
-#define WELS_ALIGN(iInput)   (((iInput)+(ALIGNBYTES-1)) & ~(ALIGNBYTES-1))
-#define WELS_ALIGN2(iInput)  (((iInput)+1) & ~1)
-#define WELS_ALIGN4(iInput)  (((iInput)+3) & ~3)
-#define WELS_ALIGN8(iInput)  (((iInput)+7) & ~7)
-
-#define WelsCastFromPointer(p)      (reinterpret_cast<long_t>(p))
-#define WelsStaticCast(type, p)  (static_cast<type>(p))
-#define WelsDynamicCast(type, p) (dynamic_cast<type>(p))
-
-#define GET_METHOD(x)  ((x) & 0xff)          // mask method as the lowest 8bits
-#define GET_SPECIAL(x) (((x) >> 8) & 0xff)   // mask special flag as 8bits
-
-// Extracts the method id from the low 8 bits of a and clamps it into the
-// range [METHOD_NULL + 1, METHOD_MASK - 1] before converting it to EMethods.
-inline_t EMethods WelsVpGetValidMethod (int32_t a) {
-  int32_t iMethod = GET_METHOD (a);
-
-  if (iMethod < METHOD_NULL + 1)
-    iMethod = METHOD_NULL + 1;
-  if (iMethod > METHOD_MASK - 1)
-    iMethod = METHOD_MASK - 1;
-
-  return WelsStaticCast (EMethods, iMethod);
-}
-
-
-#define _SafeFree(p)		if (p) { WelsFree(p); (p) = NULL; }
-#define _SafeDelete(p)		if (p) { delete (p); (p) = NULL; }
-
-
-//////////////////////////////////////////////////////////////////////////////////////
-
-int32_t   WelsStrCmp (const str_t* kpStr1, const str_t* kpStr2);
-
-
-//////////////////////////////////////////////////////////////////////////////////////
-WELSVP_NAMESPACE_END
-
-#endif
-
-
--- a/processing/src/complexityanalysis/ComplexityAnalysis.cpp
+++ /dev/null
@@ -1,304 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "ComplexityAnalysis.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CComplexityAnalysis::CComplexityAnalysis (int32_t iCpuFlag) {  // iCpuFlag is not referenced in this body
-  m_eMethod   = METHOD_COMPLEXITY_ANALYSIS;
-  m_pfGomSad   = NULL;  // selected later by InitGomSadFunc inside AnalyzeGomComplexityViaSad
-  WelsMemset (&m_sComplexityAnalysisParam, 0, sizeof (m_sComplexityAnalysisParam));  // all parameters start zeroed until Set() is called
-}
-
-CComplexityAnalysis::~CComplexityAnalysis() {
-}
-
-// Runs the analysis selected by m_sComplexityAnalysisParam.iComplexityAnalysisMode
-// on the given source/reference pictures. Returns RET_SUCCESS for the three
-// known modes and RET_INVALIDPARAM for any other mode value. iType is not used.
-EResult CComplexityAnalysis::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
-  switch (m_sComplexityAnalysisParam.iComplexityAnalysisMode) {
-  case FRAME_SAD:
-    AnalyzeFrameComplexityViaSad (pSrcPixMap, pRefPixMap);
-    return RET_SUCCESS;
-  case GOM_SAD:
-    AnalyzeGomComplexityViaSad (pSrcPixMap, pRefPixMap);
-    return RET_SUCCESS;
-  case GOM_VAR:
-    AnalyzeGomComplexityViaVar (pSrcPixMap, pRefPixMap);
-    return RET_SUCCESS;
-  default:
-    return RET_INVALIDPARAM;
-  }
-}
-
-
-// Copies the caller's SComplexityAnalysisParam into this object.
-// Returns RET_INVALIDPARAM when pParam is NULL, RET_SUCCESS otherwise.
-// iType is not used.
-EResult CComplexityAnalysis::Set (int32_t iType, void* pParam) {
-  if (NULL != pParam) {
-    SComplexityAnalysisParam* pIncoming = (SComplexityAnalysisParam*)pParam;
-    m_sComplexityAnalysisParam = *pIncoming;
-    return RET_SUCCESS;
-  }
-
-  return RET_INVALIDPARAM;
-}
-
-// Writes the most recently computed iFrameComplexity into the caller's
-// SComplexityAnalysisParam; no other field of *pParam is touched.
-// Returns RET_INVALIDPARAM when pParam is NULL, RET_SUCCESS otherwise.
-// iType is not used.
-EResult CComplexityAnalysis::Get (int32_t iType, void* pParam) {
-  if (NULL != pParam) {
-    SComplexityAnalysisParam* pOut = (SComplexityAnalysisParam*)pParam;
-    pOut->iFrameComplexity = m_sComplexityAnalysisParam.iFrameComplexity;
-    return RET_SUCCESS;
-  }
-
-  return RET_INVALIDPARAM;
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-// Frame-level complexity: starts from the precomputed iFrameSad in pCalcResult
-// and, when background detection (iCalcBgd) is enabled, replaces it with the
-// SAD accumulated over foreground macroblocks only.
-void CComplexityAnalysis::AnalyzeFrameComplexityViaSad (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
-  SVAACalcResult* pCalc = m_sComplexityAnalysisParam.pCalcResult;
-  m_sComplexityAnalysisParam.iFrameComplexity = pCalc->iFrameSad;
-
-  if (!m_sComplexityAnalysisParam.iCalcBgd)
-    return;
-
-  //BGD control
-  m_sComplexityAnalysisParam.iFrameComplexity = (int32_t)GetFrameSadExcludeBackground (pSrcPixMap, pRefPixMap);
-}
-
-// Sums the four 8x8 SAD values of every macroblock that is either not flagged
-// as background or whose reference MB type is intra, and counts those
-// macroblocks per GOM in pGomForegroundBlockNum. Macroblock counts come from
-// the source picture rectangle (16x16 units); pRefPixMap is not used here.
-// Returns the accumulated SAD.
-int32_t CComplexityAnalysis::GetFrameSadExcludeBackground (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
-  const int32_t kiMbCols   = pSrcPixMap->sRect.iRectWidth  >> 4;
-  const int32_t kiMbRows   = pSrcPixMap->sRect.iRectHeight >> 4;
-  const int32_t kiMbTotal  = kiMbCols * kiMbRows;
-  const int32_t kiMbPerGom = m_sComplexityAnalysisParam.iMbNumInGom;
-  const int32_t kiGomTotal = (kiMbTotal + kiMbPerGom - 1) / kiMbPerGom;
-
-  uint8_t*  pBgFlags  = (uint8_t*)m_sComplexityAnalysisParam.pBackgroundMbFlag;
-  uint32_t* pRefTypes = (uint32_t*)m_sComplexityAnalysisParam.uiRefMbType;
-  SVAACalcResult* pCalc = m_sComplexityAnalysisParam.pCalcResult;
-  int32_t*  pForegroundCount = m_sComplexityAnalysisParam.pGomForegroundBlockNum;
-
-  uint32_t uiSadTotal = 0;
-  for (int32_t iGom = 0; iGom < kiGomTotal; ++iGom) {
-    const int32_t kiFirstMb = iGom * kiMbPerGom;
-    const int32_t kiEndMb   = WELS_MIN ((iGom + 1) * kiMbPerGom, kiMbTotal);
-
-    for (int32_t iMb = kiFirstMb; iMb < kiEndMb; ++iMb) {
-      // Background macroblocks predicted from a non-intra reference are skipped.
-      if (pBgFlags[iMb] != 0 && !IS_INTRA (pRefTypes[iMb]))
-        continue;
-
-      ++pForegroundCount[iGom];
-      for (int32_t iBlk = 0; iBlk < 4; ++iBlk)
-        uiSadTotal += pCalc->pSad8x8[iMb][iBlk];
-    }
-  }
-
-  return (uiSadTotal);
-}
-
-
-// Selects the per-macroblock SAD accumulator: the background-aware variant
-// when iCalcBgd is non-zero, the unconditional one otherwise.
-void InitGomSadFunc (PGOMSadFunc& pfGomSad, uint8_t iCalcBgd) {
-  if (iCalcBgd)
-    pfGomSad = GomSampleSadExceptBackground;
-  else
-    pfGomSad = GomSampleSad;
-}
-
-// Adds the four 8x8 SAD values of one macroblock to *pGomSad and counts the
-// macroblock in *pGomForegroundBlockNum. pBackgroundMbFlag is ignored.
-void GomSampleSad (uint32_t* pGomSad, int32_t* pGomForegroundBlockNum, int32_t* pSad8x8, uint8_t pBackgroundMbFlag) {
-  ++ (*pGomForegroundBlockNum);
-  for (int32_t iBlk = 0; iBlk < 4; ++iBlk)
-    *pGomSad += pSad8x8[iBlk];
-}
-
-// Same accumulation as GomSampleSad, but only for macroblocks whose
-// pBackgroundMbFlag is zero; flagged macroblocks leave both outputs untouched.
-void GomSampleSadExceptBackground (uint32_t* pGomSad, int32_t* pGomForegroundBlockNum, int32_t* pSad8x8,
-                                   uint8_t pBackgroundMbFlag) {
-  if (pBackgroundMbFlag != 0)
-    return;
-
-  ++ (*pGomForegroundBlockNum);
-  for (int32_t iBlk = 0; iBlk < 4; ++iBlk)
-    *pGomSad += pSad8x8[iBlk];
-}
-
-// Per-GOM complexity from precomputed 8x8 SAD values.
-//
-// The picture is treated as a raster-ordered list of 16x16 macroblocks, cut
-// into groups (GOMs) of iMbNumInGom consecutive macroblocks. For each GOM the
-// accumulator chosen by InitGomSadFunc adds up the SADs (and counts foreground
-// macroblocks in pGomForegroundBlockNum[j]); the result is stored in
-// pGomComplexity[j], and the sum over all GOMs in iFrameComplexity.
-//
-// pSrcPixMap : only its rectangle size is read.
-// pRefPixMap : not used.
-//
-// Note: pBackgroundMbFlag and uiRefMbType are read for every macroblock even
-// when iCalcBgd is 0, because the flag is evaluated before the accumulator is
-// called.
-//
-// Changes from the previous version: the luma pointers, strides and sample
-// offsets that were computed but never read have been removed, and the
-// row-by-row walk (which visited exactly the macroblocks
-// [iGomMbStartIndex, iGomMbEndIndex) in order) is now a single loop. A
-// non-positive iMbNumInGom, which previously divided by zero or produced
-// meaningless ranges, now yields iFrameComplexity = 0 without touching the
-// per-GOM arrays.
-void CComplexityAnalysis::AnalyzeGomComplexityViaSad (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
-  int32_t iWidth     = pSrcPixMap->sRect.iRectWidth;
-  int32_t iHeight    = pSrcPixMap->sRect.iRectHeight;
-  int32_t iMbWidth  = iWidth  >> 4;
-  int32_t iMbHeight = iHeight >> 4;
-  int32_t iMbNum    = iMbWidth * iMbHeight;
-
-  int32_t iMbNumInGom = m_sComplexityAnalysisParam.iMbNumInGom;
-  if (iMbNumInGom <= 0) {
-    m_sComplexityAnalysisParam.iFrameComplexity = 0;
-    return;
-  }
-  int32_t iGomMbNum = (iMbNum + iMbNumInGom - 1) / iMbNumInGom;
-
-  uint8_t* pBackgroundMbFlag = (uint8_t*)m_sComplexityAnalysisParam.pBackgroundMbFlag;
-  uint32_t* uiRefMbType = (uint32_t*)m_sComplexityAnalysisParam.uiRefMbType;
-  SVAACalcResult* pVaaCalcResults = m_sComplexityAnalysisParam.pCalcResult;
-  int32_t*  pGomForegroundBlockNum = (int32_t*)m_sComplexityAnalysisParam.pGomForegroundBlockNum;
-  int32_t*  pGomComplexity = (int32_t*)m_sComplexityAnalysisParam.pGomComplexity;
-
-  InitGomSadFunc (m_pfGomSad, m_sComplexityAnalysisParam.iCalcBgd);
-
-  uint32_t uiFrameSad = 0;
-  for (int32_t j = 0; j < iGomMbNum; j ++) {
-    uint32_t uiGomSad = 0;
-
-    const int32_t iGomMbStartIndex = j * iMbNumInGom;
-    const int32_t iGomMbEndIndex = WELS_MIN ((j + 1) * iMbNumInGom, iMbNum);
-
-    for (int32_t i = iGomMbStartIndex; i < iGomMbEndIndex; i ++) {
-      // A macroblock counts as background only if it is flagged and its reference type is not intra.
-      m_pfGomSad (&uiGomSad, pGomForegroundBlockNum + j, pVaaCalcResults->pSad8x8[i], pBackgroundMbFlag[i]
-                  && !IS_INTRA (uiRefMbType[i]));
-    }
-
-    pGomComplexity[j] = uiGomSad;
-    uiFrameSad += pGomComplexity[j];
-  }
-
-  m_sComplexityAnalysisParam.iFrameComplexity = uiFrameSad;
-}
-
-
-// Per-GOM complexity from precomputed 16x16 sums.
-//
-// For every GOM (iMbNumInGom consecutive raster-ordered macroblocks) this
-// computes  sum(x^2) - (sum(x))^2 / N  from pSum16x16 / pSumOfSquare16x16,
-// where N is the number of luma samples in the GOM, and stores the result in
-// pGomComplexity[j].
-//
-// pSrcPixMap : only its rectangle size is read.
-// pRefPixMap : not used.
-//
-// Changes from the previous version:
-//  * (sum)^2 is formed in 64-bit arithmetic; in 32 bits it overflowed as soon
-//    as a GOM's sample sum exceeded 65535, i.e. for about two bright
-//    macroblocks.
-//  * N now covers every macroblock of the GOM. It used to be derived from the
-//    first macroblock row of the GOM only, which understated N for GOMs that
-//    span several rows. Results for GOMs within a single row are unchanged.
-//  * Source pointer, stride and sample-offset values that were computed but
-//    never read have been removed, and the row-by-row walk is a single loop
-//    over the same macroblock range.
-//  * A non-positive iMbNumInGom (previously a division by zero) returns
-//    without writing anything.
-void CComplexityAnalysis::AnalyzeGomComplexityViaVar (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
-  int32_t iWidth     = pSrcPixMap->sRect.iRectWidth;
-  int32_t iHeight    = pSrcPixMap->sRect.iRectHeight;
-  int32_t iMbWidth  = iWidth  >> 4;
-  int32_t iMbHeight = iHeight >> 4;
-  int32_t iMbNum    = iMbWidth * iMbHeight;
-
-  int32_t iMbNumInGom = m_sComplexityAnalysisParam.iMbNumInGom;
-  if (iMbNumInGom <= 0)
-    return;
-  int32_t iGomMbNum = (iMbNum + iMbNumInGom - 1) / iMbNumInGom;
-
-  SVAACalcResult* pVaaCalcResults = m_sComplexityAnalysisParam.pCalcResult;
-  int32_t*  pGomComplexity = (int32_t*)m_sComplexityAnalysisParam.pGomComplexity;
-
-  for (int32_t j = 0; j < iGomMbNum; j ++) {
-    uint32_t uiSampleSum = 0;
-    uint64_t uiSquareSum = 0;
-
-    const int32_t iGomMbStartIndex = j * iMbNumInGom;
-    const int32_t iGomMbEndIndex = WELS_MIN ((j + 1) * iMbNumInGom, iMbNum);
-
-    for (int32_t i = iGomMbStartIndex; i < iGomMbEndIndex; i ++) {
-      uiSampleSum += pVaaCalcResults->pSum16x16[i];
-      uiSquareSum += pVaaCalcResults->pSumOfSquare16x16[i];
-    }
-
-    // j < iGomMbNum guarantees at least one macroblock in the range, so this is never zero.
-    const int32_t iGomSampleNum = (iGomMbEndIndex - iGomMbStartIndex) * MB_WIDTH_LUMA * MB_WIDTH_LUMA;
-    const uint64_t uiSumSquared = (uint64_t)uiSampleSum * uiSampleSum;
-
-    pGomComplexity[j] = (int32_t) (uiSquareSum - uiSumSquared / iGomSampleNum);
-  }
-}
-
-
-WELSVP_NAMESPACE_END
--- a/processing/src/complexityanalysis/ComplexityAnalysis.h
+++ /dev/null
@@ -1,83 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
-* \file	        :  ComplexityAnalysis.h
-*
-* \brief	    :  complexity analysis class of wels video processor class
-*
-* \date         :  2011/03/28
-*
-* \description  :  1. rewrite the package code of complexity analysis class
-*
-*************************************************************************************
-*/
-
-#ifndef WELSVP_COMPLEXITYANALYSIS_H
-#define WELSVP_COMPLEXITYANALYSIS_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-// Common function type of the two GOM SAD routines declared below.
-// NOTE(review): pBackgroundMbFlag is taken by value (uint8_t) despite its "p" prefix --
-// confirm against the definitions before treating it as a pointer.
-typedef  void (GOMSadFunc) (uint32_t* pGomSad, int32_t* pGomForegroundBlockNum, int32_t* pSad8x8,
-                            uint8_t pBackgroundMbFlag);
-
-// Pointer form of GOMSadFunc; this is the type of CComplexityAnalysis::m_pfGomSad.
-typedef GOMSadFunc*   PGOMSadFunc;
-
-// The two routines sharing the GOMSadFunc signature (defined outside this header).
-GOMSadFunc      GomSampleSad;
-GOMSadFunc      GomSampleSadExceptBackground;
-
-/*!
- * Complexity analysis strategy of the video processor; derives from IStrategy.
- * Only the declaration lives here, so the notes below are limited to what the
- * signatures themselves show.
- */
-class CComplexityAnalysis : public IStrategy {
- public:
-  // iCpuFlag is supplied by the creator. NOTE(review): presumably the CPU capability
-  // bit mask used elsewhere in this module -- confirm in the implementation.
-  CComplexityAnalysis (int32_t iCpuFlag);
-  ~CComplexityAnalysis();
-
-  // Runs the analysis on pSrc with pRef as the second picture; iType is passed through.
-  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef);
-  // Write / read a parameter block through an untyped pointer.
-  // NOTE(review): presumably pParam points to an SComplexityAnalysisParam -- verify.
-  EResult Set (int32_t iType, void* pParam);
-  EResult Get (int32_t iType, void* pParam);
-
- private:
-  // Frame-level helpers working on the source/reference picture pair.
-  void AnalyzeFrameComplexityViaSad (SPixMap* pSrc, SPixMap* pRef);
-  int32_t GetFrameSadExcludeBackground (SPixMap* pSrc, SPixMap* pRef);
-
-  // GOM-level helpers: one based on SAD, one based on variance ("Var").
-  void AnalyzeGomComplexityViaSad (SPixMap* pSrc, SPixMap* pRef);
-  void AnalyzeGomComplexityViaVar (SPixMap* pSrc, SPixMap* pRef);
-
- private:
-  PGOMSadFunc m_pfGomSad;                               // selected GOMSadFunc routine
-  SComplexityAnalysisParam m_sComplexityAnalysisParam;  // parameter block owned by this object
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/denoise/denoise.cpp
+++ /dev/null
@@ -1,124 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "denoise.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-#define CALC_BI_STRIDE(iWidth, iBitcount)  ((((iWidth) * (iBitcount) + 31) & ~31) >> 3)
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/*!
- * Creates a denoiser configured with the compile-time defaults:
- * radius DENOISE_GRAY_RADIUS, sigma DENOISE_GRAY_SIGMA, all components enabled.
- * The filter table is zeroed and then filled by InitDenoiseFunc according to iCpuFlag.
- *
- * \param iCpuFlag  CPU capability bit mask (tested against WELS_CPU_* in InitDenoiseFunc)
- */
-CDenoiser::CDenoiser (int32_t iCpuFlag) {
-  m_CPUFlag = iCpuFlag;
-  m_eMethod   = METHOD_DENOISE;
-  WelsMemset (&m_pfDenoise, 0, sizeof (m_pfDenoise));
-
-  m_uiSpaceRadius = DENOISE_GRAY_RADIUS;
-  m_fSigmaGrey  = DENOISE_GRAY_SIGMA;
-  m_uiType		 = DENOISE_ALL_COMPONENT;
-  // These two members are not used by the denoise code in this file; give them a
-  // defined value so they are never read while indeterminate.
-  m_uiFilterWindow = 0;
-  m_pGreyWeightTable = NULL;
-  InitDenoiseFunc (m_pfDenoise, m_CPUFlag);
-}
-
-// Nothing to release: the constructor performs no allocation.
-CDenoiser::~CDenoiser() {
-}
-
-/*!
- * Fills the filter table: the C kernels are installed unconditionally, then replaced
- * by the SSE2 kernels when built with X86_ASM and iCpuFlag reports WELS_CPU_SSE2.
- */
-void CDenoiser::InitDenoiseFunc (SDenoiseFuncs& denoiser,  int32_t iCpuFlag) {
-  // portable defaults
-  denoiser.pfWaverageChromaFilter8 = WaverageChromaFilter8_c;
-  denoiser.pfBilateralLumaFilter8 = BilateralLumaFilter8_c;
-#if defined(X86_ASM)
-  const bool kbHasSse2 = (iCpuFlag & WELS_CPU_SSE2) != 0;
-  if (kbHasSse2) {
-    denoiser.pfWaverageChromaFilter8 = WaverageChromaFilter8_sse2;
-    denoiser.pfBilateralLumaFilter8 = BilateralLumaFilter8_sse2;
-  }
-#endif
-}
-
-/*!
- * Denoises the planes of pSrc in place, for every component selected in m_uiType.
- *
- * \param iType  not used by this implementation
- * \param pSrc   picture to filter; planes 0/1/2 are treated as Y/U/V
- * \param dst    not used by this implementation (the result is written back to pSrc)
- * \return RET_INVALIDPARAM when pSrc or any of its three plane pointers is NULL,
- *         RET_SUCCESS otherwise
- */
-EResult CDenoiser::Process (int32_t iType, SPixMap* pSrc, SPixMap* dst) {
-  // Reject a missing picture the same way missing planes are rejected below.
-  if (pSrc == NULL) {
-    return RET_INVALIDPARAM;
-  }
-
-  uint8_t* pSrcY = (uint8_t*)pSrc->pPixel[0];
-  uint8_t* pSrcU = (uint8_t*)pSrc->pPixel[1];
-  uint8_t* pSrcV = (uint8_t*)pSrc->pPixel[2];
-  if (pSrcY == NULL || pSrcU == NULL || pSrcV == NULL) {
-    return RET_INVALIDPARAM;
-  }
-
-  int32_t iWidthY = pSrc->sRect.iRectWidth;
-  int32_t iHeightY = pSrc->sRect.iRectHeight;
-  // chroma planes are processed at half the luma width and height
-  int32_t iWidthUV = iWidthY >> 1;
-  int32_t iHeightUV = iHeightY >> 1;
-
-  if (m_uiType & DENOISE_Y_COMPONENT)
-    BilateralDenoiseLuma (pSrcY, iWidthY, iHeightY, pSrc->iStride[0]);
-
-  if (m_uiType & DENOISE_U_COMPONENT)
-    WaverageDenoiseChroma (pSrcU, iWidthUV, iHeightUV, pSrc->iStride[1]);
-
-  if (m_uiType & DENOISE_V_COMPONENT)
-    WaverageDenoiseChroma (pSrcV, iWidthUV, iHeightUV, pSrc->iStride[2]);
-
-  return RET_SUCCESS;
-}
-
-/*!
- * Filters the luma plane in place, skipping a border of m_uiSpaceRadius samples on
- * every side. Each row is covered in runs of 8 samples by pfBilateralLumaFilter8;
- * the columns left over at the end of the row are filtered one by one with
- * Gauss3x3Filter.
- *
- * The tail loop used to restart at w + TAIL_OF_LINE8, which is never below
- * iWidth - radius once the 8-wide loop has finished, so the tail was never executed
- * and up to 7 columns per row were left unfiltered. It now continues from the first
- * column the 8-wide loop did not cover.
- */
-void CDenoiser::BilateralDenoiseLuma (uint8_t* pSrcY, int32_t iWidth, int32_t iHeight, int32_t iStride) {
-  const int32_t kiRadius = m_uiSpaceRadius;
-  const int32_t kiRowEnd = iHeight - kiRadius;
-  const int32_t kiColEnd = iWidth - kiRadius;
-  int32_t w;
-
-  pSrcY = pSrcY + kiRadius * iStride;
-  for (int32_t h = kiRadius; h < kiRowEnd; h++) {
-    // a run of 8 starting at w touches columns w .. w+7, hence the TAIL_OF_LINE8 margin
-    for (w = kiRadius; w < kiColEnd - TAIL_OF_LINE8; w += 8) {
-      m_pfDenoise.pfBilateralLumaFilter8 (pSrcY + w, iStride);
-    }
-    for (; w < kiColEnd; w++) {
-      Gauss3x3Filter (pSrcY + w, iStride);
-    }
-    pSrcY += iStride;
-  }
-}
-
-/*!
- * Filters one chroma plane in place, skipping a border of UV_WINDOWS_RADIUS samples
- * on every side. Each row is covered in runs of 8 samples by pfWaverageChromaFilter8;
- * the columns left over at the end of the row are filtered one by one with
- * Gauss3x3Filter.
- *
- * The tail loop used to restart at w + TAIL_OF_LINE8, which is never below
- * iWidth - UV_WINDOWS_RADIUS once the 8-wide loop has finished, so the tail was never
- * executed. It now continues from the first column the 8-wide loop did not cover.
- */
-void CDenoiser::WaverageDenoiseChroma (uint8_t* pSrcUV, int32_t iWidth, int32_t iHeight, int32_t iStride) {
-  const int32_t kiRowEnd = iHeight - UV_WINDOWS_RADIUS;
-  const int32_t kiColEnd = iWidth - UV_WINDOWS_RADIUS;
-  int32_t w;
-
-  pSrcUV = pSrcUV + UV_WINDOWS_RADIUS * iStride;
-  for (int32_t h = UV_WINDOWS_RADIUS; h < kiRowEnd; h++) {
-    // a run of 8 starting at w touches columns w .. w+7, hence the TAIL_OF_LINE8 margin
-    for (w = UV_WINDOWS_RADIUS; w < kiColEnd - TAIL_OF_LINE8; w += 8) {
-      m_pfDenoise.pfWaverageChromaFilter8 (pSrcUV + w, iStride);
-    }
-
-    for (; w < kiColEnd; w++) {
-      Gauss3x3Filter (pSrcUV + w, iStride);
-    }
-    pSrcUV += iStride;
-  }
-}
-
-
-WELSVP_NAMESPACE_END
--- a/processing/src/denoise/denoise.h
+++ /dev/null
@@ -1,111 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	    :  denoise.h
- *
- * \brief	    :  denoise class of wels video processor class
- *
- * \date        :  2011/03/15
- *
- * \description :  1. rewrite the package code of denoise class
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_DENOISE_H
-#define WELSVP_DENOISE_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-
-#define DENOISE_GRAY_RADIUS (1)
-#define DENOISE_GRAY_SIGMA  (2)
-
-#define UV_WINDOWS_RADIUS   (2)
-#define TAIL_OF_LINE8		(7)
-
-#define DENOISE_Y_COMPONENT (1)
-#define DENOISE_U_COMPONENT (2)
-#define DENOISE_V_COMPONENT (4)
-#define DENOISE_ALL_COMPONENT (7)
-
-
-WELSVP_NAMESPACE_BEGIN
-
-// Single-sample 3x3 filter applied in place at *pixels; stride is the distance between rows.
-void Gauss3x3Filter (uint8_t* pixels, int32_t stride);
-
-// Common signature of the 8-sample denoise kernels: filter in place starting at pixels.
-typedef void (DenoiseFilterFunc) (uint8_t* pixels, int32_t stride);
-
-// Pointer form used for the entries of SDenoiseFuncs.
-typedef DenoiseFilterFunc* DenoiseFilterFuncPtr;
-
-// Portable C kernels.
-DenoiseFilterFunc     BilateralLumaFilter8_c;
-DenoiseFilterFunc     WaverageChromaFilter8_c;
-
-// SSE2 kernels, declared with C linkage and only when building with X86_ASM.
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-DenoiseFilterFunc     BilateralLumaFilter8_sse2 ;
-DenoiseFilterFunc     WaverageChromaFilter8_sse2 ;
-WELSVP_EXTERN_C_END
-#endif
-
-// Table of the kernels selected at construction time (see CDenoiser::InitDenoiseFunc).
-typedef  struct TagDenoiseFuncs {
-  DenoiseFilterFuncPtr	pfBilateralLumaFilter8;//on 8 samples
-  DenoiseFilterFuncPtr	pfWaverageChromaFilter8;//on 8 samples
-} SDenoiseFuncs;
-
-/*!
- * Denoise strategy of the video processor; derives from IStrategy.
- * Process() filters the Y, U and V planes of the source picture in place.
- */
-class CDenoiser : public IStrategy {
- public:
-  // iCpuFlag: CPU capability bit mask used to choose between C and SSE2 kernels.
-  CDenoiser (int32_t iCpuFlag);
-  ~CDenoiser();
-
-  // Filters pSrc in place; returns RET_INVALIDPARAM when a plane pointer is NULL.
-  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* dst);
-
- private:
-  // Fills pf with the kernels matching the cpu flags.
-  void InitDenoiseFunc (SDenoiseFuncs& pf, int32_t cpu);
-  // Per-plane drivers: walk the plane row by row and call the selected 8-sample kernel.
-  void BilateralDenoiseLuma (uint8_t* p_y_data, int32_t width, int32_t height, int32_t stride);
-  void WaverageDenoiseChroma (uint8_t* pSrcUV, int32_t width, int32_t height, int32_t stride);
-
- private:
-  float_t	 m_fSigmaGrey;			//sigma for grey scale similarity, suggestion 2.5-3
-  uint32_t  m_uiFilterWindow;				//filter window diameter
-  uint16_t	 m_uiSpaceRadius;			//filter windows radius: 1-3x3, 2-5x5,3-7x7. Larger size, slower speed
-  uint16_t	 m_uiType;					//do denoising on which component 1-Y, 2-U, 4-V; 7-YUV, 3-YU, 5-YV, 6-UV
-  uint32_t*  m_pGreyWeightTable;		//weight table for grey scale
-
-  SDenoiseFuncs m_pfDenoise;   // kernels chosen by InitDenoiseFunc
-  int32_t      m_CPUFlag;      // copy of the constructor's iCpuFlag
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/denoise/denoise_filter.cpp
+++ /dev/null
@@ -1,127 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2010-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	denoise_filter.cpp
- *
- * \brief	svc denoising
- *
- * \date	4/1/2010 Created
- *
- */
-
-#include "denoise.h"
-#include "../common/typedef.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-/*!
- * Filters 8 consecutive samples starting at pSample, in place.
- * For every sample the 8 neighbours of its 3x3 window get the weight
- * ((32 - |neighbour - centre|)^2) >> 5, neighbours differing by more than 32 are
- * ignored, and the centre takes whatever is left of a total weight of 256.
- * Results are buffered and written back only at the end, so every output is computed
- * from the unmodified samples of the current row.
- */
-void	BilateralLumaFilter8_c (uint8_t* pSample, int32_t iStride) {
-  uint8_t aFiltered[8];
-
-  for (int32_t iIdx = 0; iIdx < 8; iIdx++) {
-    const uint8_t* pCentre = pSample + iIdx;
-    const int32_t iCentreSample = *pCentre;
-    // top-left corner of the 3x3 window around the current sample
-    const uint8_t* pRow = pCentre - iStride - DENOISE_GRAY_RADIUS;
-    int32_t iWeightedSum = 0;
-    int32_t iNeighbourWeight = 0;
-
-    for (int32_t y = 0; y < 3; y++, pRow += iStride) {
-      for (int32_t x = 0; x < 3; x++) {
-        if (x == 1 && y == 1)
-          continue;			// the centre is weighted after the loop
-        const int32_t iCurSample = pRow[x];
-        const int32_t iGreyDiff = 32 - WELS_ABS (iCurSample - iCentreSample);
-        if (iGreyDiff < 0)
-          continue;			// too different from the centre: no contribution
-        const int32_t iCurWeight = (iGreyDiff * iGreyDiff) >> 5;
-        iWeightedSum += iCurSample * iCurWeight;
-        iNeighbourWeight += iCurWeight;
-      }
-    }
-
-    iWeightedSum += iCentreSample * (256 - iNeighbourWeight);
-    aFiltered[iIdx] = iWeightedSum >> 8;
-  }
-  WelsMemcpy (pSample, aFiltered, 8);
-}
-
-
-/***************************************************************************
-5x5 filter:
-1	1	2	1	1
-1	2	4	2	1
-2	4	20	4	2
-1	2	4	2	1
-1	1	2	1	1
-***************************************************************************/
-#define SUM_LINE1(pSample)	(pSample[0] +(pSample[1]) +(pSample[2]<<1)  + pSample[3] + pSample[4])
-#define SUM_LINE2(pSample)	(pSample[0] +(pSample[1]<<1) +(pSample[2]<<2)  +(pSample[3]<<1) +pSample[4])
-#define SUM_LINE3(pSample)	((pSample[0]<<1) +(pSample[1]<<2) +(pSample[2]*20)  +(pSample[3]<<2) +(pSample[4]<<1))
-/*!
- * Applies the 5x5 kernel above (weights sum to 64) to 8 consecutive samples starting
- * at pSample. Results are buffered and written back only at the end, so every output
- * is computed from the unmodified samples of the current row.
- */
-void	WaverageChromaFilter8_c (uint8_t* pSample, int32_t iStride) {
-  // top-left corner of the 5x5 window of the first sample; slides right by one per output
-  uint8_t* pWindow = pSample - UV_WINDOWS_RADIUS * iStride - UV_WINDOWS_RADIUS;
-  uint8_t aFiltered[8];
-
-  for (int32_t iIdx = 0; iIdx < 8; iIdx++, pWindow++) {
-    uint8_t* pRow0 = pWindow;
-    uint8_t* pRow1 = pRow0 + iStride;
-    uint8_t* pRow2 = pRow1 + iStride;
-    uint8_t* pRow3 = pRow2 + iStride;
-    uint8_t* pRow4 = pRow3 + iStride;
-
-    const int32_t iSum = SUM_LINE1 (pRow0) + SUM_LINE2 (pRow1) + SUM_LINE3 (pRow2)
-                         + SUM_LINE2 (pRow3) + SUM_LINE1 (pRow4);
-    aFiltered[iIdx] = (iSum >> 6);
-  }
-  WelsMemcpy (pSample, aFiltered, 8);
-}
-
-/***************************************************************************
-edge of y/uv use a 3x3 Gauss filter, radius = 1:
-1	2	1
-2	4	2
-1	2	1
-***************************************************************************/
-// Replaces *pSrc with the weighted sum of its 3x3 neighbourhood divided by 16.
-void	Gauss3x3Filter (uint8_t* pSrc, int32_t iStride) {
-  const uint8_t* pAbove = pSrc - iStride;
-  const uint8_t* pBelow = pSrc + iStride;
-
-  // weight 1: the four corners; weight 2: the four direct neighbours; weight 4: the centre
-  const int32_t iCorners = pAbove[-1] + pAbove[1] + pBelow[-1] + pBelow[1];
-  const int32_t iEdges   = pAbove[0] + pSrc[-1] + pSrc[1] + pBelow[0];
-  const int32_t iCentre  = pSrc[0];
-
-  *pSrc = (iCorners + (iEdges << 1) + (iCentre << 2)) >> 4;
-}
-
-WELSVP_NAMESPACE_END
--- a/processing/src/downsample/downsample.cpp
+++ /dev/null
@@ -1,135 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "downsample.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/*!
- * Creates a downsampler: records the CPU flags, zeroes the function table and lets
- * InitDownsampleFuncs fill it for those flags.
- */
-CDownsampling::CDownsampling (int32_t iCpuFlag) {
-  m_eMethod   = METHOD_DOWNSAMPLE;
-  m_iCPUFlag = iCpuFlag;
-
-  // start from a zeroed table, then install the kernels
-  WelsMemset (&m_pfDownsample, 0, sizeof (m_pfDownsample));
-  InitDownsampleFuncs (m_pfDownsample, iCpuFlag);
-}
-
-// Nothing to release: the constructor performs no allocation.
-CDownsampling::~CDownsampling() {
-}
-
-/*!
- * Fills the downsample function table. The C routines are installed first; when built
- * with X86_ASM they are then overridden in the order SSE, SSE2, SSSE3, SSE4.1, so the
- * last matching instruction set wins. Slot 3 of pfHalfAverage always keeps the C routine.
- */
-void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc,  int32_t iCpuFlag) {
-  for (int32_t iSlot = 0; iSlot < 4; iSlot++) {
-    sDownsampleFunc.pfHalfAverage[iSlot] = DyadicBilinearDownsampler_c;
-  }
-  sDownsampleFunc.pfGeneralRatioLuma	 = GeneralBilinearFastDownsampler_c;
-  sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsampler_c;
-
-#if defined(X86_ASM)
-  const bool kbSse   = (iCpuFlag & WELS_CPU_SSE) != 0;
-  const bool kbSse2  = (iCpuFlag & WELS_CPU_SSE2) != 0;
-  const bool kbSsse3 = (iCpuFlag & WELS_CPU_SSSE3) != 0;
-  const bool kbSse41 = (iCpuFlag & WELS_CPU_SSE41) != 0;
-
-  if (kbSse) {
-    // x32 / x16 / x8 aligned widths
-    sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_sse;
-    sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_sse;
-    sDownsampleFunc.pfHalfAverage[2]	= DyadicBilinearDownsamplerWidthx8_sse;
-  }
-  if (kbSse2) {
-    sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearFastDownsamplerWrap_sse2;
-    sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2;
-  }
-  if (kbSsse3) {
-    sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_ssse3;
-    sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_ssse3;
-  }
-  if (kbSse41) {
-    sDownsampleFunc.pfHalfAverage[0]	= DyadicBilinearDownsamplerWidthx32_sse4;
-    sDownsampleFunc.pfHalfAverage[1]	= DyadicBilinearDownsamplerWidthx16_sse4;
-  }
-#endif//X86_ASM
-
-}
-
-/*!
- * Downsamples the three planes of pSrcPixMap into pDstPixMap.
- * When the destination is exactly half the source in both dimensions the half-average
- * routine matching the source width alignment is used; otherwise the general-ratio
- * routines are used (separate ones for luma and chroma).
- *
- * \param iType       not used by this implementation
- * \param pSrcPixMap  source picture; planes 0/1/2 are treated as Y/U/V
- * \param pDstPixMap  destination picture, smaller than the source in both dimensions
- * \return RET_INVALIDPARAM when either picture is NULL or the destination is not
- *         strictly smaller than the source in width and height, RET_SUCCESS otherwise
- */
-EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDstPixMap) {
-  // Both pictures are dereferenced below; reject missing ones instead of crashing.
-  if (!pSrcPixMap || !pDstPixMap) {
-    return RET_INVALIDPARAM;
-  }
-
-  int32_t iSrcWidthY = pSrcPixMap->sRect.iRectWidth;
-  int32_t iSrcHeightY = pSrcPixMap->sRect.iRectHeight;
-  int32_t iDstWidthY = pDstPixMap->sRect.iRectWidth;
-  int32_t iDstHeightY = pDstPixMap->sRect.iRectHeight;
-
-  // chroma planes are processed at half the luma width and height
-  int32_t iSrcWidthUV = iSrcWidthY >> 1;
-  int32_t iSrcHeightUV = iSrcHeightY >> 1;
-  int32_t iDstWidthUV = iDstWidthY >> 1;
-  int32_t iDstHeightUV = iDstHeightY >> 1;
-
-  if (iSrcWidthY <= iDstWidthY || iSrcHeightY <= iDstHeightY) {
-    return RET_INVALIDPARAM;
-  }
-
-  if ((iSrcWidthY >> 1) == iDstWidthY && (iSrcHeightY >> 1) == iDstHeightY) {
-    // use half average functions; the slot is chosen per plane width
-    // (int32_t to match the return type of GetAlignedIndex)
-    int32_t iAlignIndex = GetAlignedIndex (iSrcWidthY);
-    m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
-        (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
-
-    iAlignIndex = GetAlignedIndex (iSrcWidthUV);
-    m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
-        (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
-    m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
-        (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
-  } else {
-    m_pfDownsample.pfGeneralRatioLuma ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0], iDstWidthY, iDstHeightY,
-                                       (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
-
-    m_pfDownsample.pfGeneralRatioChroma ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1], iDstWidthUV, iDstHeightUV,
-                                         (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
-
-    m_pfDownsample.pfGeneralRatioChroma ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2], iDstWidthUV, iDstHeightUV,
-                                         (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
-  }
-  return RET_SUCCESS;
-}
-
-/*!
- * Maps a source width to its slot in SDownsampleFuncs::pfHalfAverage:
- * 0 for a multiple of 32, 1 for a multiple of 16, 2 for a multiple of 8, 3 otherwise.
- */
-int32_t CDownsampling::GetAlignedIndex (const int32_t kiSrcWidth) {
-  if ((kiSrcWidth & 0x1f) == 0)	// x32
-    return 0;
-  if ((kiSrcWidth & 0x0f) == 0)	// x16
-    return 1;
-  if ((kiSrcWidth & 0x07) == 0)	// x8
-    return 2;
-  return 3;	// no usable alignment
-}
-
-
-WELSVP_NAMESPACE_END
--- a/processing/src/downsample/downsample.h
+++ /dev/null
@@ -1,128 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	    :  downsample.h
- *
- * \brief	    :  downsample class of wels video processor class
- *
- * \date        :  2011/03/33
- *
- * \description :  1. rewrite the package code of downsample class
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_DOWNSAMPLE_H
-#define WELSVP_DOWNSAMPLE_H
-
-#include "../common/util.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-
-// Signature of the 2:1 ("half average") downsamplers: the destination size is implied
-// by the source size, so only the source dimensions are passed.
-typedef void (HalveDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride,
-                                    uint8_t* pSrc, const int32_t kiSrcStride,
-                                    const int32_t kiSrcWidth, const int32_t kiSrcHeight);
-
-// Signature of the general-ratio downsamplers: both destination and source sizes are passed.
-typedef void (GeneralDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
-                                      const int32_t kiDstHeight,
-                                      uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight);
-
-// Pointer forms used for the entries of SDownsampleFuncs.
-typedef HalveDownsampleFunc*		PHalveDownsampleFunc;
-typedef GeneralDownsampleFunc*	PGeneralDownsampleFunc;
-
-// Portable C implementations.
-HalveDownsampleFunc   DyadicBilinearDownsampler_c;
-GeneralDownsampleFunc GeneralBilinearFastDownsampler_c;
-GeneralDownsampleFunc GeneralBilinearAccurateDownsampler_c;
-
-// Table of the routines selected at construction time (see CDownsampling::InitDownsampleFuncs).
-typedef struct {
-  // align_index: 0 = x32; 1 = x16; 2 = x8; 3 = common case left;
-  PHalveDownsampleFunc			pfHalfAverage[4];
-  PGeneralDownsampleFunc		pfGeneralRatioLuma;
-  PGeneralDownsampleFunc		pfGeneralRatioChroma;
-} SDownsampleFuncs;
-
-
-// x86 implementations, declared with C linkage and only when building with X86_ASM.
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-// used when the source width is a multiple of 8 pixels
-HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx8_sse;
-// iSrcWidth= x16 pixels
-HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx16_sse;
-// iSrcWidth= x32 pixels
-HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx32_sse;
-// used when the source width is a multiple of 16 pixels
-HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx16_ssse3;
-// iSrcWidth= x32 pixels
-HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx32_ssse3;
-// iSrcWidth= x16 pixels
-HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx16_sse4;
-// iSrcWidth= x32 pixels
-HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx32_sse4;
-
-// Wrappers with the GeneralDownsampleFunc signature; these are the entries installed
-// in SDownsampleFuncs for SSE2.
-GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
-GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
-
-// SSE2 routines taking two extra scale arguments, so they do not match
-// GeneralDownsampleFunc. NOTE(review): presumably called by the *Wrap_sse2 functions
-// above with precomputed scale factors -- confirm in the implementation.
-void GeneralBilinearFastDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
-    const int32_t kiDstHeight,
-    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
-    const uint32_t kuiScaleX, const uint32_t kuiScaleY);
-void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
-    const int32_t kiDstHeight,
-    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
-    const uint32_t kuiScaleX, const uint32_t kuiScaleY);
-WELSVP_EXTERN_C_END
-#endif
-
-
-
-
-/*!
- * Downsample strategy of the video processor; derives from IStrategy.
- * Process() scales the three planes of the source picture into the destination picture.
- */
-class CDownsampling : public IStrategy {
- public:
-  // iCpuFlag: CPU capability bit mask used to choose between C and x86 routines.
-  CDownsampling (int32_t iCpuFlag);
-  ~CDownsampling();
-
-  // Returns RET_INVALIDPARAM unless pDst is strictly smaller than pSrc in width and height.
-  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst);
-
- private:
-  // Fills sDownsampleFunc with the routines matching iCpuFlag.
-  void InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int32_t iCpuFlag);
-
-  // Maps a source width to its pfHalfAverage slot (0 = x32, 1 = x16, 2 = x8, 3 = other).
-  int32_t GetAlignedIndex (const int32_t kiSrcWidth);
-
- private:
-  SDownsampleFuncs m_pfDownsample;  // routines chosen by InitDownsampleFuncs
-  int32_t  m_iCPUFlag;              // copy of the constructor's iCpuFlag
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/downsample/downsamplefuncs.cpp
+++ /dev/null
@@ -1,234 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2008-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *  downsample_yuv.c
- *
- *  Abstract
- *      Implementation for source yuv data downsampling used before spatial encoding.
- *
- *  History
- *      10/24/2008 Created
- *
- *****************************************************************************/
-
-#include "../common/typedef.h"
-#include "../common/util.h"
-#include "downsample.h"
-
-
-WELSVP_NAMESPACE_BEGIN
-
-
-void DyadicBilinearDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
-                                  uint8_t* pSrc, const int32_t kiSrcStride,
-                                  const int32_t kiSrcWidth, const int32_t kiSrcHeight)
-
-{
-  uint8_t* pDstLine	= pDst;
-  uint8_t* pSrcLine	= pSrc;
-  const int32_t kiSrcStridex2	= kiSrcStride << 1;
-  const int32_t kiDstWidth		= kiSrcWidth >> 1;
-  const int32_t kiDstHeight	= kiSrcHeight >> 1;
-
-  for (int32_t j = 0; j < kiDstHeight; j ++) {
-    for (int32_t i = 0; i < kiDstWidth; i ++) {
-      const int32_t kiSrcX = i << 1;
-      const int32_t kiTempRow1 = (pSrcLine[kiSrcX] + pSrcLine[kiSrcX + 1] + 1) >> 1;
-      const int32_t kiTempRow2 = (pSrcLine[kiSrcX + kiSrcStride] + pSrcLine[kiSrcX + kiSrcStride + 1] + 1) >> 1;
-
-      pDstLine[i] = (uint8_t) ((kiTempRow1 + kiTempRow2 + 1) >> 1);
-    }
-    pDstLine	+= kiDstStride;
-    pSrcLine	+= kiSrcStridex2;
-  }
-}
-
-void GeneralBilinearFastDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
-                                       const int32_t kiDstHeight,
-                                       uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
-  const uint32_t kuiScaleBitWidth = 16, kuiScaleBitHeight = 15;
-  const uint32_t kuiScaleWidth = (1 << kuiScaleBitWidth), kuiScaleHeight = (1 << kuiScaleBitHeight);
-  int32_t fScalex = (int32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiScaleWidth);
-  int32_t fScaley = (int32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiScaleHeight);
-  uint32_t x;
-  int32_t iYInverse, iXInverse;
-
-  uint8_t* pByDst = pDst;
-  uint8_t* pByLineDst = pDst;
-
-  iYInverse = 1 << (kuiScaleBitHeight - 1);
-  for (int32_t i = 0; i < kiDstHeight - 1; i++) {
-    int32_t iYy = iYInverse >> kuiScaleBitHeight;
-    int32_t fv = iYInverse & (kuiScaleHeight - 1);
-
-    uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
-
-    pByDst = pByLineDst;
-    iXInverse = 1 << (kuiScaleBitWidth - 1);
-    for (int32_t j = 0; j < kiDstWidth - 1; j++) {
-      int32_t iXx = iXInverse >> kuiScaleBitWidth;
-      int32_t iFu = iXInverse & (kuiScaleWidth - 1);
-
-      uint8_t* pByCurrent = pBySrc + iXx;
-      uint8_t a, b, c, d;
-
-      a = *pByCurrent;
-      b = * (pByCurrent + 1);
-      c = * (pByCurrent + kiSrcStride);
-      d = * (pByCurrent + kiSrcStride + 1);
-
-      x  = (((uint32_t) (kuiScaleWidth - 1 - iFu)) * (kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * a;
-      x += (((uint32_t) (iFu)) * (kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * b;
-      x += (((uint32_t) (kuiScaleWidth - 1 - iFu)) * (fv) >> kuiScaleBitWidth) * c;
-      x += (((uint32_t) (iFu)) * (fv) >> kuiScaleBitWidth) * d;
-      x >>= (kuiScaleBitHeight - 1);
-      x += 1;
-      x >>= 1;
-      //x = (((__int64)(SCALE_BIG - 1 - iFu))*(SCALE_BIG - 1 - fv)*a + ((__int64)iFu)*(SCALE_BIG - 1 -fv)*b + ((__int64)(SCALE_BIG - 1 -iFu))*fv*c +
-      //		 ((__int64)iFu)*fv*d + (1 << (2*SCALE_BIT_BIG-1)) ) >> (2*SCALE_BIT_BIG);
-      x = WELS_CLAMP (x, 0, 255);
-      *pByDst++ = (uint8_t)x;
-
-      iXInverse += fScalex;
-    }
-    *pByDst = * (pBySrc + (iXInverse >> kuiScaleBitWidth));
-    pByLineDst += kiDstStride;
-    iYInverse += fScaley;
-  }
-
-  // last row special
-  {
-    int32_t iYy = iYInverse >> kuiScaleBitHeight;
-    uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
-
-    pByDst = pByLineDst;
-    iXInverse = 1 << (kuiScaleBitWidth - 1);
-    for (int32_t j = 0; j < kiDstWidth; j++) {
-      int32_t iXx = iXInverse >> kuiScaleBitWidth;
-      *pByDst++ = * (pBySrc + iXx);
-
-      iXInverse += fScalex;
-    }
-  }
-}
-
-void GeneralBilinearAccurateDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
-    const int32_t kiDstHeight,
-    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
-  const int32_t kiScaleBit = 15;
-  const int32_t kiScale = (1 << kiScaleBit);
-  int32_t iScalex = (int32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kiScale);
-  int32_t iScaley = (int32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kiScale);
-  int64_t x;
-  int32_t iYInverse, iXInverse;
-
-  uint8_t* pByDst = pDst;
-  uint8_t* pByLineDst = pDst;
-
-  iYInverse = 1 << (kiScaleBit - 1);
-  for (int32_t i = 0; i < kiDstHeight - 1; i++) {
-    int32_t iYy = iYInverse >> kiScaleBit;
-    int32_t iFv = iYInverse & (kiScale - 1);
-
-    uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
-
-    pByDst = pByLineDst;
-    iXInverse = 1 << (kiScaleBit - 1);
-    for (int32_t j = 0; j < kiDstWidth - 1; j++) {
-      int32_t iXx = iXInverse >> kiScaleBit;
-      int32_t iFu = iXInverse & (kiScale - 1);
-
-      uint8_t* pByCurrent = pBySrc + iXx;
-      uint8_t a, b, c, d;
-
-      a = *pByCurrent;
-      b = * (pByCurrent + 1);
-      c = * (pByCurrent + kiSrcStride);
-      d = * (pByCurrent + kiSrcStride + 1);
-
-      x = (((int64_t) (kiScale - 1 - iFu)) * (kiScale - 1 - iFv) * a + ((int64_t)iFu) * (kiScale - 1 - iFv) * b + ((int64_t) (
-             kiScale - 1 - iFu)) * iFv * c +
-           ((int64_t)iFu) * iFv * d + (int64_t) (1 << (2 * kiScaleBit - 1))) >> (2 * kiScaleBit);
-      x = WELS_CLAMP (x, 0, 255);
-      *pByDst++ = (uint8_t)x;
-
-      iXInverse += iScalex;
-    }
-    *pByDst = * (pBySrc + (iXInverse >> kiScaleBit));
-    pByLineDst += kiDstStride;
-    iYInverse += iScaley;
-  }
-
-  // last row special
-  {
-    int32_t iYy = iYInverse >> kiScaleBit;
-    uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
-
-    pByDst = pByLineDst;
-    iXInverse = 1 << (kiScaleBit - 1);
-    for (int32_t j = 0; j < kiDstWidth; j++) {
-      int32_t iXx = iXInverse >> kiScaleBit;
-      *pByDst++ = * (pBySrc + iXx);
-
-      iXInverse += iScalex;
-    }
-  }
-}
-
-
-#ifdef X86_ASM
-void GeneralBilinearFastDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
-    const int32_t kiDstHeight,
-    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
-  const int32_t kiScaleBitWidth = 16, kiScaleBitHeight = 15;
-  const uint32_t kuiScaleWidth = (1 << kiScaleBitWidth), kuiScaleHeight = (1 << kiScaleBitHeight);
-
-  uint32_t uiScalex = (uint32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiScaleWidth);
-  uint32_t uiScaley = (uint32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiScaleHeight);
-
-  GeneralBilinearFastDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
-                                       pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);
-}
-
-void GeneralBilinearAccurateDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
-    const int32_t kiDstHeight,
-    uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
-  const int32_t kiScaleBit = 15;
-  const uint32_t kuiScale = (1 << kiScaleBit);
-
-  uint32_t uiScalex = (uint32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiScale);
-  uint32_t uiScaley = (uint32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiScale);
-
-  GeneralBilinearAccurateDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
-      pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);
-}
-#endif //X86_ASM
-
-WELSVP_NAMESPACE_END
--- a/processing/src/imagerotate/imagerotate.cpp
+++ /dev/null
@@ -1,93 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "imagerotate.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CImageRotating::CImageRotating (int32_t iCpuFlag) {
-  m_iCPUFlag = iCpuFlag;
-  m_eMethod   = METHOD_IMAGE_ROTATE;
-  WelsMemset (&m_pfRotateImage, 0, sizeof (m_pfRotateImage));
-  InitImageRotateFuncs (m_pfRotateImage, m_iCPUFlag);
-}
-
-CImageRotating::~CImageRotating() {
-}
-
-void CImageRotating::InitImageRotateFuncs (SImageRotateFuncs& sImageRotateFuncs, int32_t iCpuFlag) {
-  sImageRotateFuncs.pfImageRotate90D = ImageRotate90D_c;
-  sImageRotateFuncs.pfImageRotate180D = ImageRotate180D_c;
-  sImageRotateFuncs.pfImageRotate270D = ImageRotate270D_c;
-}
-EResult CImageRotating::ProcessImageRotate (int32_t iType, uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth,
-    uint32_t iHeight, uint8_t* pDst) {
-  if (iType == 90) {
-    m_pfRotateImage.pfImageRotate90D (pSrc, uiBytesPerPixel, iWidth, iHeight, pDst);
-  } else if (iType == 180) {
-    m_pfRotateImage.pfImageRotate180D (pSrc, uiBytesPerPixel, iWidth, iHeight, pDst);
-  } else if (iType == 270) {
-    m_pfRotateImage.pfImageRotate270D (pSrc, uiBytesPerPixel, iWidth, iHeight, pDst);
-  } else {
-    return RET_NOTSUPPORTED;
-  }
-  return RET_SUCCESS;
-}
-
-EResult CImageRotating::Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst) {
-  EResult eReturn = RET_INVALIDPARAM;
-
-  if ((pSrc->eFormat == VIDEO_FORMAT_RGBA) ||
-      (pSrc->eFormat == VIDEO_FORMAT_BGRA) ||
-      (pSrc->eFormat == VIDEO_FORMAT_ABGR) ||
-      (pSrc->eFormat == VIDEO_FORMAT_ARGB)) {
-    eReturn = ProcessImageRotate (iType, (uint8_t*)pSrc->pPixel[0], pSrc->iSizeInBits * 8, pSrc->sRect.iRectWidth,
-                                  pSrc->sRect.iRectHeight, (uint8_t*)pDst->pPixel[0]);
-  } else if (pSrc->eFormat == VIDEO_FORMAT_I420) {
-    ProcessImageRotate (iType, (uint8_t*)pSrc->pPixel[0], pSrc->iSizeInBits * 8, pSrc->sRect.iRectWidth,
-                        pSrc->sRect.iRectHeight, (uint8_t*)pDst->pPixel[0]);
-    ProcessImageRotate (iType, (uint8_t*)pSrc->pPixel[1], pSrc->iSizeInBits * 8, (pSrc->sRect.iRectWidth >> 1),
-                        (pSrc->sRect.iRectHeight >> 1), (uint8_t*)pDst->pPixel[1]);
-    eReturn = ProcessImageRotate (iType, (uint8_t*)pSrc->pPixel[2], pSrc->iSizeInBits * 8, (pSrc->sRect.iRectWidth >> 1),
-                                  (pSrc->sRect.iRectHeight >> 1), (uint8_t*)pDst->pPixel[2]);
-  } else {
-    eReturn = RET_NOTSUPPORTED;
-  }
-
-  return eReturn;
-}
-
-
-WELSVP_NAMESPACE_END
--- a/processing/src/imagerotate/imagerotate.h
+++ /dev/null
@@ -1,85 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	    :  downsample.h
- *
- * \brief	    :  image rotate class of wels video processor class
- *
- * \date        :  2011/04/06
- *
- * \description :
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_IMAGEROTATE_H
-#define WELSVP_IMAGEROTATE_H
-
-#include "../common/util.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-typedef void (ImageRotateFunc) (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight,
-                                uint8_t* pDst);
-
-typedef ImageRotateFunc*		ImageRotateFuncPtr;
-
-ImageRotateFunc   ImageRotate90D_c;
-ImageRotateFunc   ImageRotate180D_c;
-ImageRotateFunc   ImageRotate270D_c;
-
-typedef struct {
-  ImageRotateFuncPtr		pfImageRotate90D;
-  ImageRotateFuncPtr		pfImageRotate180D;
-  ImageRotateFuncPtr		pfImageRotate270D;
-} SImageRotateFuncs;
-
-class CImageRotating : public IStrategy {
- public:
-  CImageRotating (int32_t iCpuFlag);
-  ~CImageRotating();
-
-  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst);
-
- private:
-  void InitImageRotateFuncs (SImageRotateFuncs& pf, int32_t iCpuFlag);
-  EResult ProcessImageRotate (int32_t iType, uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight,
-                              uint8_t* pDst);
-
- private:
-  SImageRotateFuncs m_pfRotateImage;
-  int32_t          m_iCPUFlag;
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/imagerotate/imagerotatefuncs.cpp
+++ /dev/null
@@ -1,66 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *  image_rotate.c
- *
- *  Created on 11-2-21.
- *
- */
-
-#include "imagerotate.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-void ImageRotate90D_c (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t* pDst) {
-  for (uint32_t j = 0; j < iHeight; j++) {
-    for (uint32_t i = 0; i < iWidth; i++) {
-      for (uint32_t n = 0; n < uiBytesPerPixel; n++)
-        pDst[ (i * iHeight + iHeight - 1 - j)*uiBytesPerPixel + n] = pSrc[ (iWidth * j + i) * uiBytesPerPixel + n];
-    }
-  }
-}
-void ImageRotate180D_c (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t* pDst) {
-  for (uint32_t j = 0; j < iHeight; j++) {
-    for (uint32_t i = 0; i < iWidth; i++) {
-      for (uint32_t n = 0; n < uiBytesPerPixel; n++)
-        pDst[ ((iHeight - 1 - j)*iWidth + iWidth - 1 - i)*uiBytesPerPixel + n] = pSrc[ (iWidth * j + i) * uiBytesPerPixel + n];
-    }
-  }
-}
-void ImageRotate270D_c (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t* pDst) {
-  for (uint32_t j = 0; j < iWidth; j++) {
-    for (uint32_t i = 0; i < iHeight; i++) {
-      for (uint32_t n = 0; n < uiBytesPerPixel; n++)
-        pDst[ ((iWidth - 1 - j)*iHeight + i)*uiBytesPerPixel + n] = pSrc[ (iWidth * i + j) * uiBytesPerPixel + n];
-    }
-  }
-}
-WELSVP_NAMESPACE_END
--- a/processing/src/scenechangedetection/SceneChangeDetection.cpp
+++ /dev/null
@@ -1,136 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "SceneChangeDetection.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-#define HIGH_MOTION_BLOCK_THRESHOLD 320
-#define SCENE_CHANGE_MOTION_RATIO	0.85f
-
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CSceneChangeDetection::CSceneChangeDetection (int32_t iCpuFlag) {
-  m_iCpuFlag = iCpuFlag;
-  m_eMethod   = METHOD_SCENE_CHANGE_DETECTION;
-  m_pfSad   = NULL;
-  WelsMemset (&m_sSceneChangeParam, 0, sizeof (m_sSceneChangeParam));
-  InitSadFuncs (m_pfSad, m_iCpuFlag);
-}
-
-CSceneChangeDetection::~CSceneChangeDetection() {
-}
-
-EResult CSceneChangeDetection::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
-  EResult eReturn = RET_INVALIDPARAM;
-
-  int32_t iWidth                  = pSrcPixMap->sRect.iRectWidth;
-  int32_t iHeight                 = pSrcPixMap->sRect.iRectHeight;
-  int32_t iBlock8x8Width      = iWidth  >> 3;
-  int32_t iBlock8x8Height	 = iHeight >> 3;
-  int32_t iBlock8x8Num       = iBlock8x8Width * iBlock8x8Height;
-  int32_t iSceneChangeThreshold = WelsStaticCast (int32_t, SCENE_CHANGE_MOTION_RATIO * iBlock8x8Num + 0.5f + PESN);
-
-  int32_t iBlockSad = 0;
-  int32_t iMotionBlockNum = 0;
-
-  uint8_t* pRefY = NULL, *pCurY = NULL;
-  int32_t iRefStride = 0, iCurStride = 0;
-  int32_t iRefRowStride = 0, iCurRowStride = 0;
-
-  uint8_t* pRefTmp = NULL, *pCurTmp = NULL;
-
-  pRefY = (uint8_t*)pRefPixMap->pPixel[0];
-  pCurY = (uint8_t*)pSrcPixMap->pPixel[0];
-
-  iRefStride  = pRefPixMap->iStride[0];
-  iCurStride  = pSrcPixMap->iStride[0];
-
-  iRefRowStride  = pRefPixMap->iStride[0] << 3;
-  iCurRowStride  = pSrcPixMap->iStride[0] << 3;
-
-  m_sSceneChangeParam.bSceneChangeFlag = 0;
-
-  for (int32_t j = 0; j < iBlock8x8Height; j ++) {
-    pRefTmp	= pRefY;
-    pCurTmp 	= pCurY;
-
-    for (int32_t i = 0; i < iBlock8x8Width; i++) {
-      iBlockSad = m_pfSad (pRefTmp, iRefStride, pCurTmp, iCurStride);
-
-      iMotionBlockNum += (iBlockSad > HIGH_MOTION_BLOCK_THRESHOLD);
-
-      pRefTmp += 8;
-      pCurTmp += 8;
-    }
-
-    pRefY += iRefRowStride;
-    pCurY += iCurRowStride;
-  }
-
-  if (iMotionBlockNum >= iSceneChangeThreshold) {
-    m_sSceneChangeParam.bSceneChangeFlag = 1;
-  }
-
-  eReturn = RET_SUCCESS;
-
-  return eReturn;
-}
-
-
-EResult CSceneChangeDetection::Get (int32_t iType, void* pParam) {
-  if (pParam == NULL) {
-    return RET_INVALIDPARAM;
-  }
-
-  * (SSceneChangeResult*)pParam = m_sSceneChangeParam;
-
-  return RET_SUCCESS;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-
-void CSceneChangeDetection::InitSadFuncs (SadFuncPtr& pfSad,  int32_t iCpuFlag) {
-  pfSad = WelsSampleSad8x8_c;
-
-#ifdef X86_ASM
-  if (iCpuFlag & WELS_CPU_SSE2) {
-    pfSad = WelsSampleSad8x8_sse21;
-  }
-#endif
-}
-
-
-WELSVP_NAMESPACE_END
--- a/processing/src/scenechangedetection/SceneChangeDetection.h
+++ /dev/null
@@ -1,72 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
-* \file	        :  SceneChangeDetection.h
-*
-* \brief	    :  scene change detection class of wels video processor class
-*
-* \date         :  2011/03/14
-*
-* \description  :  1. rewrite the package code of scene change detection class
-*
-*************************************************************************************
-*/
-
-#ifndef WELSVP_SCENECHANGEDETECTION_H
-#define WELSVP_SCENECHANGEDETECTION_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-#include "SceneChangeDetectionCommon.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-class CSceneChangeDetection : public IStrategy {
- public:
-  CSceneChangeDetection (int32_t iCpuFlag);
-  ~CSceneChangeDetection();
-
-  EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef);
-  EResult Get (int32_t iType, void* pParam);
-
- private:
-  void InitSadFuncs (SadFuncPtr& pfSadFunc, int32_t iCpuFlag);
-
- private:
-  SadFuncPtr m_pfSad;
-  int32_t    m_iCpuFlag;
-  SSceneChangeResult m_sSceneChangeParam;
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/scenechangedetection/SceneChangeDetectionCommon.cpp
+++ /dev/null
@@ -1,60 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "SceneChangeDetectionCommon.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-
-int32_t WelsSampleSad8x8_c (uint8_t* pSrcY, int32_t iSrcStrideY, uint8_t* pRefY, int32_t iRefStrideY) {
-  int32_t iSadSum = 0;
-  uint8_t* pSrcA = pSrcY;
-  uint8_t* pSrcB = pRefY;
-  for (int32_t i = 0; i < 8; i++) {
-    iSadSum += WELS_ABS ((pSrcA[0] - pSrcB[0]));
-    iSadSum += WELS_ABS ((pSrcA[1] - pSrcB[1]));
-    iSadSum += WELS_ABS ((pSrcA[2] - pSrcB[2]));
-    iSadSum += WELS_ABS ((pSrcA[3] - pSrcB[3]));
-    iSadSum += WELS_ABS ((pSrcA[4] - pSrcB[4]));
-    iSadSum += WELS_ABS ((pSrcA[5] - pSrcB[5]));
-    iSadSum += WELS_ABS ((pSrcA[6] - pSrcB[6]));
-    iSadSum += WELS_ABS ((pSrcA[7] - pSrcB[7]));
-
-    pSrcA += iSrcStrideY;
-    pSrcB += iRefStrideY;
-  }
-
-  return iSadSum;
-}
-
-WELSVP_NAMESPACE_END
--- a/processing/src/scenechangedetection/SceneChangeDetectionCommon.h
+++ /dev/null
@@ -1,65 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	        :  SceneChangeDetectionCommon.h
- *
- * \brief	    :  scene change detection class of wels video processor class
- *
- * \date         :  2011/03/14
- *
- * \description  :  1. rewrite the package code of scene change detection class
- *
- */
-
-#ifndef WELSVP_SCENECHANGEDETECTIONCOMMON_H
-#define WELSVP_SCENECHANGEDETECTIONCOMMON_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-typedef  int32_t (SadFunc) (uint8_t* pSrcY, int32_t iSrcStrideY, uint8_t* pRefY, int32_t iRefStrideY);
-
-typedef SadFunc*   SadFuncPtr;
-
-SadFunc      WelsSampleSad8x8_c;
-
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-SadFunc      WelsSampleSad8x8_sse21;
-WELSVP_EXTERN_C_END
-#endif
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/vaacalc/vaacalcfuncs.cpp
+++ /dev/null
@@ -1,595 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "../common/typedef.h"
-#include "../common/util.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-void VAACalcSadSsd_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
-                      int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* psqsum16x16, int32_t* psqdiff16x16) {
-  uint8_t* tmp_ref = pRefData;
-  uint8_t* tmp_cur = pCurData;
-  int32_t iMbWidth = (iPicWidth >> 4);
-  int32_t mb_heigth = (iPicHeight >> 4);
-  int32_t mb_index = 0;
-  int32_t pic_stride_x8 = iPicStride << 3;
-  int32_t step = (iPicStride << 4) - iPicWidth;
-
-  *pFrameSad = 0;
-  for (int32_t i = 0; i < mb_heigth; i ++) {
-    for (int32_t j = 0; j < iMbWidth; j ++) {
-      int32_t k, l;
-      int32_t l_sad, l_sqdiff, l_sum, l_sqsum;
-      uint8_t* tmp_cur_row;
-      uint8_t* tmp_ref_row;
-
-      pSum16x16[mb_index] = 0;
-      psqsum16x16[mb_index] = 0;
-      psqdiff16x16[mb_index] = 0;
-
-      l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
-      tmp_cur_row = tmp_cur;
-      tmp_ref_row = tmp_ref;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
-          l_sad += diff;
-          l_sqdiff += diff * diff;
-          l_sum += tmp_cur_row[l];
-          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 0] = l_sad;
-      pSum16x16[mb_index] += l_sum;
-      psqsum16x16[mb_index] += l_sqsum;
-      psqdiff16x16[mb_index] += l_sqdiff;
-
-      l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
-      tmp_cur_row = tmp_cur + 8;
-      tmp_ref_row = tmp_ref + 8;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
-          l_sad += diff;
-          l_sqdiff += diff * diff;
-          l_sum += tmp_cur_row[l];
-          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 1] = l_sad;
-      pSum16x16[mb_index] += l_sum;
-      psqsum16x16[mb_index] += l_sqsum;
-      psqdiff16x16[mb_index] += l_sqdiff;
-
-      l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
-      tmp_cur_row = tmp_cur + pic_stride_x8;
-      tmp_ref_row = tmp_ref + pic_stride_x8;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
-          l_sad += diff;
-          l_sqdiff += diff * diff;
-          l_sum += tmp_cur_row[l];
-          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 2] = l_sad;
-      pSum16x16[mb_index] += l_sum;
-      psqsum16x16[mb_index] += l_sqsum;
-      psqdiff16x16[mb_index] += l_sqdiff;
-
-      l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
-      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
-      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
-          l_sad += diff;
-          l_sqdiff += diff * diff;
-          l_sum += tmp_cur_row[l];
-          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 3] = l_sad;
-      pSum16x16[mb_index] += l_sum;
-      psqsum16x16[mb_index] += l_sqsum;
-      psqdiff16x16[mb_index] += l_sqdiff;
-
-
-      tmp_ref += 16;
-      tmp_cur += 16;
-      ++mb_index;
-    }
-    tmp_ref += step;
-    tmp_cur += step;
-  }
-}
-void VAACalcSadVar_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
-                      int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* psqsum16x16) {
-  uint8_t* tmp_ref = pRefData;
-  uint8_t* tmp_cur = pCurData;
-  int32_t iMbWidth = (iPicWidth >> 4);
-  int32_t mb_heigth = (iPicHeight >> 4);
-  int32_t mb_index = 0;
-  int32_t pic_stride_x8 = iPicStride << 3;
-  int32_t step = (iPicStride << 4) - iPicWidth;
-
-  *pFrameSad = 0;
-  for (int32_t i = 0; i < mb_heigth; i ++) {
-    for (int32_t j = 0; j < iMbWidth; j ++) {
-      int32_t k, l;
-      int32_t l_sad, l_sum, l_sqsum;
-      uint8_t* tmp_cur_row;
-      uint8_t* tmp_ref_row;
-
-      pSum16x16[mb_index] = 0;
-      psqsum16x16[mb_index] = 0;
-
-      l_sad =  l_sum =  l_sqsum = 0;
-      tmp_cur_row = tmp_cur;
-      tmp_ref_row = tmp_ref;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
-          l_sad += diff;
-          l_sum += tmp_cur_row[l];
-          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 0] = l_sad;
-      pSum16x16[mb_index] += l_sum;
-      psqsum16x16[mb_index] += l_sqsum;
-
-      l_sad =  l_sum =  l_sqsum = 0;
-      tmp_cur_row = tmp_cur + 8;
-      tmp_ref_row = tmp_ref + 8;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
-          l_sad += diff;
-          l_sum += tmp_cur_row[l];
-          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 1] = l_sad;
-      pSum16x16[mb_index] += l_sum;
-      psqsum16x16[mb_index] += l_sqsum;
-
-      l_sad =  l_sum =  l_sqsum = 0;
-      tmp_cur_row = tmp_cur + pic_stride_x8;
-      tmp_ref_row = tmp_ref + pic_stride_x8;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
-          l_sad += diff;
-          l_sum += tmp_cur_row[l];
-          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 2] = l_sad;
-      pSum16x16[mb_index] += l_sum;
-      psqsum16x16[mb_index] += l_sqsum;
-
-      l_sad =  l_sum =  l_sqsum = 0;
-      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
-      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
-          l_sad += diff;
-          l_sum += tmp_cur_row[l];
-          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 3] = l_sad;
-      pSum16x16[mb_index] += l_sum;
-      psqsum16x16[mb_index] += l_sqsum;
-
-
-      tmp_ref += 16;
-      tmp_cur += 16;
-      ++mb_index;
-    }
-    tmp_ref += step;
-    tmp_cur += step;
-  }
-}
-
-
-void VAACalcSad_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
-                   int32_t* pFrameSad, int32_t* pSad8x8) {
-  uint8_t* tmp_ref = pRefData;
-  uint8_t* tmp_cur = pCurData;
-  int32_t iMbWidth = (iPicWidth >> 4);
-  int32_t mb_heigth = (iPicHeight >> 4);
-  int32_t mb_index = 0;
-  int32_t pic_stride_x8 = iPicStride << 3;
-  int32_t step = (iPicStride << 4) - iPicWidth;
-
-  *pFrameSad = 0;
-  for (int32_t i = 0; i < mb_heigth; i ++) {
-    for (int32_t j = 0; j < iMbWidth; j ++) {
-      int32_t k, l;
-      int32_t l_sad;
-      uint8_t* tmp_cur_row;
-      uint8_t* tmp_ref_row;
-
-      l_sad =  0;
-      tmp_cur_row = tmp_cur;
-      tmp_ref_row = tmp_ref;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
-          l_sad += diff;
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 0] = l_sad;
-
-      l_sad =  0;
-      tmp_cur_row = tmp_cur + 8;
-      tmp_ref_row = tmp_ref + 8;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
-          l_sad += diff;
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 1] = l_sad;
-
-      l_sad =  0;
-      tmp_cur_row = tmp_cur + pic_stride_x8;
-      tmp_ref_row = tmp_ref + pic_stride_x8;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
-          l_sad += diff;
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 2] = l_sad;
-
-      l_sad =  0;
-      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
-      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
-          l_sad += diff;
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 3] = l_sad;
-
-      tmp_ref += 16;
-      tmp_cur += 16;
-      ++mb_index;
-    }
-    tmp_ref += step;
-    tmp_cur += step;
-  }
-}
-
-void VAACalcSadSsdBgd_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
-                         int32_t iPicStride,
-                         int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* psqsum16x16, int32_t* psqdiff16x16, int32_t* pSd8x8,
-                         uint8_t* pMad8x8)
-
-{
-  uint8_t* tmp_ref = pRefData;
-  uint8_t* tmp_cur = pCurData;
-  int32_t iMbWidth = (iPicWidth >> 4);
-  int32_t mb_heigth = (iPicHeight >> 4);
-  int32_t mb_index = 0;
-  int32_t pic_stride_x8 = iPicStride << 3;
-  int32_t step = (iPicStride << 4) - iPicWidth;
-
-  *pFrameSad = 0;
-  for (int32_t i = 0; i < mb_heigth; i ++) {
-    for (int32_t j = 0; j < iMbWidth; j ++) {
-      int32_t k, l;
-      int32_t l_sad, l_sqdiff, l_sum, l_sqsum, l_sd, l_mad;
-      uint8_t* tmp_cur_row;
-      uint8_t* tmp_ref_row;
-
-      pSum16x16[mb_index] = 0;
-      psqsum16x16[mb_index] = 0;
-      psqdiff16x16[mb_index] = 0;
-
-      l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
-      tmp_cur_row = tmp_cur;
-      tmp_ref_row = tmp_ref;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
-          int32_t abs_diff = WELS_ABS (diff);
-
-          l_sd += diff;
-          if (abs_diff > l_mad) {
-            l_mad = abs_diff;
-          }
-          l_sad += abs_diff;
-          l_sqdiff += abs_diff * abs_diff;
-          l_sum += tmp_cur_row[l];
-          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 0] = l_sad;
-      pSum16x16[mb_index] += l_sum;
-      psqsum16x16[mb_index] += l_sqsum;
-      psqdiff16x16[mb_index] += l_sqdiff;
-      pSd8x8[ (mb_index << 2) + 0] = l_sd;
-      pMad8x8[ (mb_index << 2) + 0] = l_mad;
-
-
-      l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
-      tmp_cur_row = tmp_cur + 8;
-      tmp_ref_row = tmp_ref + 8;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
-          int32_t abs_diff = WELS_ABS (diff);
-
-          l_sd += diff;
-          if (abs_diff > l_mad) {
-            l_mad = abs_diff;
-          }
-          l_sad += abs_diff;
-          l_sqdiff += abs_diff * abs_diff;
-          l_sum += tmp_cur_row[l];
-          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 1] = l_sad;
-      pSum16x16[mb_index] += l_sum;
-      psqsum16x16[mb_index] += l_sqsum;
-      psqdiff16x16[mb_index] += l_sqdiff;
-      pSd8x8[ (mb_index << 2) + 1] = l_sd;
-      pMad8x8[ (mb_index << 2) + 1] = l_mad;
-
-      l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
-      tmp_cur_row = tmp_cur + pic_stride_x8;
-      tmp_ref_row = tmp_ref + pic_stride_x8;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
-          int32_t abs_diff = WELS_ABS (diff);
-
-          l_sd += diff;
-          if (abs_diff > l_mad) {
-            l_mad = abs_diff;
-          }
-          l_sad += abs_diff;
-          l_sqdiff += abs_diff * abs_diff;
-          l_sum += tmp_cur_row[l];
-          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 2] = l_sad;
-      pSum16x16[mb_index] += l_sum;
-      psqsum16x16[mb_index] += l_sqsum;
-      psqdiff16x16[mb_index] += l_sqdiff;
-      pSd8x8[ (mb_index << 2) + 2] = l_sd;
-      pMad8x8[ (mb_index << 2) + 2] = l_mad;
-
-      l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
-      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
-      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
-          int32_t abs_diff = WELS_ABS (diff);
-
-          l_sd += diff;
-          if (abs_diff > l_mad) {
-            l_mad = abs_diff;
-          }
-          l_sad += abs_diff;
-          l_sqdiff += abs_diff * abs_diff;
-          l_sum += tmp_cur_row[l];
-          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 3] = l_sad;
-      pSum16x16[mb_index] += l_sum;
-      psqsum16x16[mb_index] += l_sqsum;
-      psqdiff16x16[mb_index] += l_sqdiff;
-      pSd8x8[ (mb_index << 2) + 3] = l_sd;
-      pMad8x8[ (mb_index << 2) + 3] = l_mad;
-
-      tmp_ref += 16;
-      tmp_cur += 16;
-      ++mb_index;
-    }
-    tmp_ref += step;
-    tmp_cur += step;
-  }
-}
-
-void VAACalcSadBgd_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
-                      int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSd8x8, uint8_t* pMad8x8) {
-  uint8_t* tmp_ref = pRefData;
-  uint8_t* tmp_cur = pCurData;
-  int32_t iMbWidth = (iPicWidth >> 4);
-  int32_t mb_heigth = (iPicHeight >> 4);
-  int32_t mb_index = 0;
-  int32_t pic_stride_x8 = iPicStride << 3;
-  int32_t step = (iPicStride << 4) - iPicWidth;
-
-  *pFrameSad = 0;
-  for (int32_t i = 0; i < mb_heigth; i ++) {
-    for (int32_t j = 0; j < iMbWidth; j ++) {
-      int32_t k, l;
-      int32_t l_sad, l_sd, l_mad;
-      uint8_t* tmp_cur_row;
-      uint8_t* tmp_ref_row;
-
-      l_mad = l_sd = l_sad =  0;
-      tmp_cur_row = tmp_cur;
-      tmp_ref_row = tmp_ref;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
-          int32_t abs_diff = WELS_ABS (diff);
-          l_sd += diff;
-          l_sad += abs_diff;
-          if (abs_diff > l_mad) {
-            l_mad = abs_diff;
-          }
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 0] = l_sad;
-      pSd8x8[ (mb_index << 2) + 0] = l_sd;
-      pMad8x8[ (mb_index << 2) + 0] = l_mad;
-
-      l_mad = l_sd = l_sad =  0;
-      tmp_cur_row = tmp_cur + 8;
-      tmp_ref_row = tmp_ref + 8;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
-          int32_t abs_diff = WELS_ABS (diff);
-          l_sd += diff;
-          l_sad += abs_diff;
-          if (abs_diff > l_mad) {
-            l_mad = abs_diff;
-          }
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 1] = l_sad;
-      pSd8x8[ (mb_index << 2) + 1] = l_sd;
-      pMad8x8[ (mb_index << 2) + 1] = l_mad;
-
-      l_mad = l_sd = l_sad =  0;
-      tmp_cur_row = tmp_cur + pic_stride_x8;
-      tmp_ref_row = tmp_ref + pic_stride_x8;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
-          int32_t abs_diff = WELS_ABS (diff);
-          l_sd += diff;
-          l_sad += abs_diff;
-          if (abs_diff > l_mad) {
-            l_mad = abs_diff;
-          }
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 2] = l_sad;
-      pSd8x8[ (mb_index << 2) + 2] = l_sd;
-      pMad8x8[ (mb_index << 2) + 2] = l_mad;
-
-      l_mad = l_sd = l_sad =  0;
-      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
-      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
-      for (k = 0; k < 8; k ++) {
-        for (l = 0; l < 8; l ++) {
-          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
-          int32_t abs_diff = WELS_ABS (diff);
-          l_sd += diff;
-          l_sad += abs_diff;
-          if (abs_diff > l_mad) {
-            l_mad = abs_diff;
-          }
-        }
-        tmp_cur_row += iPicStride;
-        tmp_ref_row += iPicStride;
-      }
-      *pFrameSad += l_sad;
-      pSad8x8[ (mb_index << 2) + 3] = l_sad;
-      pSd8x8[ (mb_index << 2) + 3] = l_sd;
-      pMad8x8[ (mb_index << 2) + 3] = l_mad;
-
-      tmp_ref += 16;
-      tmp_cur += 16;
-      ++mb_index;
-    }
-    tmp_ref += step;
-    tmp_cur += step;
-  }
-}
-
-WELSVP_NAMESPACE_END
--- a/processing/src/vaacalc/vaacalculation.cpp
+++ /dev/null
@@ -1,123 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "vaacalculation.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CVAACalculation::CVAACalculation (int32_t iCpuFlag) {
-  m_iCPUFlag = iCpuFlag;
-  m_eMethod   = METHOD_VAA_STATISTICS;
-
-  WelsMemset (&m_sCalcParam, 0, sizeof (m_sCalcParam));
-  WelsMemset (&m_sVaaFuncs, 0, sizeof (m_sVaaFuncs));
-  InitVaaFuncs (m_sVaaFuncs, m_iCPUFlag);
-}
-
-CVAACalculation::~CVAACalculation() {
-}
-
-void CVAACalculation::InitVaaFuncs (SVaaFuncs& sVaaFuncs, int32_t iCpuFlag) {
-  sVaaFuncs.pfVAACalcSad				= VAACalcSad_c;
-  sVaaFuncs.pfVAACalcSadBgd			= VAACalcSadBgd_c;
-  sVaaFuncs.pfVAACalcSadSsd			= VAACalcSadSsd_c;
-  sVaaFuncs.pfVAACalcSadSsdBgd		= VAACalcSadSsdBgd_c;
-  sVaaFuncs.pfVAACalcSadVar			= VAACalcSadVar_c;
-#ifdef X86_ASM
-  if ((iCpuFlag & WELS_CPU_SSE2) == WELS_CPU_SSE2) {
-    sVaaFuncs.pfVAACalcSad			= VAACalcSad_sse2;
-    sVaaFuncs.pfVAACalcSadBgd		= VAACalcSadBgd_sse2;
-    sVaaFuncs.pfVAACalcSadSsd		= VAACalcSadSsd_sse2;
-    sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_sse2;
-    sVaaFuncs.pfVAACalcSadVar		= VAACalcSadVar_sse2;
-  }
-#endif//X86_ASM
-}
-
-EResult CVAACalculation::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
-  uint8_t* pCurData	= (uint8_t*)pSrcPixMap->pPixel[0];
-  uint8_t* pRefData	= (uint8_t*)pRefPixMap->pPixel[0];
-  int32_t iPicWidth	= pSrcPixMap->sRect.iRectWidth;
-  int32_t iPicHeight	= pSrcPixMap->sRect.iRectHeight;
-  int32_t iPicStride	= pSrcPixMap->iStride[0];
-
-  SVAACalcResult* pResult = m_sCalcParam.pCalcResult;
-
-  if (pCurData == NULL || pRefData == NULL) {
-    return RET_INVALIDPARAM;
-  }
-
-  pResult->pCurY = pCurData;
-  pResult->pRefY = pRefData;
-  if (m_sCalcParam.iCalcBgd) {
-    if (m_sCalcParam.iCalcSsd) {
-      m_sVaaFuncs.pfVAACalcSadSsdBgd (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
-                                      (int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16, pResult->pSsd16x16,
-                                      (int32_t*)pResult->pSumOfDiff8x8, (uint8_t*)pResult->pMad8x8);
-    } else {
-      m_sVaaFuncs.pfVAACalcSadBgd (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
-                                   (int32_t*) (pResult->pSad8x8), (int32_t*) (pResult->pSumOfDiff8x8), (uint8_t*)pResult->pMad8x8);
-    }
-  } else {
-    if (m_sCalcParam.iCalcSsd) {
-      m_sVaaFuncs.pfVAACalcSadSsd (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
-                                   (int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16, pResult->pSsd16x16);
-    } else {
-      if (m_sCalcParam.iCalcVar) {
-        m_sVaaFuncs.pfVAACalcSadVar (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
-                                     (int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16);
-      } else {
-        m_sVaaFuncs.pfVAACalcSad (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
-                                  (int32_t*)pResult->pSad8x8);
-      }
-    }
-  }
-
-  return RET_SUCCESS;
-}
-
-EResult CVAACalculation::Set (int32_t iType, void* pParam) {
-  if (pParam == NULL || ((SVAACalcParam*)pParam)->pCalcResult == NULL) {
-    return RET_INVALIDPARAM;
-  }
-
-  m_sCalcParam = * (SVAACalcParam*)pParam;
-
-  return RET_SUCCESS;
-}
-
-
-WELSVP_NAMESPACE_END
--- a/processing/src/vaacalc/vaacalculation.h
+++ /dev/null
@@ -1,125 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2011-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	    :  vaacalculation.h
- *
- * \brief	    :  pVaa calculation class of wels video processor class
- *
- * \date        :  2011/03/18
- *
- * \description :  1. rewrite the package code of pVaa calculation class
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_VAACALCULATION_H
-#define WELSVP_VAACALCULATION_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-typedef void (VAACalcSadBgdFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
-                                  int32_t iPicStride,
-                                  int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSd8x8, uint8_t* pMad8x8);
-
-typedef void (VAACalcSadSsdBgdFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
-                                     int32_t iPicStride,
-                                     int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* pSumSquare16x16,
-                                     int32_t* pSsd16x16, int32_t* pSd8x8, uint8_t* pMad8x8);
-
-typedef void (VAACalcSadFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
-                               int32_t iPicStride,
-                               int32_t* pFrameSad, int32_t* pSad8x8);
-
-typedef void (VAACalcSadVarFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
-                                  int32_t iPicStride,
-                                  int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* pSumSquare16x16);
-
-typedef void (VAACalcSadSsdFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
-                                  int32_t iPicStride,
-                                  int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* pSumSquare16x16, int32_t* pSsd16x16);
-
-
-typedef VAACalcSadBgdFunc*		 PVAACalcSadBgdFunc;
-typedef VAACalcSadSsdBgdFunc*	 PVAACalcSadSsdBgdFunc;
-typedef VAACalcSadFunc*			 PVAACalcSadFunc;
-typedef VAACalcSadVarFunc*		 PVAACalcSadVarFunc;
-typedef VAACalcSadSsdFunc*		 PVAACalcSadSsdFunc;
-
-typedef  struct TagVaaFuncs {
-  PVAACalcSadBgdFunc		pfVAACalcSadBgd;
-  PVAACalcSadSsdBgdFunc	pfVAACalcSadSsdBgd;
-  PVAACalcSadFunc			pfVAACalcSad;
-  PVAACalcSadVarFunc		pfVAACalcSadVar;
-  PVAACalcSadSsdFunc		pfVAACalcSadSsd;
-} SVaaFuncs;
-
-
-VAACalcSadBgdFunc		VAACalcSadBgd_c;
-VAACalcSadSsdBgdFunc	VAACalcSadSsdBgd_c;
-VAACalcSadFunc			    VAACalcSad_c;
-VAACalcSadVarFunc		VAACalcSadVar_c;
-VAACalcSadSsdFunc		VAACalcSadSsd_c;
-
-
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-VAACalcSadBgdFunc		VAACalcSadBgd_sse2;
-VAACalcSadSsdBgdFunc	VAACalcSadSsdBgd_sse2;
-VAACalcSadFunc			    VAACalcSad_sse2;
-VAACalcSadVarFunc		VAACalcSadVar_sse2;
-VAACalcSadSsdFunc		VAACalcSadSsd_sse2;
-WELSVP_EXTERN_C_END
-#endif
-
-class CVAACalculation : public IStrategy {
- public:
-  CVAACalculation (int32_t iCpuFlag);
-  ~CVAACalculation();
-
-  EResult Process (int32_t iType, SPixMap* pCurPixMap, SPixMap* pRefPixMap);
-  EResult Set (int32_t iType, void* pParam);
-
- private:
-  void InitVaaFuncs (SVaaFuncs& sVaaFunc, int32_t iCpuFlag);
-
- private:
-  SVaaFuncs      m_sVaaFuncs;
-  int32_t       m_iCPUFlag;
-  SVAACalcParam m_sCalcParam;
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/targets.mk
+++ /dev/null
@@ -1,122 +1,0 @@
-PROCESSING_PREFIX=PROCESSING
-PROCESSING_SRCDIR=processing
-PROCESSING_CPP_SRCS=\
-	$(PROCESSING_SRCDIR)/./src/adaptivequantization/AdaptiveQuantization.cpp\
-	$(PROCESSING_SRCDIR)/./src/backgounddetection/BackgroundDetection.cpp\
-	$(PROCESSING_SRCDIR)/./src/common/cpu.cpp\
-	$(PROCESSING_SRCDIR)/./src/common/memory.cpp\
-	$(PROCESSING_SRCDIR)/./src/common/thread.cpp\
-	$(PROCESSING_SRCDIR)/./src/common/util.cpp\
-	$(PROCESSING_SRCDIR)/./src/common/WelsFrameWork.cpp\
-	$(PROCESSING_SRCDIR)/./src/common/WelsFrameWorkEx.cpp\
-	$(PROCESSING_SRCDIR)/./src/complexityanalysis/ComplexityAnalysis.cpp\
-	$(PROCESSING_SRCDIR)/./src/denoise/denoise.cpp\
-	$(PROCESSING_SRCDIR)/./src/denoise/denoise_filter.cpp\
-	$(PROCESSING_SRCDIR)/./src/downsample/downsample.cpp\
-	$(PROCESSING_SRCDIR)/./src/downsample/downsamplefuncs.cpp\
-	$(PROCESSING_SRCDIR)/./src/imagerotate/imagerotate.cpp\
-	$(PROCESSING_SRCDIR)/./src/imagerotate/imagerotatefuncs.cpp\
-	$(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetection.cpp\
-	$(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetectionCommon.cpp\
-	$(PROCESSING_SRCDIR)/./src/vaacalc/vaacalcfuncs.cpp\
-	$(PROCESSING_SRCDIR)/./src/vaacalc/vaacalculation.cpp\
-
-PROCESSING_OBJS += $(PROCESSING_CPP_SRCS:.cpp=.o)
-ifeq ($(USE_ASM), Yes)
-PROCESSING_ASM_SRCS=\
-	$(PROCESSING_SRCDIR)/./src/asm/asm_inc.asm\
-	$(PROCESSING_SRCDIR)/./src/asm/cpuid.asm\
-	$(PROCESSING_SRCDIR)/./src/asm/denoisefilter.asm\
-	$(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.asm\
-	$(PROCESSING_SRCDIR)/./src/asm/intra_pred.asm\
-	$(PROCESSING_SRCDIR)/./src/asm/sad.asm\
-	$(PROCESSING_SRCDIR)/./src/asm/vaa.asm\
-
-PROCESSING_OBJS += $(PROCESSING_ASM_SRCS:.asm=.o)
-endif
-
-OBJS += $(PROCESSING_OBJS)
-$(PROCESSING_SRCDIR)/./src/adaptivequantization/AdaptiveQuantization.o: $(PROCESSING_SRCDIR)/./src/adaptivequantization/AdaptiveQuantization.cpp
-	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/adaptivequantization/AdaptiveQuantization.o $(PROCESSING_SRCDIR)/./src/adaptivequantization/AdaptiveQuantization.cpp
-
-$(PROCESSING_SRCDIR)/./src/backgounddetection/BackgroundDetection.o: $(PROCESSING_SRCDIR)/./src/backgounddetection/BackgroundDetection.cpp
-	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/backgounddetection/BackgroundDetection.o $(PROCESSING_SRCDIR)/./src/backgounddetection/BackgroundDetection.cpp
-
-$(PROCESSING_SRCDIR)/./src/common/cpu.o: $(PROCESSING_SRCDIR)/./src/common/cpu.cpp
-	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/common/cpu.o $(PROCESSING_SRCDIR)/./src/common/cpu.cpp
-
-$(PROCESSING_SRCDIR)/./src/common/memory.o: $(PROCESSING_SRCDIR)/./src/common/memory.cpp
-	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/common/memory.o $(PROCESSING_SRCDIR)/./src/common/memory.cpp
-
-$(PROCESSING_SRCDIR)/./src/common/thread.o: $(PROCESSING_SRCDIR)/./src/common/thread.cpp
-	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/common/thread.o $(PROCESSING_SRCDIR)/./src/common/thread.cpp
-
-$(PROCESSING_SRCDIR)/./src/common/util.o: $(PROCESSING_SRCDIR)/./src/common/util.cpp
-	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/common/util.o $(PROCESSING_SRCDIR)/./src/common/util.cpp
-
-$(PROCESSING_SRCDIR)/./src/common/WelsFrameWork.o: $(PROCESSING_SRCDIR)/./src/common/WelsFrameWork.cpp
-	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/common/WelsFrameWork.o $(PROCESSING_SRCDIR)/./src/common/WelsFrameWork.cpp
-
-$(PROCESSING_SRCDIR)/./src/common/WelsFrameWorkEx.o: $(PROCESSING_SRCDIR)/./src/common/WelsFrameWorkEx.cpp
-	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/common/WelsFrameWorkEx.o $(PROCESSING_SRCDIR)/./src/common/WelsFrameWorkEx.cpp
-
-$(PROCESSING_SRCDIR)/./src/complexityanalysis/ComplexityAnalysis.o: $(PROCESSING_SRCDIR)/./src/complexityanalysis/ComplexityAnalysis.cpp
-	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/complexityanalysis/ComplexityAnalysis.o $(PROCESSING_SRCDIR)/./src/complexityanalysis/ComplexityAnalysis.cpp
-
-$(PROCESSING_SRCDIR)/./src/denoise/denoise.o: $(PROCESSING_SRCDIR)/./src/denoise/denoise.cpp
-	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/denoise/denoise.o $(PROCESSING_SRCDIR)/./src/denoise/denoise.cpp
-
-$(PROCESSING_SRCDIR)/./src/denoise/denoise_filter.o: $(PROCESSING_SRCDIR)/./src/denoise/denoise_filter.cpp
-	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/denoise/denoise_filter.o $(PROCESSING_SRCDIR)/./src/denoise/denoise_filter.cpp
-
-$(PROCESSING_SRCDIR)/./src/downsample/downsample.o: $(PROCESSING_SRCDIR)/./src/downsample/downsample.cpp
-	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/downsample/downsample.o $(PROCESSING_SRCDIR)/./src/downsample/downsample.cpp
-
-$(PROCESSING_SRCDIR)/./src/downsample/downsamplefuncs.o: $(PROCESSING_SRCDIR)/./src/downsample/downsamplefuncs.cpp
-	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/downsample/downsamplefuncs.o $(PROCESSING_SRCDIR)/./src/downsample/downsamplefuncs.cpp
-
-$(PROCESSING_SRCDIR)/./src/imagerotate/imagerotate.o: $(PROCESSING_SRCDIR)/./src/imagerotate/imagerotate.cpp
-	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/imagerotate/imagerotate.o $(PROCESSING_SRCDIR)/./src/imagerotate/imagerotate.cpp
-
-$(PROCESSING_SRCDIR)/./src/imagerotate/imagerotatefuncs.o: $(PROCESSING_SRCDIR)/./src/imagerotate/imagerotatefuncs.cpp
-	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/imagerotate/imagerotatefuncs.o $(PROCESSING_SRCDIR)/./src/imagerotate/imagerotatefuncs.cpp
-
-$(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetection.o: $(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetection.cpp
-	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetection.o $(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetection.cpp
-
-$(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetectionCommon.o: $(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetectionCommon.cpp
-	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetectionCommon.o $(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetectionCommon.cpp
-
-$(PROCESSING_SRCDIR)/./src/vaacalc/vaacalcfuncs.o: $(PROCESSING_SRCDIR)/./src/vaacalc/vaacalcfuncs.cpp
-	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/vaacalc/vaacalcfuncs.o $(PROCESSING_SRCDIR)/./src/vaacalc/vaacalcfuncs.cpp
-
-$(PROCESSING_SRCDIR)/./src/vaacalc/vaacalculation.o: $(PROCESSING_SRCDIR)/./src/vaacalc/vaacalculation.cpp
-	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/vaacalc/vaacalculation.o $(PROCESSING_SRCDIR)/./src/vaacalc/vaacalculation.cpp
-
-$(PROCESSING_SRCDIR)/./src/asm/asm_inc.o: $(PROCESSING_SRCDIR)/./src/asm/asm_inc.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $(PROCESSING_SRCDIR)/./src/asm/asm_inc.o $(PROCESSING_SRCDIR)/./src/asm/asm_inc.asm
-
-$(PROCESSING_SRCDIR)/./src/asm/cpuid.o: $(PROCESSING_SRCDIR)/./src/asm/cpuid.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $(PROCESSING_SRCDIR)/./src/asm/cpuid.o $(PROCESSING_SRCDIR)/./src/asm/cpuid.asm
-
-$(PROCESSING_SRCDIR)/./src/asm/denoisefilter.o: $(PROCESSING_SRCDIR)/./src/asm/denoisefilter.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $(PROCESSING_SRCDIR)/./src/asm/denoisefilter.o $(PROCESSING_SRCDIR)/./src/asm/denoisefilter.asm
-
-$(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.o: $(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.o $(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.asm
-
-$(PROCESSING_SRCDIR)/./src/asm/intra_pred.o: $(PROCESSING_SRCDIR)/./src/asm/intra_pred.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $(PROCESSING_SRCDIR)/./src/asm/intra_pred.o $(PROCESSING_SRCDIR)/./src/asm/intra_pred.asm
-
-$(PROCESSING_SRCDIR)/./src/asm/sad.o: $(PROCESSING_SRCDIR)/./src/asm/sad.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $(PROCESSING_SRCDIR)/./src/asm/sad.o $(PROCESSING_SRCDIR)/./src/asm/sad.asm
-
-$(PROCESSING_SRCDIR)/./src/asm/vaa.o: $(PROCESSING_SRCDIR)/./src/asm/vaa.asm
-	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $(PROCESSING_SRCDIR)/./src/asm/vaa.o $(PROCESSING_SRCDIR)/./src/asm/vaa.asm
-
-$(LIBPREFIX)processing.$(LIBSUFFIX): $(PROCESSING_OBJS)
-	rm -f $(LIBPREFIX)processing.$(LIBSUFFIX)
-	$(AR) cr $@ $(PROCESSING_OBJS)
-
-libraries: $(LIBPREFIX)processing.$(LIBSUFFIX)
-LIBRARIES += $(LIBPREFIX)processing.$(LIBSUFFIX)