shithub: openh264

Download patch

ref: 5c9f447c0ea19870c48b27c2d1d43af2d42eb579
parent: ae027b83d8f90c0517a37927d28e3483e69747c4
author: volvet <qizh@cisco.com>
date: Tue Jan 21 06:16:48 EST 2014

fix win64 float issue, enable AQ assembly

--- a/codec/build/win32/enc/WelsEncPlus.vcproj
+++ b/codec/build/win32/enc/WelsEncPlus.vcproj
@@ -53,7 +53,7 @@
 				Name="VCCLCompilerTool"
 				Optimization="0"
 				AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\WelsThreadLib\api"
-				PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;"
+				PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;X86_ASM"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
 				RuntimeLibrary="3"
@@ -118,9 +118,9 @@
 			/>
 		</Configuration>
 		<Configuration
-			Name="Release|Win32"
-			OutputDirectory=".\..\..\..\..\bin\win32\Release"
-			IntermediateDirectory=".\..\..\..\obj\encoder\plus\Release"
+			Name="Debug|x64"
+			OutputDirectory=".\..\..\..\..\bin\win64\Debug"
+			IntermediateDirectory=".\..\..\..\obj\encoder\plus\Debug"
 			ConfigurationType="2"
 			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
 			UseOfMFC="0"
@@ -141,29 +141,25 @@
 			/>
 			<Tool
 				Name="VCMIDLTool"
-				PreprocessorDefinitions="NDEBUG"
+				PreprocessorDefinitions="_DEBUG"
 				MkTypLibCompatible="true"
 				SuppressStartupBanner="true"
-				TargetEnvironment="1"
-				TypeLibraryName=".\..\..\..\..\..\bin\Release/WelsEncPlus.tlb"
+				TargetEnvironment="3"
+				TypeLibraryName=".\..\..\..\..\..\bin\Debug/WelsEncPlus.tlb"
 				HeaderFileName=""
 			/>
 			<Tool
 				Name="VCCLCompilerTool"
-				Optimization="3"
-				InlineFunctionExpansion="2"
-				FavorSizeOrSpeed="1"
-				EnableFiberSafeOptimizations="true"
-				WholeProgramOptimization="true"
+				Optimization="0"
 				AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\WelsThreadLib\api"
-				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;"
-				StringPooling="true"
-				RuntimeLibrary="2"
-				EnableFunctionLevelLinking="true"
-				PrecompiledHeaderFile=".\..\..\..\obj\encoder\plus\Release/WelsEncPlus.pch"
-				AssemblerListingLocation=".\..\..\..\obj\encoder\plus\Release/"
-				ObjectFile=".\..\..\..\obj\encoder\plus\Release/"
-				ProgramDataBaseFileName=".\..\..\..\obj\encoder\plus\Release/"
+				PreprocessorDefinitions="WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;X86_ASM"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				PrecompiledHeaderFile=".\..\..\..\obj\encoder\plus\Debug/WelsEncPlus.pch"
+				AssemblerListingLocation=".\..\..\..\obj\encoder\plus\Debug/"
+				ObjectFile=".\..\..\..\obj\encoder\plus\Debug/"
+				ProgramDataBaseFileName=".\..\..\..\obj\encoder\plus\Debug/"
 				WarningLevel="3"
 				SuppressStartupBanner="true"
 				DebugInformationFormat="3"
@@ -173,7 +169,7 @@
 			/>
 			<Tool
 				Name="VCResourceCompilerTool"
-				PreprocessorDefinitions="NDEBUG"
+				PreprocessorDefinitions="_DEBUG"
 				Culture="1033"
 			/>
 			<Tool
@@ -181,22 +177,20 @@
 			/>
 			<Tool
 				Name="VCLinkerTool"
-				AdditionalOptions="/MAPINFO:exports /LTCG"
 				AdditionalDependencies="$(OutDir)\welsecore.lib"
 				OutputFile="$(OutDir)\welsenc.dll"
-				LinkIncremental="1"
+				LinkIncremental="2"
 				SuppressStartupBanner="true"
 				AdditionalLibraryDirectories="..\..\..\..\libs"
 				ModuleDefinitionFile="..\..\..\encoder\plus\src\wels_enc_export.def"
 				GenerateDebugInformation="true"
 				ProgramDatabaseFile="$(OutDir)\welsenc.pdb"
-				GenerateMapFile="false"
-				MapFileName=""
-				MapExports="false"
+				GenerateMapFile="true"
+				MapFileName="$(OutDir)\welsenc.map"
 				RandomizedBaseAddress="1"
 				DataExecutionPrevention="2"
 				ImportLibrary="$(OutDir)\welsenc.lib"
-				TargetMachine="1"
+				TargetMachine="17"
 			/>
 			<Tool
 				Name="VCALinkTool"
@@ -223,9 +217,9 @@
 			/>
 		</Configuration>
 		<Configuration
-			Name="Debug|x64"
-			OutputDirectory=".\..\..\..\..\bin\win64\Debug"
-			IntermediateDirectory=".\..\..\..\obj\encoder\plus\Debug"
+			Name="Release|Win32"
+			OutputDirectory=".\..\..\..\..\bin\win32\Release"
+			IntermediateDirectory=".\..\..\..\obj\encoder\plus\Release"
 			ConfigurationType="2"
 			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
 			UseOfMFC="0"
@@ -246,25 +240,29 @@
 			/>
 			<Tool
 				Name="VCMIDLTool"
-				PreprocessorDefinitions="_DEBUG"
+				PreprocessorDefinitions="NDEBUG"
 				MkTypLibCompatible="true"
 				SuppressStartupBanner="true"
-				TargetEnvironment="3"
-				TypeLibraryName=".\..\..\..\..\..\bin\Debug/WelsEncPlus.tlb"
+				TargetEnvironment="1"
+				TypeLibraryName=".\..\..\..\..\..\bin\Release/WelsEncPlus.tlb"
 				HeaderFileName=""
 			/>
 			<Tool
 				Name="VCCLCompilerTool"
-				Optimization="0"
+				Optimization="3"
+				InlineFunctionExpansion="2"
+				FavorSizeOrSpeed="1"
+				EnableFiberSafeOptimizations="true"
+				WholeProgramOptimization="true"
 				AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\WelsThreadLib\api"
-				PreprocessorDefinitions="WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED"
-				MinimalRebuild="true"
-				BasicRuntimeChecks="3"
-				RuntimeLibrary="3"
-				PrecompiledHeaderFile=".\..\..\..\obj\encoder\plus\Debug/WelsEncPlus.pch"
-				AssemblerListingLocation=".\..\..\..\obj\encoder\plus\Debug/"
-				ObjectFile=".\..\..\..\obj\encoder\plus\Debug/"
-				ProgramDataBaseFileName=".\..\..\..\obj\encoder\plus\Debug/"
+				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;X86_ASM"
+				StringPooling="true"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="true"
+				PrecompiledHeaderFile=".\..\..\..\obj\encoder\plus\Release/WelsEncPlus.pch"
+				AssemblerListingLocation=".\..\..\..\obj\encoder\plus\Release/"
+				ObjectFile=".\..\..\..\obj\encoder\plus\Release/"
+				ProgramDataBaseFileName=".\..\..\..\obj\encoder\plus\Release/"
 				WarningLevel="3"
 				SuppressStartupBanner="true"
 				DebugInformationFormat="3"
@@ -274,7 +272,7 @@
 			/>
 			<Tool
 				Name="VCResourceCompilerTool"
-				PreprocessorDefinitions="_DEBUG"
+				PreprocessorDefinitions="NDEBUG"
 				Culture="1033"
 			/>
 			<Tool
@@ -282,20 +280,22 @@
 			/>
 			<Tool
 				Name="VCLinkerTool"
+				AdditionalOptions="/MAPINFO:exports /LTCG"
 				AdditionalDependencies="$(OutDir)\welsecore.lib"
 				OutputFile="$(OutDir)\welsenc.dll"
-				LinkIncremental="2"
+				LinkIncremental="1"
 				SuppressStartupBanner="true"
 				AdditionalLibraryDirectories="..\..\..\..\libs"
 				ModuleDefinitionFile="..\..\..\encoder\plus\src\wels_enc_export.def"
 				GenerateDebugInformation="true"
 				ProgramDatabaseFile="$(OutDir)\welsenc.pdb"
-				GenerateMapFile="true"
-				MapFileName="$(OutDir)\welsenc.map"
+				GenerateMapFile="false"
+				MapFileName=""
+				MapExports="false"
 				RandomizedBaseAddress="1"
 				DataExecutionPrevention="2"
 				ImportLibrary="$(OutDir)\welsenc.lib"
-				TargetMachine="17"
+				TargetMachine="1"
 			/>
 			<Tool
 				Name="VCALinkTool"
@@ -360,7 +360,7 @@
 				EnableFiberSafeOptimizations="true"
 				WholeProgramOptimization="true"
 				AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\WelsThreadLib\api"
-				PreprocessorDefinitions="WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;"
+				PreprocessorDefinitions="WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;X86_ASM"
 				StringPooling="true"
 				RuntimeLibrary="2"
 				EnableFunctionLevelLinking="true"
@@ -447,7 +447,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -456,7 +456,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -491,7 +491,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -500,7 +500,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -531,7 +531,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -540,7 +540,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@@ -589,7 +589,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|Win32"
+					Name="Debug|x64"
 					>
 					<Tool
 						Name="VCResourceCompilerTool"
@@ -598,7 +598,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Debug|x64"
+					Name="Release|Win32"
 					>
 					<Tool
 						Name="VCResourceCompilerTool"
--- a/codec/common/cpu.cpp
+++ b/codec/common/cpu.cpp
@@ -204,6 +204,9 @@
   }
 }
 
+void WelsXmmRegEmptyOp(void * pSrc) {
+}
+
 #endif
 
 
--- a/codec/common/cpu.h
+++ b/codec/common/cpu.h
@@ -41,9 +41,9 @@
 #define WELS_CPU_DETECTION_H__
 
 #include "typedefs.h"
+#include "cpu_core.h"
 
 
-
 #if defined(__cplusplus)
 extern "C" {
 #endif//__cplusplus
@@ -69,12 +69,56 @@
  */
 void     WelsCPURestore (const uint32_t kuiCPU);
 
+#ifdef  WIN64
+void     WelsXmmRegStore(void * src);
+void     WelsXmmRegLoad(void * src);
 #endif
 
+#endif
+
+void     WelsXmmRegEmptyOp(void * pSrc);
+
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
 
+typedef  void (*WelsXmmRegProtectFunc)(void * pSrc);
 
+
+/* Win64 treats xmm6-xmm15 as callee-saved; these macros snapshot/restore them
+ * around calls into SSE2 assembly. They need the X86_ASM build as well, since
+ * the WelsXmmReg* helpers are assembly routines (NOTE: the enclosing guard of
+ * their prototypes is assumed to be X86_ASM -- confirm). */
+#if defined(WIN64) && defined(X86_ASM)
+/* Members: load/store hooks and a 10 x 16-byte save area (xmm6..xmm15). */
+#define   XMMREG_PROTECT_DECLARE(name) \
+  WelsXmmRegProtectFunc name##load;\
+  WelsXmmRegProtectFunc name##store;\
+  uint8_t               name##Buffer[160];
+
+/* Bind the hooks once: real save/restore on SSE2 CPUs, no-ops otherwise. */
+#define   XMMREG_PROTECT_INIT(name) \
+  { \
+    const uint32_t uiSse2 = WelsCPUFeatureDetect(NULL) & WELS_CPU_SSE2; \
+    name##load  = uiSse2 ? WelsXmmRegLoad  : WelsXmmRegEmptyOp; \
+    name##store = uiSse2 ? WelsXmmRegStore : WelsXmmRegEmptyOp; \
+  }
+
+/* Nothing to release: the save area is a plain member array. */
+#define   XMMREG_PROTECT_UNINIT(name)
+/* Save xmm6-xmm15 into the buffer (use right before the assembly call). */
+#define   XMMREG_PROTECT_STORE(name)  name##store(name##Buffer);
+/* Reload xmm6-xmm15 from the buffer (use right after the assembly call). */
+#define   XMMREG_PROTECT_LOAD(name)   name##load(name##Buffer);
+
+#else
+/* Other targets: every hook compiles away to nothing. */
+#define   XMMREG_PROTECT_DECLARE(name)
+#define   XMMREG_PROTECT_INIT(name)
+#define   XMMREG_PROTECT_UNINIT(name)
+#define   XMMREG_PROTECT_STORE(name)
+#define   XMMREG_PROTECT_LOAD(name)
+
+#endif
 
 #endif//WELS_CPU_DETECTION_H__
--- a/codec/common/cpuid.asm
+++ b/codec/common/cpuid.asm
@@ -221,4 +221,43 @@
 	ret
 
 
+%ifdef     WIN64
+; xmm6-xmm15 are callee-saved under the Microsoft x64 ABI; these two helpers snapshot/restore them.
+WELS_EXTERN WelsXmmRegStore
+ALIGN 16
+;******************************************************************************************
+;   void WelsXmmRegStore(void *src) -- save xmm6..xmm15 into the 160 bytes at src (unaligned OK)
+;******************************************************************************************
+WelsXmmRegStore:
+  movdqu [rcx], xmm6          ; rcx = src (first Win64 integer argument)
+  movdqu [rcx+16], xmm7       ; one 16-byte slot per register, in register order
+  movdqu [rcx+32], xmm8
+  movdqu [rcx+48], xmm9
+  movdqu [rcx+64], xmm10
+  movdqu [rcx+80], xmm11
+  movdqu [rcx+96], xmm12
+  movdqu [rcx+112], xmm13
+  movdqu [rcx+128], xmm14
+  movdqu [rcx+144], xmm15     ; last slot ends at byte 160
+  ret
+; Mirror of WelsXmmRegStore: identical slot layout, opposite copy direction.
+WELS_EXTERN WelsXmmRegLoad
+ALIGN 16
+;******************************************************************************************
+;   void WelsXmmRegLoad(void *src) -- reload xmm6..xmm15 from the 160 bytes at src (unaligned OK)
+;******************************************************************************************
+WelsXmmRegLoad:
+  movdqu xmm6, [rcx]          ; rcx = src (first Win64 integer argument)
+  movdqu xmm7, [rcx+16]       ; slots are read back in the order Store wrote them
+  movdqu xmm8, [rcx+32]
+  movdqu xmm9, [rcx+48]
+  movdqu xmm10, [rcx+64]
+  movdqu xmm11, [rcx+80]
+  movdqu xmm12, [rcx+96]
+  movdqu xmm13, [rcx+112]
+  movdqu xmm14, [rcx+128]
+  movdqu xmm15, [rcx+144]     ; last slot ends at byte 160
+  ret
+%endif
+
 
--- a/codec/encoder/plus/inc/welsEncoderExt.h
+++ b/codec/encoder/plus/inc/welsEncoderExt.h
@@ -49,6 +49,7 @@
 #include "encoder_context.h"
 #include "param_svc.h"
 #include "extern.h"
+#include "cpu.h"
 
 //#define OUTPUT_BIT_STREAM
 //#define DUMP_SRC_PICTURE
@@ -129,6 +130,8 @@
   void    InitEncoder (void);
   int32_t RawData2SrcPic (const uint8_t* pSrc);
   void    DumpSrcPicture (const uint8_t* pSrc);
+
+  XMMREG_PROTECT_DECLARE(CWelsH264SVCEncoder);
 };
 }
 #endif // !defined(AFX_WELSH264ENCODER_H__D9FAA1D1_5403_47E1_8E27_78F11EE65F02__INCLUDED_)
--- a/codec/encoder/plus/src/welsEncoderExt.cpp
+++ b/codec/encoder/plus/src/welsEncoderExt.cpp
@@ -218,6 +218,7 @@
 #endif//OUTPUT_BIT_STREAM
 
   InitEncoder();
+  XMMREG_PROTECT_INIT(CWelsH264SVCEncoder);
 }
 
 CWelsH264SVCEncoder::~CWelsH264SVCEncoder() {
@@ -253,6 +254,7 @@
 #endif//OUTPUT_BIT_STREAM
 
   Uninitialize();
+  XMMREG_PROTECT_UNINIT(CWelsH264SVCEncoder);
 }
 
 void CWelsH264SVCEncoder::InitEncoder (void) {
@@ -628,7 +630,9 @@
   int32_t iFrameType = videoFrameTypeInvalid;
 
   if (nSrcPicNum > 0) {
+    XMMREG_PROTECT_STORE(CWelsH264SVCEncoder);
     iFrameTypeReturned = WelsEncoderEncodeExt (m_pEncContext, pBsInfo, pSrcPicList, nSrcPicNum);
+    XMMREG_PROTECT_LOAD(CWelsH264SVCEncoder);
   } else {
     assert (0);
     return videoFrameTypeInvalid;
binary files a/codec/processing/build/win32/WelsVP_2008.suo b/codec/processing/build/win32/WelsVP_2008.suo differ
--- a/codec/processing/build/win32/WelsVP_2008.vcproj
+++ b/codec/processing/build/win32/WelsVP_2008.vcproj
@@ -137,7 +137,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories=""
+				AdditionalIncludeDirectories="../../../common/"
 				PreprocessorDefinitions="WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -313,6 +313,7 @@
 				Optimization="3"
 				EnableIntrinsicFunctions="false"
 				FavorSizeOrSpeed="1"
+				AdditionalIncludeDirectories="../../../common/"
 				PreprocessorDefinitions="WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
 				RuntimeLibrary="0"
 				EnableFunctionLevelLinking="false"
@@ -378,7 +379,7 @@
 			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
 			>
 			<File
-				RelativePath="..\..\src\common\cpu.cpp"
+				RelativePath="..\..\..\common\cpu.cpp"
 				>
 			</File>
 			<File
@@ -496,7 +497,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -514,7 +515,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -545,7 +546,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -576,7 +577,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -594,7 +595,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -607,7 +608,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -616,7 +617,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -625,7 +626,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -634,7 +635,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -647,7 +648,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -656,7 +657,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -665,7 +666,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -674,7 +675,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -696,7 +697,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -714,7 +715,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
--- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp
+++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp
@@ -50,9 +50,11 @@
   m_pfVar   = NULL;
   WelsMemset (&m_sAdaptiveQuantParam, 0, sizeof (m_sAdaptiveQuantParam));
   WelsInitVarFunc (m_pfVar, m_CPUFlag);
+  XMMREG_PROTECT_INIT(AdaptiveQuantization);
 }
 
 CAdaptiveQuantization::~CAdaptiveQuantization() {
+  XMMREG_PROTECT_UNINIT(AdaptiveQuantization);
 }
 
 EResult CAdaptiveQuantization::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
@@ -101,6 +103,7 @@
       pRefFrameTmp  = pRefFrameY;
       pCurFrameTmp  = pCurFrameY;
       for (i = 0; i < iMbWidth; i++) {
+        XMMREG_PROTECT_STORE(AdaptiveQuantization);
         iSumDiff =  pVaaCalcResults->pSad8x8[iMbIndex][0];
         iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][1];
         iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][2];
@@ -109,6 +112,7 @@
         iSQDiff = pVaaCalcResults->pSsd16x16[iMbIndex];
         uiSum = pVaaCalcResults->pSum16x16[iMbIndex];
         iSQSum = pVaaCalcResults->pSumOfSquare16x16[iMbIndex];
+        XMMREG_PROTECT_LOAD(AdaptiveQuantization);
 
         iSumDiff = iSumDiff >> 8;
         pMotionTexture->uiMotionIndex = (iSQDiff >> 8) - (iSumDiff * iSumDiff);
@@ -131,7 +135,9 @@
       pRefFrameTmp  = pRefFrameY;
       pCurFrameTmp  = pCurFrameY;
       for (i = 0; i < iMbWidth; i++) {
+        XMMREG_PROTECT_STORE(AdaptiveQuantization);
         m_pfVar (pRefFrameTmp, iRefStride, pCurFrameTmp, iCurStride, pMotionTexture);
+        XMMREG_PROTECT_LOAD(AdaptiveQuantization);
         dAverageMotionIndex += pMotionTexture->uiMotionIndex;
         dAverageTextureIndex += pMotionTexture->uiTextureIndex;
         pMotionTexture++;
@@ -223,7 +229,7 @@
 
 #ifdef X86_ASM
   if (iCpuFlag & WELS_CPU_SSE2) {
-    // pfVar = SampleVariance16x16_sse2;
+    pfVar = SampleVariance16x16_sse2;
   }
 #endif
 }
--- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.h
+++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.h
@@ -45,6 +45,7 @@
 #include "../common/memory.h"
 #include "../common/WelsFrameWork.h"
 #include "../../interface/IWelsVP.h"
+#include "cpu.h"
 
 WELSVP_NAMESPACE_BEGIN
 
@@ -78,6 +79,7 @@
   PVarFunc			                   m_pfVar;
   int32_t                                  m_CPUFlag;
   SAdaptiveQuantizationParam    m_sAdaptiveQuantParam;
+  XMMREG_PROTECT_DECLARE(AdaptiveQuantization);
 };
 
 WELSVP_NAMESPACE_END
--- a/codec/processing/src/asm/vaa.asm
+++ b/codec/processing/src/asm/vaa.asm
@@ -29,291 +29,211 @@
 ;*     POSSIBILITY OF SUCH DAMAGE.
 ;*
 ;*
-;*	vaa.asm
+;*      vaa.asm
 ;*
-;*	Abstract
+;*      Abstract
 ;*      sse2 for pVaa routines
 ;*
 ;*  History
-;*      04/14/2010	Created
+;*      04/14/2010      Created
+;*      06/07/2010      Added AnalysisVaaInfoIntra_sse2(ssse3)
+;*      06/10/2010      Tune rc_sad_frame_sse2 and got about 40% improvement
+;*      08/11/2010      Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
 ;*
 ;*************************************************************************/
 %include "asm_inc.asm"
-%ifdef X86_32
+
+
 ;***********************************************************************
 ; Macros and other preprocessor constants
 ;***********************************************************************
+%macro SUM_SQR_SSE2     3       ; dst, pSrc (clobbered), zero: dst dwords = sum of squares of pSrc's 16 bytes
+  movdqa %1, %2           ; keep a copy so both byte halves can be widened
+  punpcklbw %1, %3        ; low 8 bytes -> words (interleaved with %3, expected to be all zero)
+  punpckhbw %2, %3        ; high 8 bytes -> words
+  pmaddwd %1, %1          ; square each word and add adjacent pairs -> 4 dwords
+  pmaddwd %2, %2
+  paddd %1, %2            ; 4 partial sums covering all 16 bytes
+  pshufd %2, %1, 04Eh   ; 01001110 B: swap the two 64-bit halves
+  paddd %1, %2
+  pshufd %2, %1, 0B1h   ; 10110001 B: swap adjacent dwords
+  paddd %1, %2            ; every dword lane of %1 now holds the full total
+%endmacro       ; END OF SUM_SQR_SSE2
 
-;%macro SUM_SSE2	4	; dst, pSrc, zero, pack1_8x2
-;	movdqa %1, %2
-;	punpcklbw %1, %3
-;	punpckhbw %2, %3
-;	paddw %1, %2
-;	pmaddwd %1, %4
-;	pshufd %2, %1, 04Eh	; 01001110 B
-;	paddd %1, %2
-;	pshufd %2, %1, 0B1h	; 10110001 B
-;	paddd %1, %2
-;%endmacro	; END OF SUM_SSE2
+%macro WELS_SAD_16x2_SSE2  3 ; %1/%2: the two source pointers, %3: byte step between rows (formerly esi/edi/ebx)
+  movdqa        xmm1,   [%1]        ; movdqa: both pointers must be 16-byte aligned
+  movdqa        xmm2,   [%2]
+  movdqa        xmm3,   [%1+%3]     ; second row of each source
+  movdqa        xmm4,   [%2+%3]
+  psadbw        xmm1,   xmm2        ; SAD of row 0 (one partial sum per 8-byte half)
+  psadbw        xmm3,   xmm4        ; SAD of row 1
+  paddd xmm6,   xmm1                ; accumulate into xmm6 (callee-saved under the Win64 ABI)
+  paddd xmm6,   xmm3
+  lea           %1,     [%1+%3*2]   ; advance both pointers by two rows
+  lea           %2,     [%2+%3*2]
+%endmacro
 
 ; by comparing it outperforms than phaddw(SSSE3) sets
-%macro SUM_WORD_8x2_SSE2	2	; dst(pSrc), tmp
-	; @sum_8x2 begin
-	pshufd %2, %1, 04Eh	; 01001110 B
-	paddw %1, %2
-	pshuflw %2, %1, 04Eh	; 01001110 B
-	paddw %1, %2
-	pshuflw %2, %1, 0B1h	; 10110001 B
-	paddw %1, %2
-	; end of @sum_8x2
-%endmacro	; END of SUM_WORD_8x2_SSE2
+%macro SUM_WORD_8x2_SSE2        2       ; dst(pSrc), tmp: horizontal sum of the 8 words in dst; tmp is clobbered
+  ; @sum_8x2 begin
+  pshufd %2, %1, 04Eh   ; 01001110 B: swap the two 64-bit halves
+  paddw %1, %2            ; word i now holds w[i] + w[i+4]
+  pshuflw %2, %1, 04Eh  ; 01001110 B: swap the word pairs of the low half
+  paddw %1, %2
+  pshuflw %2, %1, 0B1h  ; 10110001 B: swap adjacent words of the low half
+  paddw %1, %2            ; low four words all hold the total; upper four words are not meaningful
+  ; end of @sum_8x2
+%endmacro       ; END of SUM_WORD_8x2_SSE2
 
-%macro SUM_SQR_SSE2	3	; dst, pSrc, zero
-	movdqa %1, %2
-	punpcklbw %1, %3
-	punpckhbw %2, %3
-	pmaddwd %1, %1
-	pmaddwd %2, %2
-	paddd %1, %2
-	pshufd %2, %1, 04Eh	; 01001110 B
-	paddd %1, %2
-	pshufd %2, %1, 0B1h	; 10110001 B
-	paddd %1, %2
-%endmacro	; END OF SUM_SQR_SSE2
+%macro  WELS_SAD_SUM_SQSUM_16x1_SSE2 3 ;esi:%1,edi:%2,ebx:%3
+  movdqa        xmm1,   [%1]
+  movdqa        xmm2,   [%2]
+  movdqa        xmm3,   xmm1
+  psadbw        xmm3,   xmm2
+  paddd         xmm6,   xmm3
 
-%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
-	movdqa %1, [esi    ]	; line 0
-	movdqa %2, [esi+ecx]	; line 1
-	movdqa %3, %1
-	punpcklbw %1, xmm7
-	punpckhbw %3, xmm7
-	movdqa %4, %2
-	punpcklbw %4, xmm7
-	punpckhbw %2, xmm7
-	paddw %1, %4
-	paddw %2, %3
-	movdqa %3, [esi+ebx]	; line 2
-	movdqa %4, [esi+edx]	; line 3
-	movdqa %5, %3
-	punpcklbw %3, xmm7
-	punpckhbw %5, xmm7
-	movdqa %6, %4
-	punpcklbw %6, xmm7
-	punpckhbw %4, xmm7
-	paddw %3, %6
-	paddw %4, %5
-	paddw %1, %3	; block 0, 1
-	paddw %2, %4	; block 2, 3
-	pshufd %3, %1, 0B1h
-	pshufd %4, %2, 0B1h
-	paddw %1, %3
-	paddw %2, %4
-	movdqa %3, %1
-	movdqa %4, %2
-	pshuflw %5, %1, 0B1h
-	pshufhw %6, %3, 0B1h
-	paddw %1, %5
-	paddw %3, %6
-	pshuflw %5, %2, 0B1h
-	pshufhw %6, %4, 0B1h
-	paddw %2, %5
-	paddw %4, %6
-	punpcklwd %1, %2
-	punpckhwd %3, %4
-	punpcklwd %1, %3
-	psraw %1, $04
-%endmacro
+  movdqa        xmm3,   xmm1
+  psadbw        xmm3,   xmm0
+  paddd         xmm5,   xmm3
 
-%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
-	movdqa %1, [esi    ]	; line 0
-	movdqa %2, [esi+ecx]	; line 1
-	movdqa %3, %1
-	punpcklbw %1, xmm7
-	punpckhbw %3, xmm7
-	movdqa %4, %2
-	punpcklbw %4, xmm7
-	punpckhbw %2, xmm7
-	paddw %1, %4
-	paddw %2, %3
-	movdqa %3, [esi+ebx]	; line 2
-	movdqa %4, [esi+edx]	; line 3
-	movdqa %5, %3
-	punpcklbw %3, xmm7
-	punpckhbw %5, xmm7
-	movdqa %6, %4
-	punpcklbw %6, xmm7
-	punpckhbw %4, xmm7
-	paddw %3, %6
-	paddw %4, %5
-	paddw %1, %3	; block 0, 1
-	paddw %2, %4	; block 2, 3
-	phaddw %1, %2	; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
-	phaddw %1, xmm7	; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
-	psraw %1, $04
-%endmacro
+  movdqa        xmm2,   xmm1
+  punpcklbw     xmm1,   xmm0
+  punpckhbw     xmm2,   xmm0
+  pmaddwd               xmm1,   xmm1
+  pmaddwd               xmm2,   xmm2
+  paddd         xmm4,   xmm1
+  paddd         xmm4,   xmm2
 
-%macro WELS_SAD_16x2_SSE2  0
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	movdqa	xmm3,	[esi+ebx]
-	movdqa	xmm4,	[edi+ebx]
-	psadbw	xmm1,	xmm2
-	psadbw	xmm3,	xmm4
-	paddd	xmm6,	xmm1
-	paddd	xmm6,	xmm3
-	lea		esi,	[esi+ebx*2]
-	lea		edi,	[edi+ebx*2]
+  add           %1,     %3
+  add           %2,     %3
 %endmacro
 
-%macro	WELS_SAD_SUM_SQSUM_16x1_SSE2 0
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	movdqa	xmm3,	xmm1
-	psadbw	xmm3,	xmm2
-	paddd	xmm6,	xmm3
+%macro  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 3 ;esi:%1 edi:%2 ebx:%3
+  movdqa        xmm1,   [%1]
+  movdqa        xmm2,   [%2]
+  movdqa        xmm3,   xmm1
+  psadbw        xmm3,   xmm2
+  paddd         xmm7,   xmm3    ; sad
 
-	movdqa	xmm3,	xmm1
-	psadbw	xmm3,	xmm0
-	paddd	xmm5,	xmm3
+  movdqa        xmm3,   xmm1
+  pmaxub        xmm3,   xmm2
+  pminub        xmm2,   xmm1
+  psubb xmm3,   xmm2    ; diff
 
-	movdqa		xmm2,	xmm1
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm2,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm2,	xmm2
-	paddd		xmm4,	xmm1
-	paddd		xmm4,	xmm2
+  movdqa        xmm2,   xmm1
+  psadbw        xmm2,   xmm0
+  paddd xmm6,   xmm2    ; sum
 
-	add		esi,	ebx
-	add		edi,	ebx
-%endmacro
+  movdqa                xmm2,   xmm1
+  punpcklbw     xmm1,   xmm0
+  punpckhbw     xmm2,   xmm0
+  pmaddwd               xmm1,   xmm1
+  pmaddwd               xmm2,   xmm2
+  paddd         xmm5,   xmm1
+  paddd         xmm5,   xmm2    ; sqsum
 
-%macro	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 0
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	movdqa	xmm3,	xmm1
-	psadbw	xmm3,	xmm2
-	paddd	xmm7,	xmm3	; sad
+  movdqa                xmm1,   xmm3
+  punpcklbw     xmm1,   xmm0
+  punpckhbw     xmm3,   xmm0
+  pmaddwd               xmm1,   xmm1
+  pmaddwd               xmm3,   xmm3
+  paddd         xmm4,   xmm1
+  paddd         xmm4,   xmm3    ; sqdiff
 
-	movdqa	xmm3,	xmm1
-	pmaxub	xmm3,	xmm2
-	pminub	xmm2,	xmm1
-	psubb	xmm3,	xmm2	; diff
-
-	movdqa	xmm2,	xmm1
-	psadbw	xmm2,	xmm0
-	paddd	xmm6,	xmm2	; sum
-
-	movdqa		xmm2,	xmm1
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm2,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm2,	xmm2
-	paddd		xmm5,	xmm1
-	paddd		xmm5,	xmm2	; sqsum
-
-	movdqa		xmm1,	xmm3
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm3,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm3,	xmm3
-	paddd		xmm4,	xmm1
-	paddd		xmm4,	xmm3	; sqdiff
-
-	add		esi,	ebx
-	add		edi,	ebx
+  add           %1,     %3
+  add           %2,     %3
 %endmacro
 
-%macro	WELS_SAD_SD_MAD_16x1_SSE2	4
-%define sad_reg			%1
-%define	sum_cur_reg		%2
-%define sum_ref_reg		%3
-%define	mad_reg			%4
-	movdqa	xmm1,		[esi]
-	movdqa	xmm2,		[edi]
-	movdqa	xmm3,		xmm1
-	psadbw	xmm3,		xmm0
-	paddd	sum_cur_reg,			xmm3	; sum_cur
-	movdqa	xmm3,		xmm2
-	psadbw	xmm3,		xmm0
-	paddd	sum_ref_reg,			xmm3	; sum_ref
+%macro  WELS_SAD_SD_MAD_16x1_SSE2       7 ;esi:%5 edi:%6 ebx:%7
+%define sad_reg                 %1
+%define sum_cur_reg             %2
+%define sum_ref_reg             %3
+%define mad_reg                 %4
+  movdqa        xmm1,           [%5]
+  movdqa        xmm2,           [%6]
+  movdqa        xmm3,           xmm1
+  psadbw        xmm3,           xmm0
+  paddd         sum_cur_reg,    xmm3    ; sum_cur
+  movdqa        xmm3,           xmm2
+  psadbw        xmm3,           xmm0
+  paddd sum_ref_reg,                    xmm3    ; sum_ref
 
-	movdqa	xmm3,		xmm1
-	pmaxub	xmm3,		xmm2
-	pminub	xmm2,		xmm1
-	psubb	xmm3,		xmm2	; abs diff
-	pmaxub	mad_reg,	xmm3	; max abs diff
+  movdqa        xmm3,           xmm1
+  pmaxub        xmm3,           xmm2
+  pminub        xmm2,           xmm1
+  psubb xmm3,           xmm2    ; abs diff
+  pmaxub        mad_reg,        xmm3    ; max abs diff
 
-	psadbw	xmm3,		xmm0
-	paddd	sad_reg,	xmm3	; sad
+  psadbw        xmm3,           xmm0
+  paddd sad_reg,        xmm3    ; sad
 
-	add			esi,		ebx
-	add			edi,		ebx
+  add                   %5,             %7
+  add                   %6,             %7
 %endmacro
 
 
-%macro	WELS_MAX_REG_SSE2	1	; xmm1, xmm2, xmm3 can be used
+%macro  WELS_MAX_REG_SSE2       1       ; xmm1, xmm2, xmm3 can be used
 %define max_reg  %1
-	movdqa	xmm1,		max_reg
-	psrldq	xmm1,		4
-	pmaxub	max_reg,	xmm1
-	movdqa	xmm1,		max_reg
-	psrldq	xmm1,		2
-	pmaxub	max_reg,	xmm1
-	movdqa	xmm1,		max_reg
-	psrldq	xmm1,		1
-	pmaxub	max_reg,	xmm1
+  movdqa        xmm1,           max_reg
+  psrldq        xmm1,           4
+  pmaxub        max_reg,        xmm1
+  movdqa        xmm1,           max_reg
+  psrldq        xmm1,           2
+  pmaxub        max_reg,        xmm1
+  movdqa        xmm1,           max_reg
+  psrldq        xmm1,           1
+  pmaxub        max_reg,        xmm1
 %endmacro
 
-%macro	WELS_SAD_BGD_SQDIFF_16x1_SSE2	4
-%define sad_reg		%1
-%define	sum_reg		%2
-%define mad_reg		%3
-%define sqdiff_reg	%4
-	movdqa		xmm1,		[esi]
-	movdqa		xmm2,		xmm1
-	movdqa		xmm3,		xmm1
-	punpcklbw	xmm2,		xmm0
-	punpckhbw	xmm3,		xmm0
-	pmaddwd		xmm2,		xmm2
-	pmaddwd		xmm3,		xmm3
-	paddd		xmm2,		xmm3
-	movdqa		xmm3,		xmm2
-	psllq		xmm2,		32
-	psrlq		xmm3,		32
-	psllq		xmm3,		32
-	paddd		xmm2,		xmm3
-	paddd		sad_reg,	xmm2		; sqsum
+%macro  WELS_SAD_BGD_SQDIFF_16x1_SSE2   7 ;esi:%5 edi:%6 ebx:%7
+%define sad_reg         %1
+%define sum_reg         %2
+%define mad_reg         %3
+%define sqdiff_reg      %4
+  movdqa                xmm1,           [%5]
+  movdqa                xmm2,           xmm1
+  movdqa                xmm3,           xmm1
+  punpcklbw     xmm2,           xmm0
+  punpckhbw     xmm3,           xmm0
+  pmaddwd               xmm2,           xmm2
+  pmaddwd               xmm3,           xmm3
+  paddd         xmm2,           xmm3
+  movdqa                xmm3,           xmm2
+  psllq         xmm2,           32
+  psrlq         xmm3,           32
+  psllq         xmm3,           32
+  paddd         xmm2,           xmm3
+  paddd         sad_reg,        xmm2            ; sqsum
 
-	movdqa	xmm2,		[edi]
-	movdqa	xmm3,		xmm1
-	psadbw	xmm3,		xmm0
-	paddd	sum_reg,			xmm3	; sum_cur
-	movdqa	xmm3,		xmm2
-	psadbw	xmm3,		xmm0
-	pslldq	xmm3,		4
-	paddd	sum_reg,			xmm3	; sum_ref
+  movdqa        xmm2,           [%6]
+  movdqa        xmm3,           xmm1
+  psadbw        xmm3,           xmm0
+  paddd sum_reg,                        xmm3    ; sum_cur
+  movdqa        xmm3,           xmm2
+  psadbw        xmm3,           xmm0
+  pslldq        xmm3,           4
+  paddd sum_reg,                        xmm3    ; sum_ref
 
-	movdqa	xmm3,		xmm1
-	pmaxub	xmm3,		xmm2
-	pminub	xmm2,		xmm1
-	psubb	xmm3,		xmm2	; abs diff
-	pmaxub	mad_reg,	xmm3	; max abs diff
+  movdqa        xmm3,           xmm1
+  pmaxub        xmm3,           xmm2
+  pminub        xmm2,           xmm1
+  psubb xmm3,           xmm2    ; abs diff
+  pmaxub        mad_reg,        xmm3    ; max abs diff
 
-	movdqa	xmm1,		xmm3
-	psadbw	xmm3,		xmm0
-	paddd	sad_reg,	xmm3	; sad
+  movdqa        xmm1,           xmm3
+  psadbw        xmm3,           xmm0
+  paddd sad_reg,        xmm3    ; sad
 
-	movdqa		xmm3,	xmm1
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm3,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm3,	xmm3
-	paddd		sqdiff_reg,	xmm1
-	paddd		sqdiff_reg,	xmm3	; sqdiff
+  movdqa                xmm3,   xmm1
+  punpcklbw     xmm1,   xmm0
+  punpckhbw     xmm3,   xmm0
+  pmaddwd               xmm1,   xmm1
+  pmaddwd               xmm3,   xmm3
+  paddd         sqdiff_reg,     xmm1
+  paddd         sqdiff_reg,     xmm3    ; sqdiff
 
-	add		esi,	ebx
-	add		edi,	ebx
+  add           %5,     %7
+  add           %6,     %7
 %endmacro
 
 
@@ -325,7 +245,7 @@
 
 ;ALIGN 16
 ;pack1_8x2:
-;	dw 1, 1, 1, 1, 1, 1, 1, 1
+;       dw 1, 1, 1, 1, 1, 1, 1, 1
 
 ;***********************************************************************
 ; Code
@@ -333,1082 +253,1805 @@
 
 SECTION .text
 
-WELS_EXTERN rc_sad_frame_sse2
-;***********************************************************************
-;	uint32_t rc_sad_frame_sse2(	uint8_t *ref_orig, uint8_t *cur_orig, const int mb_width, const int iPicHeight, const int iPicStride );
-;***********************************************************************
-ALIGN 16
-rc_sad_frame_sse2:
-	push esi
-	push edi
-	push ebp
-	push ebx
-	push edx
+%ifdef X86_32
 
-	mov esi, [esp+24]
-	mov edi, [esp+28]
-	mov ebx, [esp+32]
-	mov ecx, [esp+36]
-	mov edx, [esp+40]
-	pxor xmm0, xmm0
-.hloop:
-	mov eax, ebx
-	mov ebp, $00
-.wloop:
-	movdqa xmm1, [esi+ebp]
-	movdqa xmm2, [edi+ebp]
-	psadbw xmm1, xmm2
-	pshufd xmm2, xmm1, 0f6h	; 11110110 B ; movhlps for float
-	paddd xmm1, xmm2
-	paddd xmm0, xmm1
-	add ebp, 010h
-	dec eax
-	jnz near .wloop
-	lea esi, [esi+edx]
-	lea edi, [edi+edx]
-	dec ecx
-	jnz near .hloop
-
-	movd eax, xmm0
-	pop edx
-	pop ebx
-	pop ebp
-	pop edi
-	pop esi
-	ret
-
-
 WELS_EXTERN SampleVariance16x16_sse2
 ;***********************************************************************
-;   void SampleVariance16x16_sse2(	uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
+;   void SampleVariance16x16_sse2(      uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
 ;***********************************************************************
 ALIGN 16
 SampleVariance16x16_sse2:
-	push esi
-	push edi
-	push ebx
+  push esi
+  push edi
+  push ebx
 
-	sub esp, 16
-	%define SUM			[esp]
-	%define SUM_CUR		[esp+4]
-	%define SQR			[esp+8]
-	%define SQR_CUR		[esp+12]
-	%define PUSH_SIZE	28	; 12 + 16
+  sub esp, 16                   ; 16 bytes of locals: the four dword accumulators defined below
+  %define SUM                   [esp]
+  %define SUM_CUR               [esp+4]
+  %define SQR                   [esp+8]
+  %define SQR_CUR               [esp+12]
+  %define PUSH_SIZE     28      ; 12 + 16
 
-	mov edi, [esp+PUSH_SIZE+4]	; y_ref
-	mov edx, [esp+PUSH_SIZE+8]	; y_ref_stride
-	mov esi, [esp+PUSH_SIZE+12]	; y_src
-	mov eax, [esp+PUSH_SIZE+16]	; y_src_stride
-	mov ecx, 010h				; height = 16
+  mov edi, [esp+PUSH_SIZE+4]    ; y_ref
+  mov edx, [esp+PUSH_SIZE+8]    ; y_ref_stride
+  mov esi, [esp+PUSH_SIZE+12]   ; y_src
+  mov eax, [esp+PUSH_SIZE+16]   ; y_src_stride
+  mov ecx, 010h                         ; height = 16
 
-	pxor xmm7, xmm7
-	movdqu SUM, xmm7
+  pxor xmm7, xmm7               ; xmm7 = 0, passed as the zero operand to the macros below
+  movdqu SUM, xmm7              ; one 16-byte store clears SUM, SUM_CUR, SQR and SQR_CUR
 
 .hloops:
-	movdqa xmm0, [edi]		; y_ref
-	movdqa xmm1, [esi]		; y_src
-	movdqa xmm2, xmm0		; store first for future process
-	movdqa xmm3, xmm1
-	; sum += diff;
-	movdqa xmm4, xmm0
-	psadbw xmm4, xmm1		; 2 parts, [0,..,15], [64,..,79]
-	; to be continued for sum
-	pshufd xmm5, xmm4, 0C6h	; 11000110 B
-	paddw xmm4, xmm5
-	movd ebx, xmm4
-	add SUM, ebx
+  movdqa xmm0, [edi]            ; y_ref
+  movdqa xmm1, [esi]            ; y_src
+  movdqa xmm2, xmm0             ; store first for future process
+  movdqa xmm3, xmm1
+  ; sum += diff;
+  movdqa xmm4, xmm0
+  psadbw xmm4, xmm1             ; 2 parts, [0,..,15], [64,..,79]
+  ; to be continued for sum
+  pshufd xmm5, xmm4, 0C6h       ; 11000110 B
+  paddw xmm4, xmm5              ; low dword = SAD of low 8 bytes + SAD of high 8 bytes
+  movd ebx, xmm4
+  add SUM, ebx
 
-	; sqr += diff * diff;
-	pmaxub xmm0, xmm1
-	pminub xmm1, xmm2
-	psubb xmm0, xmm1				; diff
-	SUM_SQR_SSE2 xmm1, xmm0, xmm7	; dst, pSrc, zero
-	movd ebx, xmm1
-	add SQR, ebx
+  ; sqr += diff * diff;
+  pmaxub xmm0, xmm1
+  pminub xmm1, xmm2
+  psubb xmm0, xmm1                              ; diff
+  SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
+  movd ebx, xmm1
+  add SQR, ebx
 
-	; sum_cur += y_src[x];
-	movdqa xmm0, xmm3		; cur_orig
-	movdqa xmm1, xmm0
-	punpcklbw xmm0, xmm7
-	punpckhbw xmm1, xmm7
-	paddw xmm0, xmm1		; 8x2
-	SUM_WORD_8x2_SSE2 xmm0, xmm1
-	movd ebx, xmm0
-	and ebx, 0ffffh
-	add SUM_CUR, ebx
+  ; sum_cur += y_src[x];
+  movdqa xmm0, xmm3             ; cur_orig
+  movdqa xmm1, xmm0
+  punpcklbw xmm0, xmm7
+  punpckhbw xmm1, xmm7
+  paddw xmm0, xmm1              ; 8x2
+  SUM_WORD_8x2_SSE2 xmm0, xmm1
+  movd ebx, xmm0
+  and ebx, 0ffffh               ; keep only the low 16 bits of the reduced value
+  add SUM_CUR, ebx
 
-	; sqr_cur += y_src[x] * y_src[x];
-	SUM_SQR_SSE2 xmm0, xmm3, xmm7	; dst, pSrc, zero
-	movd ebx, xmm0
-	add SQR_CUR, ebx
+  ; sqr_cur += y_src[x] * y_src[x];
+  SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
+  movd ebx, xmm0
+  add SQR_CUR, ebx
 
-	lea edi, [edi+edx]
-	lea esi, [esi+eax]
-	dec ecx
-	jnz near .hloops
+  lea edi, [edi+edx]            ; y_ref += y_ref_stride
+  lea esi, [esi+eax]            ; y_src += y_src_stride
+  dec ecx
+  jnz near .hloops
 
-	mov ebx, 0
-	mov bx, word SUM
-	sar ebx, 8
-	imul ebx, ebx
-	mov ecx, SQR
-	sar ecx, 8
-	sub ecx, ebx
-	mov edi, [esp+PUSH_SIZE+20]	; pMotionTexture
-	mov [edi], cx				; to store uiMotionIndex
-	mov ebx, 0
-	mov bx, word SUM_CUR
-	sar ebx, 8
-	imul ebx, ebx
-	mov ecx, SQR_CUR
-	sar ecx, 8
-	sub ecx, ebx
-	mov [edi+2], cx				; to store uiTextureIndex
+  mov ebx, 0
+  mov bx, word SUM              ; ebx = low 16 bits of SUM, zero-extended
+  sar ebx, 8
+  imul ebx, ebx                 ; (SUM >> 8)^2
+  mov ecx, SQR
+  sar ecx, 8
+  sub ecx, ebx                  ; (SQR >> 8) - (SUM >> 8)^2
+  mov edi, [esp+PUSH_SIZE+20]   ; pMotionTexture
+  mov [edi], cx                         ; to store uiMotionIndex
+  mov ebx, 0
+  mov bx, word SUM_CUR          ; ebx = low 16 bits of SUM_CUR, zero-extended
+  sar ebx, 8
+  imul ebx, ebx                 ; (SUM_CUR >> 8)^2
+  mov ecx, SQR_CUR
+  sar ecx, 8
+  sub ecx, ebx                  ; (SQR_CUR >> 8) - (SUM_CUR >> 8)^2
+  mov [edi+2], cx                               ; to store uiTextureIndex
 
-	%undef SUM
-	%undef SUM_CUR
-	%undef SQR
-	%undef SQR_CUR
-	%undef PUSH_SIZE
+  %undef SUM
+  %undef SUM_CUR
+  %undef SQR
+  %undef SQR_CUR
+  %undef PUSH_SIZE
 
-	add esp, 16
-	pop ebx
-	pop edi
-	pop esi
+  add esp, 16                   ; release the accumulator area
+  pop ebx
+  pop edi
+  pop esi
 
-	ret
+  ret
 
-; , 6/7/2010
 
 
-WELS_EXTERN abs_difference_mbrow_sse2
+WELS_EXTERN VAACalcSad_sse2
 ;*************************************************************************************************************
-;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride,
-;								 int32_t gom_pixel_num, int32_t *pSum)
+;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                                                               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
 ;*************************************************************************************************************
+; 32-bit build. Each width_loop pass stores four dwords through the psad8x8 cursor; the total goes to *psadframe.
+
 ALIGN 16
-abs_difference_mbrow_sse2:
-%define		ref_orig			esp + pushsize + 4
-%define		cur_orig			esp + pushsize + 8
-%define		iPicStride			esp + pushsize + 12
-%define		gom_pixel_num		esp + pushsize + 16
-%define		pSum				esp + pushsize + 20
-%define		pushsize	12
-	push	esi
-	push	edi
-	push	ebx
-	mov		esi,	[ref_orig]
-	mov		edi,	[cur_orig]
-	mov		ebx,	[iPicStride]
-	mov		eax,	[gom_pixel_num]
-	mov		ecx,	16					;MB_WIDTH_LUMA
-	pxor	xmm0,	xmm0
-mb_width_loop_p:
-	mov		edx,	esi
-	add		edx,	eax			; end address
-gom_row_loop_p:
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	psadbw	xmm1,	xmm2
-	paddd	xmm0,	xmm1
-	add		esi,	16
-	add		edi,	16
-	cmp		esi,	edx
-	jl		gom_row_loop_p
+VAACalcSad_sse2:
+%define         cur_data                        esp + pushsize + 4
+%define         ref_data                        esp + pushsize + 8
+%define         iPicWidth                       esp + pushsize + 12
+%define         iPicHeight                      esp + pushsize + 16
+%define         iPicStride                      esp + pushsize + 20
+%define         psadframe                       esp + pushsize + 24
+%define         psad8x8                         esp + pushsize + 28
+%define         pushsize        12
+  push  esi
+  push  edi
+  push  ebx
+  mov           esi,    [cur_data]
+  mov           edi,    [ref_data]
+  mov           ebx,    [iPicStride]
+  mov           edx,    [psad8x8]       ; edx = psad8x8 write cursor
+  mov           eax,    ebx
 
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	ebx
-	add		edi,	ebx
-	loop	mb_width_loop_p
+  shr           dword [iPicWidth],      4                                       ; iPicWidth/16
+  shr           dword [iPicHeight],     4                                       ; iPicHeight/16
+  shl           eax,    4                                                               ; iPicStride*16
+  pxor  xmm0,   xmm0
+  pxor  xmm7,   xmm7            ; iFrameSad
+height_loop:
+  mov           ecx,    dword [iPicWidth]       ; ecx = inner loop counter (iPicWidth/16)
+  push  esi                     ; saved here, restored after width_loop
+  push  edi
+width_loop:
+  pxor  xmm6,   xmm6            ; cleared here; dwords 0 and 2 are stored to [edx] and [edx+4] below
+  WELS_SAD_16x2_SSE2 esi,edi,ebx
+  WELS_SAD_16x2_SSE2 esi,edi,ebx
+  WELS_SAD_16x2_SSE2 esi,edi,ebx
+  WELS_SAD_16x2_SSE2 esi,edi,ebx
+  paddd xmm7,           xmm6
+  movd  [edx],          xmm6
+  psrldq        xmm6,           8
+  movd  [edx+4],        xmm6
 
-	movdqa	xmm1,	xmm0
-	psrldq	xmm1,	8
-	paddd	xmm1,	xmm0
-	movd	eax,	xmm1
-	mov		edx,	[pSum]	; pSum
-	add		[edx],	eax
+  pxor  xmm6,   xmm6            ; same again for [edx+8] and [edx+12]
+  WELS_SAD_16x2_SSE2 esi,edi,ebx
+  WELS_SAD_16x2_SSE2 esi,edi,ebx
+  WELS_SAD_16x2_SSE2 esi,edi,ebx
+  WELS_SAD_16x2_SSE2 esi,edi,ebx
+  paddd xmm7,           xmm6
+  movd  [edx+8],        xmm6
+  psrldq        xmm6,           8
+  movd  [edx+12],       xmm6
 
-%undef		ref_orig
-%undef		cur_orig
-%undef		iPicStride
-%undef		gom_pixel_num
-%undef		pSum
-%undef		pushsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	ret
+  add           edx,    16      ; four dwords were written in this pass
+  sub           esi,    eax     ; back up iPicStride*16 bytes (assumes the eight macro calls advanced 16 rows - TODO confirm)
+  sub           edi,    eax
+  add           esi,    16      ; then move 16 bytes to the right
+  add           edi,    16
 
+  dec           ecx
+  jnz           width_loop
 
+  pop           edi
+  pop           esi
+  add           esi,    eax     ; saved pointer + iPicStride*16
+  add           edi,    eax
 
+  dec   dword [iPicHeight]      ; outer counter lives in the argument's own stack slot
+  jnz           height_loop
 
-WELS_EXTERN sum_sqrsum_mbrow_sse2
-;*************************************************************************************************************
-;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride,
-;							 int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum)
-;*************************************************************************************************************
+  mov           edx,    [psadframe]
+  movdqa        xmm5,   xmm7
+  psrldq        xmm7,   8
+  paddd xmm7,   xmm5            ; low dword = dword 0 + dword 2 of the accumulated SAD
+  movd  [edx],  xmm7
+
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          pushsize
+  pop           ebx
+  pop           edi
+  pop           esi
+  ret
+
+%else  ;64-bit
+
+WELS_EXTERN SampleVariance16x16_sse2
+;***********************************************************************
+;   void SampleVariance16x16_sse2(      uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
+;***********************************************************************
 ALIGN 16
-sum_sqrsum_mbrow_sse2:
-%define		cur_orig			esp + pushsize + 4
-%define		iPicStride			esp + pushsize + 8
-%define		gom_pixel_num		esp + pushsize + 12
-%define		pSum				esp + pushsize + 16
-%define		pSqrSum				esp + pushsize + 20
-%define		pushsize			8
-	push		esi
-	push		ebx
-	mov			esi,	[cur_orig]
-	mov			eax,	[gom_pixel_num]
-	mov			ebx,	[iPicStride]
-	mov			ecx,	16					;MB_WIDTH_LUMA
-	pxor		xmm0,	xmm0				; zero
-	pxor		xmm1,	xmm1				; sum
-	pxor		xmm2,	xmm2				; sqr sum
-mb_width_loop_i:
-	mov			edx,	esi
-	add			edx,	eax			; end address
-gom_row_loop_i:
-	movdqa		xmm3,	[esi]
-	movdqa		xmm4,	xmm3
-	psadbw		xmm4,	xmm0
-	paddd		xmm1,	xmm4
-	movdqa		xmm4,	xmm3
-	punpcklbw	xmm4,	xmm0
-	punpckhbw	xmm3,	xmm0
-	pmaddwd		xmm4,	xmm4
-	pmaddwd		xmm3,	xmm3
-	paddd		xmm2,	xmm3
-	paddd		xmm2,	xmm4
-	add			esi,	16
-	cmp			esi,	edx
-	jl			gom_row_loop_i
+SampleVariance16x16_sse2:
+  %define SUM                   r10;[esp]
+  %define SUM_CUR               r11;[esp+4]
+  %define SQR                   r13;[esp+8]
+  %define SQR_CUR               r15;[esp+12]
 
-	sub			esi,	eax
-	add			esi,	ebx
-	loop		mb_width_loop_i
+  push r12
+  push r13
+  push r14
+  push r15
+  %assign push_num 4            ; four pushes so far; presumably consumed by LOAD_5_PARA (defined elsewhere)
+  LOAD_5_PARA
+  SIGN_EXTENTION r1,r1d
+  SIGN_EXTENTION r3,r3d
 
-	movdqa		xmm3,	xmm1
-	psrldq		xmm3,	8
-	paddd		xmm1,	xmm3
-	movd		eax,	xmm1
-	mov			edx,	[pSum]
-	add			[edx],	eax
+  mov r12,010h                  ; row counter = 16
+  pxor xmm7, xmm7               ; zero operand; NOTE(review): xmm7 is callee-saved under the Microsoft x64 ABI and is not preserved here - confirm
+  movq SUM, xmm7                ; clear the four register accumulators
+  movq SUM_CUR,xmm7
+  movq SQR,xmm7
+  movq SQR_CUR,xmm7
 
-	movdqa		xmm3,	xmm2
-	psrldq		xmm3,	8
-	paddd		xmm2,	xmm3
-	movdqa		xmm3,	xmm2
-	psrldq		xmm3,	4
-	paddd		xmm2,	xmm3
-	movd		eax,	xmm2
-	mov			edx,	[pSqrSum]
-	add			[edx],	eax
+.hloops:
+  mov r14,0
+  movdqa xmm0, [r0]             ; y_ref
+  movdqa xmm1, [r2]             ; y_src
+  movdqa xmm2, xmm0             ; store first for future process
+  movdqa xmm3, xmm1
+  ; sum += diff;
+  movdqa xmm4, xmm0
+  psadbw xmm4, xmm1             ; 2 parts, [0,..,15], [64,..,79]
+  ; to be continued for sum
+  pshufd xmm5, xmm4, 0C6h       ; 11000110 B
+  paddw xmm4, xmm5              ; low dword = SAD of low 8 bytes + SAD of high 8 bytes
+  movd r14d, xmm4               ; 32-bit write zero-extends into r14
+  add SUM, r14
 
+  ; sqr += diff * diff;
+  pmaxub xmm0, xmm1
+  pminub xmm1, xmm2
+  psubb xmm0, xmm1                              ; diff
+  SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
+  movd r14d, xmm1
+  add SQR, r14
 
-%undef		cur_orig
-%undef		iPicStride
-%undef		gom_pixel_num
-%undef		pSum
-%undef		pSqrSum
-%undef		pushsize
-	pop			ebx
-	pop			esi
-	ret
+  ; sum_cur += y_src[x];
+  movdqa xmm0, xmm3             ; cur_orig
+  movdqa xmm1, xmm0
+  punpcklbw xmm0, xmm7
+  punpckhbw xmm1, xmm7
+  paddw xmm0, xmm1              ; 8x2
+  SUM_WORD_8x2_SSE2 xmm0, xmm1
+  movd r14d, xmm0
+  and r14, 0ffffh               ; keep only the low 16 bits of the reduced value
+  add SUM_CUR, r14
 
+  ; sqr_cur += y_src[x] * y_src[x];
+  SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
+  movd r14d, xmm0
+  add SQR_CUR, r14
 
+  lea r0, [r0+r1]               ; y_ref += y_ref_stride
+  lea r2, [r2+r3]               ; y_src += y_src_stride
+  dec r12
+  jnz near .hloops
 
+  mov r0, SUM
+  sar r0, 8
+  imul r0, r0                   ; (SUM >> 8)^2
+  mov r1, SQR
+  sar r1, 8
+  sub r1, r0                    ; (SQR >> 8) - (SUM >> 8)^2
+  mov [r4], r1w                         ; to store uiMotionIndex
+  mov r0, SUM_CUR
+  sar r0, 8
+  imul r0, r0                   ; (SUM_CUR >> 8)^2
+  mov r1, SQR_CUR
+  sar r1, 8
+  sub r1, r0                    ; (SQR_CUR >> 8) - (SUM_CUR >> 8)^2
+  mov [r4+2], r1w                               ; to store uiTextureIndex
+
+  LOAD_5_PARA_POP
+  pop r15
+  pop r14
+  pop r13
+  pop r12
+
+
+  %assign push_num 0
+
+  ret
+
+
 WELS_EXTERN VAACalcSad_sse2
 ;*************************************************************************************************************
 ;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-;								int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
+;                                                               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
 ;*************************************************************************************************************
 
 
 ALIGN 16
 VAACalcSad_sse2:
-%define		cur_data			esp + pushsize + 4
-%define		ref_data			esp + pushsize + 8
-%define		iPicWidth			esp + pushsize + 12
-%define		iPicHeight			esp + pushsize + 16
-%define		iPicStride			esp + pushsize + 20
-%define		psadframe			esp + pushsize + 24
-%define		psad8x8				esp + pushsize + 28
-%define		pushsize	12
-	push	esi
-	push	edi
-	push	ebx
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		edx,	[psad8x8]
-	mov		eax,	ebx
+%define         cur_data                        r0
+%define         ref_data                        r1
+%define         iPicWidth                       r2
+%define         iPicHeight              r3
+%define         iPicStride              r4
+%define         psadframe                       r5
+%define         psad8x8                         r6
 
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4								; iPicStride*16
-	pxor	xmm0,	xmm0
-	pxor	xmm7,	xmm7		; iFrameSad
+  push r12
+  push r13
+  %assign push_num 2            ; two pushes so far; presumably consumed by LOAD_7_PARA (defined elsewhere)
+  LOAD_7_PARA
+  SIGN_EXTENTION r2,r2d
+  SIGN_EXTENTION r3,r3d
+  SIGN_EXTENTION r4,r4d
+  ; NOTE(review): xmm6/xmm7 are written below without being saved; they are callee-saved under the Microsoft x64 ABI - confirm
+  mov   r12,r4
+  shr           r2,     4                                       ; iPicWidth/16
+  shr           r3,     4                                       ; iPicHeight/16
+
+  shl           r12,    4                                                               ; iPicStride*16
+  pxor  xmm0,   xmm0
+  pxor  xmm7,   xmm7            ; iFrameSad
 height_loop:
-	mov		ecx,	dword [iPicWidth]
-	push	esi
-	push	edi
+  mov           r13,    r2      ; r13 = inner loop counter (iPicWidth/16)
+  push  r0                      ; saved here, restored after width_loop
+  push  r1
 width_loop:
-	pxor	xmm6,	xmm6		;
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx],		xmm6
-	psrldq	xmm6,		8
-	movd	[edx+4],	xmm6
+  pxor  xmm6,   xmm6            ; cleared here; dwords 0 and 2 are stored to [r6] and [r6+4] below
+  WELS_SAD_16x2_SSE2 r0,r1,r4
+  WELS_SAD_16x2_SSE2 r0,r1,r4
+  WELS_SAD_16x2_SSE2 r0,r1,r4
+  WELS_SAD_16x2_SSE2 r0,r1,r4
+  paddd xmm7,           xmm6
+  movd  [r6],           xmm6
+  psrldq        xmm6,           8
+  movd  [r6+4], xmm6
 
-	pxor	xmm6,	xmm6
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx+8],	xmm6
-	psrldq	xmm6,		8
-	movd	[edx+12],	xmm6
+  pxor  xmm6,   xmm6            ; same again for [r6+8] and [r6+12]
+  WELS_SAD_16x2_SSE2 r0,r1,r4
+  WELS_SAD_16x2_SSE2 r0,r1,r4
+  WELS_SAD_16x2_SSE2 r0,r1,r4
+  WELS_SAD_16x2_SSE2 r0,r1,r4
+  paddd xmm7,           xmm6
+  movd  [r6+8], xmm6
+  psrldq        xmm6,           8
+  movd  [r6+12],        xmm6
 
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
+  add           r6,     16      ; four dwords were written in this pass
+  sub           r0,     r12     ; back up iPicStride*16 bytes (assumes the eight macro calls advanced 16 rows - TODO confirm)
+  sub           r1,     r12
+  add           r0,     16      ; then move 16 bytes to the right
+  add           r1,     16
 
-	dec		ecx
-	jnz		width_loop
+  dec           r13
+  jnz           width_loop
 
-	pop		edi
-	pop		esi
-	add		esi,	eax
-	add		edi,	eax
+  pop           r1
+  pop           r0
+  add           r0,     r12     ; saved pointer + iPicStride*16
+  add           r1,     r12
 
-	dec	dword [iPicHeight]
-	jnz		height_loop
+  dec   r3
+  jnz           height_loop
 
-	mov		edx,	[psadframe]
-	movdqa	xmm5,	xmm7
-	psrldq	xmm7,	8
-	paddd	xmm7,	xmm5
-	movd	[edx],	xmm7
+  ;mov          r13,    [psadframe]
+  movdqa        xmm5,   xmm7
+  psrldq        xmm7,   8
+  paddd xmm7,   xmm5            ; low dword = dword 0 + dword 2 of the accumulated SAD
+  movd  [psadframe],    xmm7
 
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		pushsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	ret
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          pushsize
+  LOAD_7_PARA_POP
+  pop r13
+  pop r12
+  %assign push_num 0
+  ret
 
+%endif
 
+
+%ifdef X86_32
 WELS_EXTERN VAACalcSadVar_sse2
 ;*************************************************************************************************************
 ;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-;		int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
+;               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
 ;*************************************************************************************************************
 
 
 ALIGN 16
 VAACalcSadVar_sse2:
-%define		localsize		8
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		psum16x16			esp + pushsize + localsize + 32
-%define		psqsum16x16			esp + pushsize + localsize + 36
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		edx,	[psad8x8]
-	mov		eax,	ebx
+%define         localsize               8
+%define         cur_data                        esp + pushsize + localsize + 4
+%define         ref_data                        esp + pushsize + localsize + 8
+%define         iPicWidth                       esp + pushsize + localsize + 12
+%define         iPicHeight                      esp + pushsize + localsize + 16
+%define         iPicStride                      esp + pushsize + localsize + 20
+%define         psadframe                       esp + pushsize + localsize + 24
+%define         psad8x8                         esp + pushsize + localsize + 28
+%define         psum16x16                       esp + pushsize + localsize + 32
+%define         psqsum16x16                     esp + pushsize + localsize + 36
+%define         tmp_esi                         esp + 0
+%define         tmp_edi                         esp + 4
+%define         pushsize                16
+  push  ebp
+  push  esi
+  push  edi
+  push  ebx
+  sub           esp,    localsize       ; two dword locals: tmp_esi and tmp_edi
+  mov           esi,    [cur_data]
+  mov           edi,    [ref_data]
+  mov           ebx,    [iPicStride]
+  mov           edx,    [psad8x8]       ; edx = psad8x8 write cursor
+  mov           eax,    ebx
 
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	pxor	xmm0,	xmm0
-	pxor	xmm7,	xmm7		; iFrameSad
+  shr           dword [iPicWidth],      4                                       ; iPicWidth/16
+  shr           dword [iPicHeight],     4                                       ; iPicHeight/16
+  shl           eax,    4                                                       ; iPicStride*16
+  pxor  xmm0,   xmm0
+  pxor  xmm7,   xmm7            ; iFrameSad
 var_height_loop:
-	mov		ecx,	dword [iPicWidth]
-	mov		[tmp_esi],	esi
-	mov		[tmp_edi],	edi
+  mov           ecx,    dword [iPicWidth]       ; ecx = inner loop counter (iPicWidth/16)
+  mov           [tmp_esi],      esi             ; saved here, reloaded after var_width_loop
+  mov           [tmp_edi],      edi
 var_width_loop:
-	pxor	xmm6,	xmm6		; hiQuad_loQuad pSad8x8
-	pxor	xmm5,	xmm5		; pSum16x16
-	pxor	xmm4,	xmm4		; sqsum_16x16
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx],		xmm6
-	psrldq	xmm6,		8
-	movd	[edx+4],	xmm6
+  pxor  xmm6,   xmm6            ; hiQuad_loQuad pSad8x8
+  pxor  xmm5,   xmm5            ; pSum16x16
+  pxor  xmm4,   xmm4            ; sqsum_16x16
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  paddd xmm7,           xmm6
+  movd  [edx],          xmm6
+  psrldq        xmm6,           8
+  movd  [edx+4],        xmm6
 
-	pxor	xmm6,	xmm6
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx+8],	xmm6
-	psrldq	xmm6,		8
-	movd	[edx+12],	xmm6
+  pxor  xmm6,   xmm6            ; cleared again; stored to [edx+8] and [edx+12] below
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+  paddd xmm7,           xmm6
+  movd  [edx+8],        xmm6
+  psrldq        xmm6,           8
+  movd  [edx+12],       xmm6
 
-	mov		ebp,	[psum16x16]
-	movdqa	xmm1,	xmm5
-	psrldq	xmm1,	8
-	paddd	xmm5,	xmm1
-	movd	[ebp],	xmm5
-	add		dword [psum16x16], 4
+  mov           ebp,    [psum16x16]     ; ebp = current psum16x16 write pointer
+  movdqa        xmm1,   xmm5
+  psrldq        xmm1,   8
+  paddd xmm5,   xmm1            ; low dword = dword 0 + dword 2 of xmm5
+  movd  [ebp],  xmm5
+  add           dword [psum16x16], 4    ; advance the pointer held in the argument slot
 
-	movdqa	xmm5,	xmm4
-	psrldq	xmm5,	8
-	paddd	xmm4,	xmm5
-	movdqa	xmm3,	xmm4
-	psrldq	xmm3,	4
-	paddd	xmm4,	xmm3
+  movdqa        xmm5,   xmm4
+  psrldq        xmm5,   8
+  paddd xmm4,   xmm5
+  movdqa        xmm3,   xmm4
+  psrldq        xmm3,   4
+  paddd xmm4,   xmm3            ; low dword = sum of all four dwords of xmm4
+
+  mov           ebp,    [psqsum16x16]   ; ebp = current psqsum16x16 write pointer
+  movd  [ebp],  xmm4
+  add           dword [psqsum16x16], 4  ; advance the pointer held in the argument slot
+
+  add           edx,    16      ; four psad8x8 dwords were written in this pass
+  sub           esi,    eax     ; back up iPicStride*16 bytes (assumes the sixteen macro calls advanced 16 rows - TODO confirm)
+  sub           edi,    eax
+  add           esi,    16      ; then move 16 bytes to the right
+  add           edi,    16
+
+  dec           ecx
+  jnz           var_width_loop
+
+  mov           esi,    [tmp_esi]
+  mov           edi,    [tmp_edi]
+  add           esi,    eax     ; saved pointer + iPicStride*16
+  add           edi,    eax
+
+  dec   dword [iPicHeight]      ; outer counter lives in the argument's own stack slot
+  jnz           var_height_loop
+
+  mov           edx,    [psadframe]
+  movdqa        xmm5,   xmm7
+  psrldq        xmm7,   8
+  paddd xmm7,   xmm5            ; low dword = dword 0 + dword 2 of the accumulated SAD
+  movd  [edx],  xmm7
+
+  add           esp,    localsize
+  pop           ebx
+  pop           edi
+  pop           esi
+  pop           ebp
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          psum16x16
+%undef          psqsum16x16
+%undef          tmp_esi
+%undef          tmp_edi
+%undef          pushsize
+%undef          localsize
+  ret
+
+%else  ;64-bit
+
+WELS_EXTERN VAACalcSadVar_sse2
+;*************************************************************************************************************
+;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
+;*************************************************************************************************************
+; 64-bit build. Each var_width_loop pass stores four dwords through the psad8x8 cursor and one dword each through
+; psum16x16 and psqsum16x16; the accumulated SAD is stored to *psadframe once both loops finish.
+ALIGN 16
+VAACalcSadVar_sse2:
+%define         cur_data                        arg1 ;r0
+%define         ref_data                        arg2 ;r1
+%define         iPicWidth                       arg3 ;r2
+%define         iPicHeight                  arg4 ;r3
+%define         iPicStride                  arg5
+%define         psadframe                       arg6
+%define         psad8x8                         arg7
+%define         psum16x16                       arg8
+%define         psqsum16x16                 arg9
+
+  push r12
+  push r13
+  push r14
+  push r15
+  %assign push_num 4            ; four pushes so far; presumably used by the argN macros (defined elsewhere) to locate stack arguments
+
+%ifdef WIN64
+  mov r4, arg5  ;iPicStride
+  mov r5, arg6  ;psadframe
+%endif
+  mov r14,arg7                  ; r14 = psad8x8 write cursor
+  SIGN_EXTENTION r2,r2d
+  SIGN_EXTENTION r3,r3d
+  SIGN_EXTENTION r4,r4d
+
+  mov   r13,r4
+  shr   r2,4                    ; iPicWidth/16
+  shr   r3,4                    ; iPicHeight/16
+
+  shl   r13,4   ; iPicStride*16
+  pxor  xmm0,   xmm0
+  pxor  xmm7,   xmm7            ; iFrameSad; NOTE(review): xmm6/xmm7 are callee-saved under the Microsoft x64 ABI and are not preserved here - confirm
+var_height_loop:
+  push    r2                    ; r2 doubles as the inner loop counter, so keep iPicWidth/16 on the stack
+  %assign push_num push_num+1   ; account for the extra push while stack arguments are addressed
+  mov           r11,    r0      ; remember the pointers at the start of this pass
+  mov           r12,    r1
+var_width_loop:
+  pxor  xmm6,   xmm6            ; hiQuad_loQuad pSad8x8
+  pxor  xmm5,   xmm5            ; pSum16x16
+  pxor  xmm4,   xmm4            ; sqsum_16x16
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  paddd xmm7,           xmm6
+  movd  [r14],          xmm6
+  psrldq        xmm6,           8
+  movd  [r14+4],        xmm6
+
+  pxor  xmm6,   xmm6            ; cleared again; stored to [r14+8] and [r14+12] below
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+  paddd   xmm7,           xmm6
+  movd    [r14+8],        xmm6
+  psrldq  xmm6,           8
+  movd    [r14+12],       xmm6
+
+  mov             r15,    psum16x16       ; r15 = current psum16x16 write pointer
+  movdqa  xmm1,   xmm5
+  psrldq  xmm1,   8
+  paddd   xmm5,   xmm1            ; low dword = dword 0 + dword 2 of xmm5
+  movd    [r15],  xmm5
+  add             qword psum16x16, 4      ; 64-bit pointer slot: a dword add would lose the carry into the upper half
+
+  movdqa  xmm5,   xmm4
+  psrldq  xmm5,   8
+  paddd   xmm4,   xmm5
+  movdqa  xmm3,   xmm4
+  psrldq  xmm3,   4
+  paddd   xmm4,   xmm3            ; low dword = sum of all four dwords of xmm4
+
+  mov             r15,    psqsum16x16     ; r15 = current psqsum16x16 write pointer
+  movd    [r15],  xmm4
+  add             qword psqsum16x16, 4    ; 64-bit pointer slot: a dword add would lose the carry into the upper half
+
+  add             r14,16          ; four psad8x8 dwords were written in this pass
+  sub             r0,     r13     ; back up iPicStride*16 bytes (assumes the sixteen macro calls advanced 16 rows - TODO confirm)
+  sub             r1,     r13
+  add             r0,     16      ; then move 16 bytes to the right
+  add             r1,     16
+
+  dec             r2
+  jnz             var_width_loop
+
+  pop     r2                      ; restore iPicWidth/16 for the next pass
+  %assign push_num push_num-1
+  mov             r0,     r11
+  mov             r1,     r12
+  add             r0,     r13     ; saved pointer + iPicStride*16
+  add             r1,     r13
+  dec     r3
+  jnz             var_height_loop
+
+  mov             r15,    psadframe
+  movdqa  xmm5,   xmm7
+  psrldq  xmm7,   8
+  paddd   xmm7,   xmm5            ; low dword = dword 0 + dword 2 of the accumulated SAD
+  movd    [r15],  xmm7
+
+  pop r15
+  pop r14
+  pop r13
+  pop r12
+%assign push_num 0
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          psum16x16
+%undef          psqsum16x16
+%undef          tmp_esi
+%undef          tmp_edi
+%undef          pushsize
+%undef          localsize
+  ret
 
-	mov		ebp,	[psqsum16x16]
-	movd	[ebp],	xmm4
-	add		dword [psqsum16x16], 4
+%endif
 
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
+%ifdef X86_32
 
-	dec		ecx
-	jnz		var_width_loop
+WELS_EXTERN VAACalcSadSsd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;       int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
+;*************************************************************************************************************
 
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax
-	add		edi,	eax
 
-	dec	dword [iPicHeight]
-	jnz		var_height_loop
+ALIGN 16
+VAACalcSadSsd_sse2:            ; x86-32 build: every argument is addressed relative to esp through the %defines below
+%define         localsize               12
+%define         cur_data                        esp + pushsize + localsize + 4
+%define         ref_data                        esp + pushsize + localsize + 8
+%define         iPicWidth                       esp + pushsize + localsize + 12
+%define         iPicHeight                      esp + pushsize + localsize + 16
+%define         iPicStride                      esp + pushsize + localsize + 20
+%define         psadframe                       esp + pushsize + localsize + 24
+%define         psad8x8                         esp + pushsize + localsize + 28
+%define         psum16x16                       esp + pushsize + localsize + 32
+%define         psqsum16x16                     esp + pushsize + localsize + 36
+%define         psqdiff16x16            esp + pushsize + localsize + 40
+%define         tmp_esi                         esp + 0
+%define         tmp_edi                         esp + 4
+%define         tmp_sadframe            esp + 8
+%define         pushsize                16
+  push    ebp
+  push    esi
+  push    edi
+  push    ebx                      ; four pushes = 16 bytes, the value pushsize is defined as
+  sub             esp,    localsize               ; room for the three dword slots tmp_esi, tmp_edi, tmp_sadframe
 
-	mov		edx,	[psadframe]
-	movdqa	xmm5,	xmm7
-	psrldq	xmm7,	8
-	paddd	xmm7,	xmm5
-	movd	[edx],	xmm7
+  mov             ecx,    [iPicWidth]             ; NOTE(review): dead load - ecx is overwritten by the next instruction
+  mov             ecx,    [iPicHeight]            ; NOTE(review): dead load - ecx is rewritten before it is ever read
+  mov             esi,    [cur_data]              ; esi = current-picture read pointer
+  mov             edi,    [ref_data]              ; edi = reference-picture read pointer
+  mov             ebx,    [iPicStride]            ; ebx = stride, handed to the row macro below
+  mov             edx,    [psad8x8]               ; edx = psad8x8 write cursor (advanced 16 bytes per 16x16 block)
+  mov             eax,    ebx
 
-	add		esp,	localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		psum16x16
-%undef		psqsum16x16
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		pushsize
-%undef		localsize
-	ret
+  shr             dword [iPicWidth],      4                                       ; iPicWidth/16 (rewrites the argument slot in place)
+  shr             dword [iPicHeight],     4                                       ; iPicHeight/16 (rewrites the argument slot in place)
+  shl             eax,    4                                                       ; iPicStride*16
+  mov             ecx,    [iPicWidth]             ; NOTE(review): dead load - overwritten by the next instruction
+  mov             ecx,    [iPicHeight]            ; NOTE(review): dead load - ecx is reloaded at sqdiff_height_loop before any read
+  pxor    xmm0,   xmm0
+  movd    [tmp_sadframe], xmm0                    ; tmp_sadframe = 0, the running frame SAD
+sqdiff_height_loop:
+  mov             ecx,    dword [iPicWidth]       ; ecx = 16-pixel-wide blocks left in this block row
+  mov             [tmp_esi],      esi             ; remember where this block row starts (cur)
+  mov             [tmp_edi],      edi             ; remember where this block row starts (ref)
+sqdiff_width_loop:
+  pxor    xmm7,   xmm7            ; hiQuad_loQuad pSad8x8
+  pxor    xmm6,   xmm6            ; pSum16x16
+  pxor    xmm5,   xmm5            ; sqsum_16x16  four dword
+  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  movdqa  xmm1,           xmm7
+  movd    [edx],          xmm7                    ; psad8x8[0] = dword 0 of xmm7
+  psrldq  xmm7,           8
+  paddd   xmm1,           xmm7
+  movd    [edx+4],        xmm7                    ; psad8x8[1] = dword 2 of the original xmm7
+  movd    ebp,            xmm1                    ; ebp = sum of the two values just stored
+  add             [tmp_sadframe], ebp             ; fold the upper-half SADs into the frame total
 
+  pxor    xmm7,   xmm7                            ; restart the SAD accumulator for the lower 8 rows
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+  movdqa  xmm1,           xmm7
+  movd    [edx+8],        xmm7                    ; psad8x8[2] = dword 0 of xmm7
+  psrldq  xmm7,           8
+  paddd   xmm1,           xmm7
+  movd    [edx+12],       xmm7                    ; psad8x8[3] = dword 2 of the original xmm7
+  movd    ebp,            xmm1
+  add             [tmp_sadframe], ebp             ; fold the lower-half SADs into the frame total
 
+  mov             ebp,    [psum16x16]             ; ebp = current psum16x16 output pointer
+  movdqa  xmm1,   xmm6
+  psrldq  xmm1,   8
+  paddd   xmm6,   xmm1
+  movd    [ebp],  xmm6                            ; *psum16x16 = dword 0 + dword 2 of xmm6
+  add             dword [psum16x16], 4            ; advance the pointer kept in the argument slot
 
+  mov             ebp,    [psqsum16x16]           ; ebp = current psqsum16x16 output pointer
+  pshufd  xmm6,   xmm5,   14 ;00001110
+  paddd   xmm6,   xmm5
+  pshufd  xmm5,   xmm6,   1  ;00000001
+  paddd   xmm5,   xmm6
+  movd    [ebp],  xmm5                            ; *psqsum16x16 = sum of all four dwords of xmm5
+  add             dword [psqsum16x16], 4          ; advance the pointer kept in the argument slot
+
+  mov             ebp,    [psqdiff16x16]          ; ebp = current psqdiff16x16 output pointer
+  pshufd  xmm5,   xmm4,   14      ; 00001110
+  paddd   xmm5,   xmm4
+  pshufd  xmm4,   xmm5,   1       ; 00000001
+  paddd   xmm4,   xmm5
+  movd    [ebp],  xmm4                            ; *psqdiff16x16 = sum of all four dwords of xmm4
+  add             dword   [psqdiff16x16], 4       ; advance the pointer kept in the argument slot
+
+  add             edx,    16                      ; four psad8x8 dwords were written for this block
+  sub             esi,    eax                     ; step back iPicStride*16; assumes the 16 macro expansions advanced esi by that much - TODO confirm in the macro
+  sub             edi,    eax                     ; same rewind for the reference pointer
+  add             esi,    16                      ; move right to the next 16-pixel column
+  add             edi,    16
+
+  dec             ecx
+  jnz             sqdiff_width_loop
+
+  mov             esi,    [tmp_esi]               ; back to the start of this block row
+  mov             edi,    [tmp_edi]
+  add             esi,    eax                     ; down by iPicStride*16 to the next block row
+  add             edi,    eax
+
+  dec     dword [iPicHeight]                      ; the argument slot doubles as the row counter
+  jnz             sqdiff_height_loop
+
+  mov             ebx,    [tmp_sadframe]
+  mov             eax,    [psadframe]
+  mov             [eax],  ebx                     ; *psadframe = accumulated frame SAD
+
+  add             esp,    localsize               ; drop the scratch slots
+  pop             ebx
+  pop             edi
+  pop             esi
+  pop             ebp
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          psum16x16
+%undef          psqsum16x16
+%undef          psqdiff16x16
+%undef          tmp_esi
+%undef          tmp_edi
+%undef          tmp_sadframe
+%undef          pushsize
+%undef          localsize
+  ret
+
+%else
+
+
 WELS_EXTERN VAACalcSadSsd_sse2
 ;*************************************************************************************************************
 ;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-;	int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
+;       int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
 ;*************************************************************************************************************
 
 
 ALIGN 16
 VAACalcSadSsd_sse2:
-%define		localsize		12
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		psum16x16			esp + pushsize + localsize + 32
-%define		psqsum16x16			esp + pushsize + localsize + 36
-%define		psqdiff16x16		esp + pushsize + localsize + 40
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		tmp_sadframe		esp + 8
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		ecx,	[iPicWidth]
-	mov		ecx,	[iPicHeight]
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		edx,	[psad8x8]
-	mov		eax,	ebx
+%define         localsize               12
+%define         cur_data                        arg1;r0
+%define         ref_data                        arg2;r1
+%define         iPicWidth                       arg3;r2
+%define         iPicHeight                      arg4;r3
+%define         iPicStride                      arg5;
+%define         psadframe                       arg6;
+%define         psad8x8                         arg7;
+%define         psum16x16                       arg8;
+%define         psqsum16x16                     arg9;
+%define         psqdiff16x16                    arg10
 
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	mov		ecx,	[iPicWidth]
-	mov		ecx,	[iPicHeight]
-	pxor	xmm0,	xmm0
-	movd	[tmp_sadframe],	xmm0
+  push r12                       ; 64-bit build: arguments come through the argN macros. NOTE(review): r12 is saved/restored but never used here
+  push r13                       ; r13 will hold iPicStride*16
+  push r14                       ; r14 will hold the psad8x8 write cursor
+  push r15                       ; r15 is scratch for block SADs and output pointers
+  %assign push_num 4
+
+%ifdef WIN64
+  mov r4,arg5                    ; WIN64 only: fetch iPicStride (arg5) into r4; presumably it is already in r4 on the other ABI - confirm in the macro definitions
+%endif
+  mov r14,arg7                   ; r14 = psad8x8 write cursor (advanced 16 bytes per 16x16 block)
+  SIGN_EXTENTION r2,r2d
+  SIGN_EXTENTION r3,r3d
+  SIGN_EXTENTION r4,r4d
+
+  mov        r13,r4
+  shr     r2,4   ; iPicWidth/16
+  shr     r3,4   ; iPicHeight/16
+  shl     r13,4   ; iPicStride*16
+  pxor    xmm0,   xmm0
+  pxor  xmm8, xmm8  ;framesad (running frame SAD, replaces the 32-bit tmp_sadframe stack slot)
+  pxor  xmm9, xmm9  ; staging register for each block SAD. NOTE(review): xmm6-xmm9 are written in this function without being saved; the Microsoft x64 ABI treats xmm6-xmm15 as callee-saved - confirm for the WIN64 build
 sqdiff_height_loop:
-	mov		ecx,	dword [iPicWidth]
-	mov		[tmp_esi],	esi
-	mov		[tmp_edi],	edi
+  ;mov            ecx,    dword [iPicWidth]
+  ;mov      r14,r2
+  push r2                        ; keep the per-row block count; r2 itself is counted down in sqdiff_width_loop
+  %assign push_num push_num +1
+  mov             r10,    r0     ; remember where this block row starts (cur)
+  mov             r11,    r1     ; remember where this block row starts (ref)
 sqdiff_width_loop:
-	pxor	xmm7,	xmm7		; hiQuad_loQuad pSad8x8
-	pxor	xmm6,	xmm6		; pSum16x16
-	pxor	xmm5,	xmm5		; sqsum_16x16  four dword
-	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	movdqa	xmm1,		xmm7
-	movd	[edx],		xmm7
-	psrldq	xmm7,		8
-	paddd	xmm1,		xmm7
-	movd	[edx+4],	xmm7
-	movd	ebp,		xmm1
-	add		[tmp_sadframe],	ebp
+  pxor    xmm7,   xmm7            ; hiQuad_loQuad pSad8x8
+  pxor    xmm6,   xmm6            ; pSum16x16
+  pxor    xmm5,   xmm5            ; sqsum_16x16  four dword
+  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  movdqa  xmm1,           xmm7
+  movd    [r14],          xmm7                    ; psad8x8[0] = dword 0 of xmm7
+  psrldq  xmm7,           8
+  paddd   xmm1,           xmm7
+  movd    [r14+4],        xmm7                    ; psad8x8[1] = dword 2 of the original xmm7
+  movd    r15d,           xmm1                    ; r15d = sum of the two values just stored
+  movd  xmm9, r15d                                ; back into an xmm register with the upper lanes zeroed
+  paddd xmm8,xmm9                                 ; fold the upper-half SADs into the frame total
 
-	pxor	xmm7,	xmm7
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	movdqa	xmm1,		xmm7
-	movd	[edx+8],	xmm7
-	psrldq	xmm7,		8
-	paddd	xmm1,		xmm7
-	movd	[edx+12],	xmm7
-	movd	ebp,		xmm1
-	add		[tmp_sadframe],	ebp
 
-	mov		ebp,	[psum16x16]
-	movdqa	xmm1,	xmm6
-	psrldq	xmm1,	8
-	paddd	xmm6,	xmm1
-	movd	[ebp],	xmm6
-	add		dword [psum16x16], 4
+  pxor    xmm7,   xmm7                            ; restart the SAD accumulator for the lower 8 rows
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+  movdqa  xmm1,           xmm7
+  movd    [r14+8],        xmm7                    ; psad8x8[2] = dword 0 of xmm7
+  psrldq  xmm7,           8
+  paddd   xmm1,           xmm7
+  movd    [r14+12],       xmm7                    ; psad8x8[3] = dword 2 of the original xmm7
+  movd    r15d,           xmm1
+  movd  xmm9, r15d
+  paddd xmm8,xmm9                                 ; fold the lower-half SADs into the frame total
 
-	mov		ebp,	[psqsum16x16]
-	pshufd	xmm6,	xmm5,	14 ;00001110
-	paddd	xmm6,	xmm5
-	pshufd	xmm5,	xmm6,	1  ;00000001
-	paddd	xmm5,	xmm6
-	movd	[ebp],	xmm5
-	add		dword [psqsum16x16], 4
+  mov             r15,    psum16x16               ; r15 = current psum16x16 output pointer (full 64-bit load)
+  movdqa  xmm1,   xmm6
+  psrldq  xmm1,   8
+  paddd   xmm6,   xmm1
+  movd    [r15],  xmm6                            ; *psum16x16 = dword 0 + dword 2 of xmm6
+  add             qword psum16x16, 4              ; advance the stored 64-bit pointer; a dword add would drop the carry into the upper half
 
-	mov		ebp,	[psqdiff16x16]
-	pshufd	xmm5,	xmm4,	14	; 00001110
-	paddd	xmm5,	xmm4
-	pshufd	xmm4,	xmm5,	1	; 00000001
-	paddd	xmm4,	xmm5
-	movd	[ebp],	xmm4
-	add		dword	[psqdiff16x16],	4
+  mov             r15,    psqsum16x16             ; r15 = current psqsum16x16 output pointer
+  pshufd  xmm6,   xmm5,   14 ;00001110
+  paddd   xmm6,   xmm5
+  pshufd  xmm5,   xmm6,   1  ;00000001
+  paddd   xmm5,   xmm6
+  movd    [r15],  xmm5                            ; *psqsum16x16 = sum of all four dwords of xmm5
+  add             qword psqsum16x16, 4            ; advance the stored 64-bit pointer (qword so no carry is lost)
 
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
+  mov             r15,    psqdiff16x16            ; r15 = current psqdiff16x16 output pointer
+  pshufd  xmm5,   xmm4,   14      ; 00001110
+  paddd   xmm5,   xmm4
+  pshufd  xmm4,   xmm5,   1       ; 00000001
+  paddd   xmm4,   xmm5
+  movd    [r15],  xmm4                            ; *psqdiff16x16 = sum of all four dwords of xmm4
+  add             qword   psqdiff16x16,   4       ; advance the stored 64-bit pointer (qword so no carry is lost)
 
-	dec		ecx
-	jnz		sqdiff_width_loop
+  add             r14,16                          ; four psad8x8 dwords were written for this block
+  sub             r0,     r13                     ; step back iPicStride*16; assumes the 16 macro expansions advanced r0 by that much - TODO confirm in the macro
+  sub             r1,     r13                     ; same rewind for the reference pointer
+  add             r0,     16                      ; move right to the next 16-pixel column
+  add             r1,     16
+
+  dec             r2
+  jnz             sqdiff_width_loop
+
+  pop r2                                          ; restore the per-row block count for the next block row
+  %assign push_num push_num -1
+
+  mov             r0,     r10                     ; back to the start of this block row
+  mov             r1,     r11
+  add             r0,     r13                     ; down by iPicStride*16 to the next block row
+  add             r1,     r13
+
+  dec     r3
+  jnz             sqdiff_height_loop
+
+  mov             r13,    psadframe
+  movd    [r13],  xmm8                            ; *psadframe = accumulated frame SAD (low dword of xmm8)
+
+  pop r15
+  pop r14
+  pop r13
+  pop r12
+  %assign push_num 0
 
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax
-	add		edi,	eax
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          psum16x16
+%undef          psqsum16x16
+%undef          psqdiff16x16
+%undef          tmp_esi
+%undef          tmp_edi
+%undef          tmp_sadframe
+%undef          pushsize
+%undef          localsize
+  ret
 
-	dec	dword [iPicHeight]
-	jnz		sqdiff_height_loop
 
-	mov		ebx,	[tmp_sadframe]
-	mov		eax,	[psadframe]
-	mov		[eax],	ebx
 
-	add		esp,	localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		psum16x16
-%undef		psqsum16x16
-%undef		psqdiff16x16
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		tmp_sadframe
-%undef		pushsize
-%undef		localsize
-	ret
+%endif
 
+%ifdef X86_32
+WELS_EXTERN VAACalcSadBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
 
 
+ALIGN 16
+VAACalcSadBgd_sse2:            ; x86-32 build: every argument is addressed relative to esp through the %defines below
+%define         localsize               12
+%define         cur_data                        esp + pushsize + localsize + 4
+%define         ref_data                        esp + pushsize + localsize + 8
+%define         iPicWidth                       esp + pushsize + localsize + 12
+%define         iPicHeight                      esp + pushsize + localsize + 16
+%define         iPicStride                      esp + pushsize + localsize + 20
+%define         psadframe                       esp + pushsize + localsize + 24
+%define         psad8x8                         esp + pushsize + localsize + 28
+%define         p_sd8x8                         esp + pushsize + localsize + 32
+%define         p_mad8x8                        esp + pushsize + localsize + 36
+%define         tmp_esi                         esp + 0
+%define         tmp_edi                         esp + 4
+%define         tmp_ecx                         esp + 8
+%define         pushsize                16
+  push    ebp
+  push    esi
+  push    edi
+  push    ebx                      ; four pushes = 16 bytes, the value pushsize is defined as
+  sub             esp,    localsize               ; room for the three dword slots tmp_esi, tmp_edi, tmp_ecx
+  mov             esi,    [cur_data]              ; esi = current-picture read pointer
+  mov             edi,    [ref_data]              ; edi = reference-picture read pointer
+  mov             ebx,    [iPicStride]            ; ebx = stride, handed to the row macro below
+  mov             eax,    ebx
 
+  shr             dword [iPicWidth],      4                                       ; iPicWidth/16
+  shr             dword [iPicHeight],     4                                       ; iPicHeight/16
+  shl             eax,    4                                                       ; iPicStride*16
+  xor             ebp,    ebp                     ; ebp = running frame SAD
+  pxor    xmm0,   xmm0
+bgd_height_loop:
+  mov             ecx,    dword [iPicWidth]       ; ecx = 16-pixel-wide blocks left in this block row
+  mov             [tmp_esi],      esi             ; remember where this block row starts (cur)
+  mov             [tmp_edi],      edi             ; remember where this block row starts (ref)
+bgd_width_loop:
+  pxor    xmm7,   xmm7            ; pSad8x8
+  pxor    xmm6,   xmm6            ; sum_cur_8x8
+  pxor    xmm5,   xmm5            ; sum_ref_8x8
+  pxor    xmm4,   xmm4            ; pMad8x8
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
 
+
+  mov                     edx,            [p_mad8x8]      ; edx = current p_mad8x8 output pointer
+  WELS_MAX_REG_SSE2       xmm4
+
+  ;movdqa         xmm1,   xmm4
+  ;punpcklbw      xmm1,   xmm0
+  ;punpcklwd      xmm1,   xmm0
+  ;movd           [edx],  xmm1
+  ;punpckhbw      xmm4,   xmm0
+  ;punpcklwd      xmm4,   xmm0
+  ;movd           [edx+4],        xmm4
+  ;add                    edx,            8
+  ;mov                    [p_mad8x8],     edx
+  mov                     [tmp_ecx],      ecx     ; spill the width counter: ecx/cl is borrowed for the byte stores below
+  movhlps         xmm1,   xmm4                    ; xmm1 low qword = xmm4 high qword
+  movd            ecx,    xmm4
+  mov                     [edx],  cl              ; p_mad8x8[0] = lowest byte of xmm4
+  movd            ecx,    xmm1
+  mov                     [edx+1],cl              ; p_mad8x8[1] = lowest byte of xmm4's high qword
+  add                     edx,    2
+  mov                     [p_mad8x8],     edx     ; write the advanced pointer back to the argument slot
+
+
+  pslldq          xmm7,   4                       ; shift the upper-half results up one dword; presumably frees the low dwords for the next 8 rows - confirm against the macro
+  pslldq          xmm6,   4
+  pslldq          xmm5,   4
+
+
+  pxor    xmm4,   xmm4            ; pMad8x8
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+
+  mov                     edx,            [p_mad8x8]
+  WELS_MAX_REG_SSE2       xmm4
+
+  ;movdqa         xmm1,   xmm4
+  ;punpcklbw      xmm1,   xmm0
+  ;punpcklwd      xmm1,   xmm0
+  ;movd           [edx],  xmm1
+  ;punpckhbw      xmm4,   xmm0
+  ;punpcklwd      xmm4,   xmm0
+  ;movd           [edx+4],        xmm4
+  ;add                    edx,            8
+  ;mov                    [p_mad8x8],     edx
+  movhlps         xmm1,   xmm4                    ; ecx is still spilled in tmp_ecx, so cl can be reused here
+  movd            ecx,    xmm4
+  mov                     [edx],  cl              ; next p_mad8x8 byte = lowest byte of xmm4
+  movd            ecx,    xmm1
+  mov                     [edx+1],cl              ; following byte = lowest byte of xmm4's high qword
+  add                     edx,    2
+  mov                     [p_mad8x8],     edx
+
+  ; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
+
+  mov             edx,    [psad8x8]
+  pshufd  xmm1,   xmm7,   10001101b               ; D3 D2 D1 D0
+  movdqa  [edx],  xmm1                            ; aligned 16-byte store: requires psad8x8 to be 16-byte aligned at this point
+  add             edx,    16
+  mov             [psad8x8],      edx                                     ; sad8x8
+
+  paddd   xmm1,   xmm7                                    ; D1+3 D3+2 D0+1 D2+0
+  pshufd  xmm2,   xmm1,   00000011b               ; bring the top dword (D1+D3) down to dword 0
+  paddd   xmm1,   xmm2                            ; dword 0 = D0+D1+D2+D3
+  movd    edx,    xmm1
+  add             ebp,    edx                                             ; sad frame
+
+  mov             edx,    [p_sd8x8]
+  psubd   xmm6,   xmm5                            ; per-8x8 difference: sum_cur - sum_ref
+  pshufd  xmm1,   xmm6,   10001101b               ; same reorder as for psad8x8 above
+  movdqa  [edx],  xmm1                            ; aligned 16-byte store: requires p_sd8x8 to be 16-byte aligned at this point
+  add             edx,    16
+  mov             [p_sd8x8],      edx
+
+
+  add             edx,    16                      ; NOTE(review): dead - edx was already written back and is reloaded before its next read
+  sub             esi,    eax                     ; step back iPicStride*16; assumes the 16 macro expansions advanced esi by that much - TODO confirm in the macro
+  sub             edi,    eax
+  add             esi,    16                      ; move right to the next 16-pixel column
+  add             edi,    16
+
+  mov             ecx,    [tmp_ecx]               ; restore the width counter spilled above
+  dec             ecx
+  jnz             bgd_width_loop
+
+  mov             esi,    [tmp_esi]               ; back to the start of this block row
+  mov             edi,    [tmp_edi]
+  add             esi,    eax                     ; down by iPicStride*16 to the next block row
+  add             edi,    eax
+
+  dec             dword [iPicHeight]              ; the argument slot doubles as the row counter
+  jnz             bgd_height_loop
+
+  mov             edx,    [psadframe]
+  mov             [edx],  ebp                     ; *psadframe = accumulated frame SAD
+
+  add             esp,    localsize               ; drop the scratch slots. NOTE(review): the %undef list below omits tmp_ecx
+  pop             ebx
+  pop             edi
+  pop             esi
+  pop             ebp
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          p_sd8x8
+%undef          p_mad8x8
+%undef          tmp_esi
+%undef          tmp_edi
+%undef          pushsize
+%undef          localsize
+  ret
+
+
+
+WELS_EXTERN VAACalcSadSsdBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;                int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
+;                       int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSadSsdBgd_sse2:         ; x86-32 build: every argument is addressed relative to esp through the %defines below
+%define         localsize               16
+%define         cur_data                        esp + pushsize + localsize + 4
+%define         ref_data                        esp + pushsize + localsize + 8
+%define         iPicWidth                       esp + pushsize + localsize + 12
+%define         iPicHeight                      esp + pushsize + localsize + 16
+%define         iPicStride                      esp + pushsize + localsize + 20
+%define         psadframe                       esp + pushsize + localsize + 24
+%define         psad8x8                         esp + pushsize + localsize + 28
+%define         psum16x16                       esp + pushsize + localsize + 32
+%define         psqsum16x16                     esp + pushsize + localsize + 36
+%define         psqdiff16x16            esp + pushsize + localsize + 40
+%define         p_sd8x8                         esp + pushsize + localsize + 44
+%define         p_mad8x8                        esp + pushsize + localsize + 48
+%define         tmp_esi                         esp + 0
+%define         tmp_edi                         esp + 4
+%define         tmp_sadframe            esp + 8
+%define         tmp_ecx                         esp + 12
+%define         pushsize                16
+  push    ebp
+  push    esi
+  push    edi
+  push    ebx                      ; four pushes = 16 bytes, the value pushsize is defined as
+  sub             esp,    localsize               ; room for the four dword slots tmp_esi, tmp_edi, tmp_sadframe, tmp_ecx
+  mov             esi,    [cur_data]              ; esi = current-picture read pointer
+  mov             edi,    [ref_data]              ; edi = reference-picture read pointer
+  mov             ebx,    [iPicStride]            ; ebx = stride, handed to the row macro below
+  mov             eax,    ebx
+
+  shr             dword [iPicWidth],      4                                       ; iPicWidth/16
+  shr             dword [iPicHeight],     4                                       ; iPicHeight/16
+  shl             eax,    4                                                       ; iPicStride*16
+  pxor    xmm0,   xmm0
+  movd    [tmp_sadframe], xmm0                    ; tmp_sadframe = 0, the running frame SAD
+sqdiff_bgd_height_loop:
+  mov             ecx,    dword [iPicWidth]       ; ecx = 16-pixel-wide blocks left in this block row
+  mov             [tmp_esi],      esi             ; remember where this block row starts (cur)
+  mov             [tmp_edi],      edi             ; remember where this block row starts (ref)
+sqdiff_bgd_width_loop:
+  pxor    xmm7,   xmm7            ; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
+  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+  pxor    xmm5,   xmm5            ; pMad8x8
+  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+
+  mov             edx,            [psad8x8]
+  movdqa  xmm2,           xmm7
+  pshufd  xmm1,           xmm2,           00001110b       ; xmm1 dword 0 = dword 2 of xmm7 (sad1 in the layout noted above)
+  movd    [edx],          xmm2                    ; psad8x8[0] = dword 0 of xmm7 (sad0)
+  movd    [edx+4],        xmm1                    ; psad8x8[1] = sad1
+  add             edx,            8
+  mov             [psad8x8],      edx                     ; sad8x8
+
+  paddd   xmm1,                           xmm2    ; dword 0 = sad0 + sad1
+  movd    edx,                            xmm1
+  add             [tmp_sadframe],         edx                     ; iFrameSad
+
+  mov             edx,            [psum16x16]
+  movdqa  xmm1,           xmm6
+  pshufd  xmm2,           xmm1,           00001110b       ; xmm2 dword 0 = Scur1
+  paddd   xmm1,           xmm2                    ; dword 0 = Scur0 + Scur1
+  movd    [edx],          xmm1                            ; sum (upper 8 rows only; the pointer is not advanced until the lower half adds its share)
+
+  mov             edx,            [p_sd8x8]
+  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
+  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
+  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
+  movq    [edx],          xmm1                    ; store the two per-8x8 differences (8 bytes)
+  add             edx,            8
+  mov             [p_sd8x8],      edx
+
+  mov                     edx,            [p_mad8x8]
+  WELS_MAX_REG_SSE2       xmm5
+  ;movdqa         xmm1,   xmm5
+  ;punpcklbw      xmm1,   xmm0
+  ;punpcklwd      xmm1,   xmm0
+  ;movd           [edx],  xmm1
+  ;punpckhbw      xmm5,   xmm0
+  ;punpcklwd      xmm5,   xmm0
+  ;movd           [edx+4],        xmm5
+  ;add                    edx,            8
+  ;mov                    [p_mad8x8],     edx
+  mov                     [tmp_ecx],      ecx     ; spill the width counter: ecx/cl is borrowed for the byte stores below
+  movhlps         xmm1,   xmm5                    ; xmm1 low qword = xmm5 high qword
+  movd            ecx,    xmm5
+  mov                     [edx],  cl              ; p_mad8x8[0] = lowest byte of xmm5
+  movd            ecx,    xmm1
+  mov                     [edx+1],cl              ; p_mad8x8[1] = lowest byte of xmm5's high qword
+  add                     edx,    2
+  mov                     [p_mad8x8],     edx
+
+  psrlq   xmm7,   32
+  psllq   xmm7,   32                      ; clear sad
+  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+  pxor    xmm5,   xmm5            ; pMad8x8
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+
+  mov             edx,            [psad8x8]
+  movdqa  xmm2,           xmm7
+  pshufd  xmm1,           xmm2,           00001110b       ; xmm1 dword 0 = dword 2 of xmm7
+  movd    [edx],          xmm2                    ; next psad8x8 entry = dword 0 of xmm7
+  movd    [edx+4],        xmm1                    ; following entry = dword 2 of xmm7
+  add             edx,            8
+  mov             [psad8x8],      edx                     ; sad8x8
+
+  paddd   xmm1,                           xmm2
+  movd    edx,                            xmm1
+  add             [tmp_sadframe],         edx                     ; iFrameSad
+
+  mov             edx,                    [psum16x16]
+  movdqa  xmm1,                   xmm6
+  pshufd  xmm2,                   xmm1,           00001110b
+  paddd   xmm1,                   xmm2
+  movd    ebp,                    xmm1                            ; sum
+  add             [edx],                  ebp     ; add the lower-half sum onto the upper-half sum stored earlier
+  add             edx,                    4
+  mov             [psum16x16],    edx             ; the 16x16 sum is complete, so advance the pointer now
+
+  mov             edx,                    [psqsum16x16]
+  psrlq   xmm7,                   32              ; move each qword's high dword (sqsum) into its low dword
+  pshufd  xmm2,                   xmm7,           00001110b
+  paddd   xmm2,                   xmm7            ; dword 0 = sqsum0 + sqsum1
+  movd    [edx],                  xmm2                            ; sqsum
+  add             edx,                    4
+  mov             [psqsum16x16],  edx
+
+  mov             edx,            [p_sd8x8]
+  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
+  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
+  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
+  movq    [edx],          xmm1                    ; store the two per-8x8 differences (8 bytes)
+  add             edx,            8
+  mov             [p_sd8x8],      edx
+
+  mov             edx,            [p_mad8x8]
+  WELS_MAX_REG_SSE2       xmm5
+  ;movdqa         xmm1,   xmm5
+  ;punpcklbw      xmm1,   xmm0
+  ;punpcklwd      xmm1,   xmm0
+  ;movd           [edx],  xmm1
+  ;punpckhbw      xmm5,   xmm0
+  ;punpcklwd      xmm5,   xmm0
+  ;movd           [edx+4],        xmm5
+  ;add                    edx,            8
+  ;mov                    [p_mad8x8],     edx
+  movhlps         xmm1,   xmm5                    ; ecx is still spilled in tmp_ecx, so cl can be reused here
+  movd            ecx,    xmm5
+  mov                     [edx],  cl              ; next p_mad8x8 byte = lowest byte of xmm5
+  movd            ecx,    xmm1
+  mov                     [edx+1],cl              ; following byte = lowest byte of xmm5's high qword
+  add                     edx,    2
+  mov                     [p_mad8x8],     edx
+
+  mov             edx,            [psqdiff16x16]
+  pshufd  xmm1,           xmm4,           00001110b
+  paddd   xmm4,           xmm1
+  pshufd  xmm1,           xmm4,           00000001b
+  paddd   xmm4,           xmm1
+  movd    [edx],          xmm4                    ; *psqdiff16x16 = sum of all four dwords of xmm4
+  add             edx,            4
+  mov             [psqdiff16x16], edx
+
+  add             edx,    16                      ; NOTE(review): dead - edx was already written back and is reloaded before its next read
+  sub             esi,    eax                     ; step back iPicStride*16; assumes the 16 macro expansions advanced esi by that much - TODO confirm in the macro
+  sub             edi,    eax
+  add             esi,    16                      ; move right to the next 16-pixel column
+  add             edi,    16
+
+  mov             ecx,    [tmp_ecx]               ; restore the width counter spilled above
+  dec             ecx
+  jnz             sqdiff_bgd_width_loop
+
+  mov             esi,    [tmp_esi]               ; back to the start of this block row
+  mov             edi,    [tmp_edi]
+  add             esi,    eax                     ; down by iPicStride*16 to the next block row
+  add             edi,    eax
+
+  dec     dword [iPicHeight]                      ; the argument slot doubles as the row counter
+  jnz             sqdiff_bgd_height_loop
+
+  mov             edx,    [psadframe]
+  mov             ebp,    [tmp_sadframe]
+  mov             [edx],  ebp                     ; *psadframe = accumulated frame SAD
+
+  add             esp,    localsize               ; drop the scratch slots. NOTE(review): the %undef list below omits tmp_sadframe and tmp_ecx
+  pop             ebx
+  pop             edi
+  pop             esi
+  pop             ebp
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          psum16x16
+%undef          psqsum16x16
+%undef          psqdiff16x16
+%undef          p_sd8x8
+%undef          p_mad8x8
+%undef          tmp_esi
+%undef          tmp_edi
+%undef          pushsize
+%undef          localsize
+   ret
+%else
+
 WELS_EXTERN VAACalcSadBgd_sse2
 ;*************************************************************************************************************
 ;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-;				int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;                               int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
 ;*************************************************************************************************************
 
 
 ALIGN 16
 VAACalcSadBgd_sse2:
-%define		localsize		12
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		p_sd8x8				esp + pushsize + localsize + 32
-%define		p_mad8x8			esp + pushsize + localsize + 36
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		tmp_ecx				esp + 8
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		eax,	ebx
+%define         cur_data                        arg1;
+%define         ref_data                        arg2;
+%define         iPicWidth                       arg3;
+%define         iPicHeight                      arg4;
+%define         iPicStride                      arg5;
+%define         psadframe                       arg6;
+%define         psad8x8                         arg7;
+%define         p_sd8x8                         arg8;
+%define         p_mad8x8                        arg9;
 
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	xor		ebp,	ebp
-	pxor	xmm0,	xmm0
+  push r12                 ; save callee-saved r12-r15; popped in reverse order before ret
+  push r13
+  push r14
+  push r15                 ; four pushes so far, recorded in push_num on the next line
+%assign push_num 4
+%ifdef WIN64
+  mov r4,arg5              ; WIN64 only: load iPicStride (arg5) into r4
+  ;  mov r5,arg6
+%endif
+  mov r14,arg7             ; r14 = psad8x8; NOTE(review): r14 is reloaded in bgd_width_loop before any visible read - looks redundant
+  SIGN_EXTENTION r2,r2d    ; r2 = iPicWidth (width loop counter); macro defined elsewhere, presumably widens int32 to 64 bits
+  SIGN_EXTENTION r3,r3d    ; r3 = iPicHeight (height loop counter)
+  SIGN_EXTENTION r4,r4d    ; r4 = iPicStride
+
+
+  mov     r13,r4
+  mov     r15,r0           ; r15 = copy of r0 (presumably cur_data); r0 itself is reused as scratch (r0d/r0b) inside the loop
+  shr     r2,4             ; iPicWidth/16
+  shr     r3,4             ; iPicHeight/16
+  shl     r13,4            ; r13 = iPicStride*16
+  pxor    xmm0,   xmm0
+  pxor    xmm8,   xmm8     ; xmm8 accumulates the frame SAD; its low dword is stored through psadframe at the end
+  pxor    xmm9,   xmm9     ; xmm9 = scratch; NOTE(review): xmm6-xmm9 are callee-saved in the Microsoft x64 ABI and are not preserved here - confirm for WIN64
 bgd_height_loop:
-	mov		ecx,	dword [iPicWidth]
-	mov		[tmp_esi],	esi
-	mov		[tmp_edi],	edi
+  ; r2 (iPicWidth/16) is the per-row block counter: save it here, restore it with pop r2 after bgd_width_loop
+  push r2                                 ; push_num is bumped next; presumably keeps rsp-relative argN macros correct - confirm in the macro include
+  %assign push_num push_num+1
+  mov             r10,    r15             ; remember this row's start pointers (r15, r1); restored after bgd_width_loop
+  mov             r11,    r1
 bgd_width_loop:
-	pxor	xmm7,	xmm7		; pSad8x8
-	pxor	xmm6,	xmm6		; sum_cur_8x8
-	pxor	xmm5,	xmm5		; sum_ref_8x8
-	pxor	xmm4,	xmm4		; pMad8x8
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+  pxor    xmm7,   xmm7            ; pSad8x8
+  pxor    xmm6,   xmm6            ; sum_cur_8x8
+  pxor    xmm5,   xmm5            ; sum_ref_8x8
+  pxor    xmm4,   xmm4            ; pMad8x8
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
 
 
-	mov			edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm4
+  mov                     r14,            p_mad8x8
+  WELS_MAX_REG_SSE2       xmm4
 
-	;movdqa		xmm1,	xmm4
-	;punpcklbw	xmm1,	xmm0
-	;punpcklwd	xmm1,	xmm0
-	;movd		[edx],	xmm1
-	;punpckhbw	xmm4,	xmm0
-	;punpcklwd	xmm4,	xmm0
-	;movd		[edx+4],	xmm4
-	;add			edx,		8
-	;mov			[p_mad8x8],	edx
-	mov			[tmp_ecx],	ecx
-	movhlps		xmm1,	xmm4
-	movd		ecx,	xmm4
-	mov			[edx],	cl
-	movd		ecx,	xmm1
-	mov			[edx+1],cl
-	add			edx,	2
-	mov			[p_mad8x8],	edx
+  ;mov                    [tmp_ecx],      ecx
+  movhlps         xmm1,   xmm4
+  movd            r0d,    xmm4
 
 
-	pslldq		xmm7,	4
-	pslldq		xmm6,	4
-	pslldq		xmm5,	4
+  mov                     [r14],  r0b
+  movd            r0d,    xmm1
+  mov                     [r14+1],r0b
+  add                     r14,    2
+  ;mov                     p_mad8x8,       r14
 
 
-	pxor	xmm4,	xmm4		; pMad8x8
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+  pslldq          xmm7,   4
+  pslldq          xmm6,   4
+  pslldq          xmm5,   4
 
-	mov			edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm4
 
-	;movdqa		xmm1,	xmm4
-	;punpcklbw	xmm1,	xmm0
-	;punpcklwd	xmm1,	xmm0
-	;movd		[edx],	xmm1
-	;punpckhbw	xmm4,	xmm0
-	;punpcklwd	xmm4,	xmm0
-	;movd		[edx+4],	xmm4
-	;add			edx,		8
-	;mov			[p_mad8x8],	edx
-	movhlps		xmm1,	xmm4
-	movd		ecx,	xmm4
-	mov			[edx],	cl
-	movd		ecx,	xmm1
-	mov			[edx+1],cl
-	add			edx,	2
-	mov			[p_mad8x8],	edx
+  pxor    xmm4,   xmm4            ; pMad8x8
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
 
-	; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
+  ;mov                     r14,            [p_mad8x8]
+  WELS_MAX_REG_SSE2       xmm4
 
-	mov		edx,	[psad8x8]
-	pshufd	xmm1,	xmm7,	10001101b		; D3 D2 D1 D0
-	movdqa	[edx],	xmm1
-	add		edx,	16
-	mov		[psad8x8],	edx					; sad8x8
+  movhlps         xmm1,   xmm4
+  movd            r0d,    xmm4
+  mov                     [r14],  r0b
+  movd            r0d,    xmm1
+  mov                     [r14+1],r0b
+  add                     r14,    2
+  mov                     p_mad8x8,       r14
 
-	paddd	xmm1,	xmm7					; D1+3 D3+2 D0+1 D2+0
-	pshufd	xmm2,	xmm1,	00000011b
-	paddd	xmm1,	xmm2
-	movd	edx,	xmm1
-	add		ebp,	edx						; sad frame
+  ; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
 
-	mov		edx,	[p_sd8x8]
-	psubd	xmm6,	xmm5
-	pshufd	xmm1,	xmm6,	10001101b
-	movdqa	[edx],	xmm1
-	add		edx,	16
-	mov		[p_sd8x8],	edx
+  mov             r14,    psad8x8
+  pshufd  xmm1,   xmm7,   10001101b               ; D3 D2 D1 D0
+  movdqa  [r14],  xmm1
+  add             r14,    16
+  mov             psad8x8,        r14                                     ; sad8x8
 
+  paddd   xmm1,   xmm7                                    ; D1+3 D3+2 D0+1 D2+0
+  pshufd  xmm2,   xmm1,   00000011b                       ; xmm2 dword0 = xmm1 dword3 (remaining lanes = xmm1 dword0)
+  paddd   xmm1,   xmm2                                    ; low dword of xmm1 = xmm1 dword0 + dword3
+  movd    r14d,   xmm1                                    ; take the low dword only
+  movd    xmm9, r14d                                      ; xmm9 = that dword with the upper lanes zeroed
+  paddd   xmm8,   xmm9                                            ; sad frame
 
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
+  mov             r14,    p_sd8x8
+  psubd   xmm6,   xmm5
+  pshufd  xmm1,   xmm6,   10001101b
+  movdqa  [r14],  xmm1
+  add             r14,    16
+  mov             p_sd8x8,        r14
 
-	mov		ecx,	[tmp_ecx]
-	dec		ecx
-	jnz		bgd_width_loop
 
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax
-	add		edi,	eax
+  ;add            edx,    16
+  sub             r15,    r13             ; step both pointers back by iPicStride*16 (r13); assumes the 16 macro invocations advanced them by that much - confirm
+  sub             r1,     r13
+  add             r15,    16              ; then move 16 bytes to the right
+  add             r1,     16
 
-	dec		dword [iPicHeight]
-	jnz		bgd_height_loop
 
-	mov		edx,	[psadframe]
-	mov		[edx],	ebp
+  dec             r2                      ; one fewer block left in this row
+  jnz             bgd_width_loop
+  pop     r2                              ; restore the per-row block count pushed at bgd_height_loop
+%assign push_num push_num-1
+  mov             r15,    r10             ; back to this row's start pointers (saved in r10/r11)
+  mov             r1,     r11
+  add             r15,    r13             ; advance both pointers by iPicStride*16 (r13)
+  add             r1,     r13
 
-	add		esp,	localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		p_sd8x8
-%undef		p_mad8x8
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		pushsize
-%undef		localsize
-	ret
+  dec             r3
+  jnz             bgd_height_loop
 
+  mov             r13,    psadframe
+  movd    [r13],  xmm8
 
+  pop r15                  ; restore callee-saved registers in reverse push order
+  pop r14
+  pop r13
+  pop r12
+%assign push_num 0
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          p_sd8x8
+%undef          p_mad8x8
+%undef          tmp_esi
+%undef          tmp_edi
+%undef          pushsize
+%undef          localsize
+  ret
 
+
+
 WELS_EXTERN VAACalcSadSsdBgd_sse2
 ;*************************************************************************************************************
 ;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-;		 int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
-;			int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;                int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
+;                       int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
 ;*************************************************************************************************************
 
 
 ALIGN 16
 VAACalcSadSsdBgd_sse2:
-%define		localsize		16
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		psum16x16			esp + pushsize + localsize + 32
-%define		psqsum16x16			esp + pushsize + localsize + 36
-%define		psqdiff16x16		esp + pushsize + localsize + 40
-%define		p_sd8x8				esp + pushsize + localsize + 44
-%define		p_mad8x8			esp + pushsize + localsize + 48
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		tmp_sadframe		esp + 8
-%define		tmp_ecx				esp + 12
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		eax,	ebx
+%define         cur_data                        arg1;
+%define         ref_data                        arg2;
+%define         iPicWidth                       arg3;
+%define         iPicHeight                      arg4;
+%define         iPicStride                      arg5;
+%define         psadframe                       arg6;
+%define         psad8x8                         arg7;
+%define         psum16x16                       arg8;
+%define         psqsum16x16                     arg9;
+%define         psqdiff16x16                    arg10;
+%ifdef WIN64
+%define         p_sd8x8                         [rsp + push_num*8 + 88]; arg11: 8 (return address) + 10 earlier 8-byte slots (4 shadow + args 5-10) above the entry rsp
+%define         p_mad8x8                        [rsp + push_num*8 + 96]; arg12: next slot; push_num is expanded at each use, so the offset follows the pushes in effect there
+%else ;linux
+%define         p_sd8x8                         [rsp + push_num*8 + 40]; arg11: 8 (return address) + 4 earlier stack slots (args 7-10; args 1-6 are passed in registers)
+%define         p_mad8x8                        [rsp + push_num*8 + 48]; arg12: next slot; push_num is expanded at each use, so the offset follows the pushes in effect there
+%endif
 
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	pxor	xmm0,	xmm0
-	movd	[tmp_sadframe],	xmm0
+  push r12                 ; save callee-saved r12-r15; NOTE(review): no use of r12 is visible in this function body (macro internals not shown)
+  push r13
+  push r14
+  push r15                 ; four pushes so far, recorded in push_num on the next line
+%assign push_num 4
+%ifdef WIN64
+  mov r4,arg5              ; WIN64 only: load iPicStride (arg5) into r4
+  ;mov r5,arg6
+%endif
+  SIGN_EXTENTION r2,r2d    ; r2 = iPicWidth (width loop counter); macro defined elsewhere, presumably widens int32 to 64 bits
+  SIGN_EXTENTION r3,r3d    ; r3 = iPicHeight (height loop counter)
+  SIGN_EXTENTION r4,r4d    ; r4 = iPicStride
+
+  mov     r13,r4
+  shr             r2,     4                                       ; iPicWidth/16
+  shr             r3,     4                                       ; iPicHeight/16
+  shl             r13,    4                                                       ; iPicStride*16
+  pxor    xmm0,   xmm0
+  pxor    xmm8,   xmm8     ; xmm8 accumulates iFrameSad; its low dword is stored through psadframe at the end
+  pxor    xmm9,   xmm9     ; xmm9 = scratch; NOTE(review): xmm6-xmm9 are callee-saved in the Microsoft x64 ABI and are not preserved here - confirm for WIN64
+
+
 sqdiff_bgd_height_loop:
-	mov		ecx,	dword [iPicWidth]
-	mov		[tmp_esi],	esi
-	mov		[tmp_edi],	edi
+  mov             r10,    r0
+  mov             r11,    r1
+  push r2
+%assign push_num push_num+1
 sqdiff_bgd_width_loop:
-	pxor	xmm7,	xmm7		; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
-	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-	pxor	xmm5,	xmm5		; pMad8x8
-	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
 
-	mov		edx,		[psad8x8]
-	movdqa	xmm2,		xmm7
-	pshufd	xmm1,		xmm2,		00001110b
-	movd	[edx],		xmm2
-	movd	[edx+4],	xmm1
-	add		edx,		8
-	mov		[psad8x8],	edx			; sad8x8
+  pxor    xmm7,   xmm7            ; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
+  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+  pxor    xmm5,   xmm5            ; pMad8x8
+  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
 
-	paddd	xmm1,				xmm2
-	movd	edx,				xmm1
-	add		[tmp_sadframe],		edx			; iFrameSad
+  mov             r14,            psad8x8
+  movdqa  xmm2,           xmm7
+  pshufd  xmm1,           xmm2,           00001110b
+  movd    [r14],          xmm2
+  movd    [r14+4],        xmm1
+  add             r14,            8
+  mov             psad8x8,        r14                     ; sad8x8
 
-	mov		edx,		[psum16x16]
-	movdqa	xmm1,		xmm6
-	pshufd	xmm2,		xmm1,		00001110b
-	paddd	xmm1,		xmm2
-	movd	[edx],		xmm1				; sum
+  paddd   xmm1,                           xmm2
+  movd    r14d,                           xmm1
+  movd    xmm9,r14d
+  paddd           xmm8,           xmm9                    ; iFrameSad
 
-	mov		edx,		[p_sd8x8]
-	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
-	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
-	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
-	movq	[edx],		xmm1
-	add		edx,		8
-	mov		[p_sd8x8],	edx
+  mov             r14,            psum16x16
+  movdqa  xmm1,           xmm6
+  pshufd  xmm2,           xmm1,           00001110b
+  paddd   xmm1,           xmm2
+  movd    [r14],          xmm1                            ; sum
 
-	mov			edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm5
-	;movdqa		xmm1,	xmm5
-	;punpcklbw	xmm1,	xmm0
-	;punpcklwd	xmm1,	xmm0
-	;movd		[edx],	xmm1
-	;punpckhbw	xmm5,	xmm0
-	;punpcklwd	xmm5,	xmm0
-	;movd		[edx+4],	xmm5
-	;add			edx,		8
-	;mov			[p_mad8x8],	edx
-	mov			[tmp_ecx],	ecx
-	movhlps		xmm1,	xmm5
-	movd		ecx,	xmm5
-	mov			[edx],	cl
-	movd		ecx,	xmm1
-	mov			[edx+1],cl
-	add			edx,	2
-	mov			[p_mad8x8],	edx
+  mov             r14,            p_sd8x8
+  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
+  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
+  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
+  movq    [r14],          xmm1
+  add             r14,            8
+  mov             p_sd8x8,        r14
 
-	psrlq	xmm7,	32
-	psllq	xmm7,	32			; clear sad
-	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-	pxor	xmm5,	xmm5		; pMad8x8
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+  mov                     r14,            p_mad8x8
+  WELS_MAX_REG_SSE2       xmm5
 
-	mov		edx,		[psad8x8]
-	movdqa	xmm2,		xmm7
-	pshufd	xmm1,		xmm2,		00001110b
-	movd	[edx],		xmm2
-	movd	[edx+4],	xmm1
-	add		edx,		8
-	mov		[psad8x8],	edx			; sad8x8
+  movhlps         xmm1,   xmm5            ; xmm1 low qword = xmm5 high qword
+  push r0                                 ; r0 is still a live pointer passed to the macros here, so save it before using it as scratch
+  movd            r0d,    xmm5
+  mov                     [r14],  r0b     ; byte 0 at r14 = low byte of xmm5's low half
+  movd            r0d,    xmm1
+  mov                     [r14+1],r0b     ; byte 1 at r14 = low byte of xmm5's high half
+  pop r0                                  ; push_num was not adjusted for this push: no rsp-relative argument is touched between push and pop
+  add                     r14,    2
+  mov                     p_mad8x8,       r14     ; write the advanced pointer back to the p_mad8x8 argument slot
 
-	paddd	xmm1,				xmm2
-	movd	edx,				xmm1
-	add		[tmp_sadframe],		edx			; iFrameSad
+  psrlq   xmm7,   32
+  psllq   xmm7,   32                      ; clear sad
+  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+  pxor    xmm5,   xmm5            ; pMad8x8
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
 
-	mov		edx,			[psum16x16]
-	movdqa	xmm1,			xmm6
-	pshufd	xmm2,			xmm1,		00001110b
-	paddd	xmm1,			xmm2
-	movd	ebp,			xmm1				; sum
-	add		[edx],			ebp
-	add		edx,			4
-	mov		[psum16x16],	edx
+  mov             r14,            psad8x8
+  movdqa  xmm2,           xmm7
+  pshufd  xmm1,           xmm2,           00001110b
+  movd    [r14],          xmm2
+  movd    [r14+4],        xmm1
+  add             r14,            8
+  mov             psad8x8,        r14                     ; sad8x8
 
-	mov		edx,			[psqsum16x16]
-	psrlq	xmm7,			32
-	pshufd	xmm2,			xmm7,		00001110b
-	paddd	xmm2,			xmm7
-	movd	[edx],			xmm2				; sqsum
-	add		edx,			4
-	mov		[psqsum16x16],	edx
+  paddd   xmm1,                           xmm2
+  movd    r14d,                           xmm1
+  movd    xmm9, r14d
+  paddd   xmm8,           xmm9            ; iFrameSad
 
-	mov		edx,		[p_sd8x8]
-	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
-	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
-	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
-	movq	[edx],		xmm1
-	add		edx,		8
-	mov		[p_sd8x8],	edx
+  mov             r14,                    psum16x16
+  movdqa  xmm1,                   xmm6
+  pshufd  xmm2,                   xmm1,           00001110b
+  paddd   xmm1,                   xmm2
+  movd    r15d,                   xmm1                            ; sum
+  add             [r14],                  r15d
+  add             r14,                    4
+  mov             psum16x16,      r14
 
-	mov		edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm5
-	;movdqa		xmm1,	xmm5
-	;punpcklbw	xmm1,	xmm0
-	;punpcklwd	xmm1,	xmm0
-	;movd		[edx],	xmm1
-	;punpckhbw	xmm5,	xmm0
-	;punpcklwd	xmm5,	xmm0
-	;movd		[edx+4],	xmm5
-	;add			edx,		8
-	;mov			[p_mad8x8],	edx
-	movhlps		xmm1,	xmm5
-	movd		ecx,	xmm5
-	mov			[edx],	cl
-	movd		ecx,	xmm1
-	mov			[edx+1],cl
-	add			edx,	2
-	mov			[p_mad8x8],	edx
+  mov             r14,                    psqsum16x16
+  psrlq   xmm7,                   32
+  pshufd  xmm2,                   xmm7,           00001110b
+  paddd   xmm2,                   xmm7
+  movd    [r14],                  xmm2                            ; sqsum
+  add             r14,                    4
+  mov             psqsum16x16,    r14
 
-	mov		edx,		[psqdiff16x16]
-	pshufd	xmm1,		xmm4,		00001110b
-	paddd	xmm4,		xmm1
-	pshufd	xmm1,		xmm4,		00000001b
-	paddd	xmm4,		xmm1
-	movd	[edx],		xmm4
-	add		edx,		4
-	mov		[psqdiff16x16],	edx
+  mov             r14,            p_sd8x8
+  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
+  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
+  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
+  movq    [r14],          xmm1
+  add             r14,            8
+  mov             p_sd8x8,        r14
 
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
+  mov             r14,            p_mad8x8
+  WELS_MAX_REG_SSE2       xmm5
 
-	mov		ecx,	[tmp_ecx]
-	dec		ecx
-	jnz		sqdiff_bgd_width_loop
 
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax
-	add		edi,	eax
+  movhlps         xmm1,   xmm5
+  push r0
+  movd            r0d,    xmm5
+  mov                     [r14],  r0b
+  movd            r0d,    xmm1
+  mov                     [r14+1],r0b
+  pop r0
+  add                     r14,    2
+  mov                     p_mad8x8,       r14
 
-	dec	dword [iPicHeight]
-	jnz		sqdiff_bgd_height_loop
+  mov             r14,            psqdiff16x16
+  pshufd  xmm1,           xmm4,           00001110b
+  paddd   xmm4,           xmm1
+  pshufd  xmm1,           xmm4,           00000001b
+  paddd   xmm4,           xmm1
+  movd    [r14],          xmm4
+  add             r14,            4
+  mov             psqdiff16x16,   r14
 
-	mov		edx,	[psadframe]
-	mov		ebp,	[tmp_sadframe]
-	mov		[edx],	ebp
+  add             r14,    16              ; NOTE(review): result looks unused - r14 is reloaded (psad8x8 / psadframe) before its next visible read
+  sub             r0,     r13             ; step both pointers back by iPicStride*16 (r13); assumes the 16 macro invocations advanced them by that much - confirm
+  sub             r1,     r13
+  add             r0,     16              ; then move 16 bytes to the right
+  add             r1,     16
 
-	add		esp,	localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		psum16x16
-%undef		psqsum16x16
-%undef		psqdiff16x16
-%undef		p_sd8x8
-%undef		p_mad8x8
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		pushsize
-%undef		localsize
-	ret
+  dec             r2
+  jnz             sqdiff_bgd_width_loop
+  pop r2
+  %assign push_num push_num-1
+  mov             r0,     r10
+  mov             r1,     r11
+  add             r0,     r13
+  add             r1,     r13
+
+  dec     r3
+  jnz             sqdiff_bgd_height_loop
+
+  mov             r14,    psadframe
+  movd    [r14],  xmm8
+
+  pop r15
+  pop r14
+  pop r13
+  pop r12
+%assign push_num 0
+%undef          cur_data
+%undef          ref_data
+%undef          iPicWidth
+%undef          iPicHeight
+%undef          iPicStride
+%undef          psadframe
+%undef          psad8x8
+%undef          psum16x16
+%undef          psqsum16x16
+%undef          psqdiff16x16
+%undef          p_sd8x8
+%undef          p_mad8x8
+%undef          tmp_esi
+%undef          tmp_edi
+%undef          pushsize
+%undef          localsize
+  ret
 %endif
--- a/codec/processing/src/common/cpu.cpp
+++ /dev/null
@@ -1,196 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	cpu.c
- *
- * \brief	CPU compatibility detection
- *
- * \date	04/29/2009 Created
- *
- *************************************************************************************
- */
-
-#include "util.h"
-#include "cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-#define    CPU_Vender_AMD    "AuthenticAMD"
-#define    CPU_Vender_INTEL  "GenuineIntel"
-#define    CPU_Vender_CYRIX  "CyrixInstead"
-
-
-#if defined(X86_ASM)
-
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
-  uint32_t uiCPU = 0;
-  uint32_t uiFeatureA = 0, uiFeatureB = 0, uiFeatureC = 0, uiFeatureD = 0;
-  int32_t  CacheLineSize = 0;
-  int8_t   chVenderName[16] = { 0 };
-
-  if (!WelsCPUIdVerify()) {
-    /* cpuid is not supported in cpu */
-    return 0;
-  }
-
-  WelsCPUId (0, &uiFeatureA, (uint32_t*)&chVenderName[0], (uint32_t*)&chVenderName[8], (uint32_t*)&chVenderName[4]);
-  if (uiFeatureA == 0) {
-    /* maximum input value for basic cpuid information */
-    return 0;
-  }
-
-  WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-  if ((uiFeatureD & 0x00800000) == 0) {
-    /* Basic MMX technology is not support in cpu, mean nothing for us so return here */
-    return 0;
-  }
-
-  uiCPU = WELS_CPU_MMX;
-  if (uiFeatureD & 0x02000000) {
-    /* SSE technology is identical to AMD MMX extensions */
-    uiCPU |= WELS_CPU_MMXEXT | WELS_CPU_SSE;
-  }
-  if (uiFeatureD & 0x04000000) {
-    /* SSE2 support here */
-    uiCPU |= WELS_CPU_SSE2;
-  }
-  if (uiFeatureD & 0x00000001) {
-    /* x87 FPU on-chip checking */
-    uiCPU |= WELS_CPU_FPU;
-  }
-  if (uiFeatureD & 0x00008000) {
-    /* CMOV instruction checking */
-    uiCPU |= WELS_CPU_CMOV;
-  }
-  if (!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL)) {	// confirmed_safe_unsafe_usage
-    if (uiFeatureD & 0x10000000) {
-      /* Multi-Threading checking: contains of multiple logic processors */
-      uiCPU |= WELS_CPU_HTT;
-    }
-  }
-
-  if (uiFeatureC & 0x00000001) {
-    /* SSE3 support here */
-    uiCPU |= WELS_CPU_SSE3;
-  }
-  if (uiFeatureC & 0x00000200) {
-    /* SSSE3 support here */
-    uiCPU |= WELS_CPU_SSSE3;
-  }
-  if (uiFeatureC & 0x00080000) {
-    /* SSE4.1 support here, 45nm Penryn processor */
-    uiCPU |= WELS_CPU_SSE41;
-  }
-  if (uiFeatureC & 0x00100000) {
-    /* SSE4.2 support here, next generation Nehalem processor */
-    uiCPU |= WELS_CPU_SSE42;
-  }
-  if (WelsCPUSupportAVX (uiFeatureA, uiFeatureC)) {	//
-    /* AVX supported */
-    uiCPU |= WELS_CPU_AVX;
-  }
-  if (WelsCPUSupportFMA (uiFeatureA, uiFeatureC)) {	//
-    /* AVX FMA supported */
-    uiCPU |= WELS_CPU_FMA;
-  }
-  if (uiFeatureC & 0x02000000) {
-    /* AES checking */
-    uiCPU |= WELS_CPU_AES;
-  }
-  if (uiFeatureC & 0x00400000) {
-    /* MOVBE checking */
-    uiCPU |= WELS_CPU_MOVBE;
-  }
-
-  if (pNumberOfLogicProcessors != NULL) {
-    // HTT enabled on chip
-    *pNumberOfLogicProcessors = (uiFeatureB & 0x00ff0000) >> 16; // feature bits: 23-16 on returned EBX
-  }
-
-  WelsCPUId (0x80000000, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-
-  if ((!strcmp ((const str_t*)chVenderName, CPU_Vender_AMD))
-      && (uiFeatureA >= 0x80000001)) {	// confirmed_safe_unsafe_usage
-    WelsCPUId (0x80000001, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-    if (uiFeatureD & 0x00400000) {
-      uiCPU |= WELS_CPU_MMXEXT;
-    }
-    if (uiFeatureD & 0x80000000) {
-      uiCPU |= WELS_CPU_3DNOW;
-    }
-  }
-
-  if (!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL)) {	// confirmed_safe_unsafe_usage
-    int32_t  family, model;
-
-    WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-    family = ((uiFeatureA >> 8) & 0xf) + ((uiFeatureA >> 20) & 0xff);
-    model  = ((uiFeatureA >> 4) & 0xf) + ((uiFeatureA >> 12) & 0xf0);
-
-    if ((family == 6) && (model == 9 || model == 13 || model == 14)) {
-      uiCPU &= ~ (WELS_CPU_SSE2 | WELS_CPU_SSE3);
-    }
-  }
-
-  // get cache line size
-  if ((!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL))
-      || ! (strcmp ((const str_t*)chVenderName, CPU_Vender_CYRIX))) {	// confirmed_safe_unsafe_usage
-    WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-
-    CacheLineSize = (uiFeatureB & 0xff00) >>
-                    5;	// ((clflush_line_size >> 8) << 3), CLFLUSH_line_size * 8 = CacheLineSize_in_byte
-
-    if (CacheLineSize == 128) {
-      uiCPU |= WELS_CPU_CACHELINE_128;
-    } else if (CacheLineSize == 64) {
-      uiCPU |= WELS_CPU_CACHELINE_64;
-    } else if (CacheLineSize == 32) {
-      uiCPU |= WELS_CPU_CACHELINE_32;
-    } else if (CacheLineSize == 16) {
-      uiCPU |= WELS_CPU_CACHELINE_16;
-    }
-  }
-
-  return uiCPU;
-}
-
-
-void WelsCPURestore (const uint32_t kuiCPU) {
-  if (kuiCPU & (WELS_CPU_MMX | WELS_CPU_MMXEXT | WELS_CPU_3DNOW | WELS_CPU_3DNOWEXT)) {
-    WelsEmms();
-  }
-}
-
-#endif
-
-
-WELSVP_NAMESPACE_END
-
-
--- a/codec/processing/src/common/cpu.h
+++ /dev/null
@@ -1,102 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- * \file	cpu.h
- *
- * \brief	CPU feature compatibility detection
- *
- * \date	04/29/2009 Created
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_CPU_H
-#define WELSVP_CPU_H
-
-#include "typedef.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-/*
- *	WELS CPU feature flags
- */
-#define WELS_CPU_MMX        0x00000001    /* mmx */
-#define WELS_CPU_MMXEXT     0x00000002    /* mmx-ext*/
-#define WELS_CPU_SSE        0x00000004    /* sse */
-#define WELS_CPU_SSE2       0x00000008    /* sse 2 */
-#define WELS_CPU_SSE3       0x00000010    /* sse 3 */
-#define WELS_CPU_SSE41      0x00000020    /* sse 4.1 */
-#define WELS_CPU_3DNOW      0x00000040    /* 3dnow! */
-#define WELS_CPU_3DNOWEXT   0x00000080    /* 3dnow! ext */
-#define WELS_CPU_ALTIVEC    0x00000100    /* altivec */
-#define WELS_CPU_SSSE3      0x00000200    /* ssse3 */
-#define WELS_CPU_SSE42      0x00000400    /* sse 4.2 */
-
-/* CPU features application extensive */
-#define WELS_CPU_AVX		0x00000800	/* Advanced Vector eXtentions */
-#define WELS_CPU_FPU		0x00001000	/* x87-FPU on chip */
-#define WELS_CPU_HTT		0x00002000	/* Hyper-Threading Technology (HTT), Multi-threading enabled feature:
-										   physical processor package is capable of supporting more than one logic processor
-										*/
-#define WELS_CPU_CMOV		0x00004000	/* Conditional Move Instructions,
-										   also if x87-FPU is present at indicated by the CPUID.FPU feature bit, then FCOMI and FCMOV are supported
-										*/
-#define WELS_CPU_MOVBE		0x00008000	/* MOVBE instruction */
-#define WELS_CPU_AES		0x00010000	/* AES instruction extensions */
-#define WELS_CPU_FMA		0x00020000	/* AVX VEX FMA instruction sets */
-
-#define WELS_CPU_CACHELINE_16    0x10000000    /* CacheLine Size 16 */
-#define WELS_CPU_CACHELINE_32    0x20000000    /* CacheLine Size 32 */
-#define WELS_CPU_CACHELINE_64    0x40000000    /* CacheLine Size 64 */
-#define WELS_CPU_CACHELINE_128   0x80000000    /* CacheLine Size 128 */
-
-/*
- *	Interfaces for CPU core feature detection as below
- */
-
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-
-int32_t WelsCPUIdVerify();
-
-void  WelsCPUId (uint32_t uiIndex, uint32_t* pFeatureA, uint32_t* pFeatureB, uint32_t* pFeatureC, uint32_t* pFeatureD);
-int32_t WelsCPUSupportAVX (uint32_t eax, uint32_t ecx);
-int32_t WelsCPUSupportFMA (uint32_t eax, uint32_t ecx);
-
-void  WelsEmms();
-
-WELSVP_EXTERN_C_END
-#endif
-
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors);
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/codec/processing/src/vaacalc/vaacalculation.cpp
+++ b/codec/processing/src/vaacalc/vaacalculation.cpp
@@ -58,11 +58,11 @@
   sVaaFuncs.pfVAACalcSadVar			= VAACalcSadVar_c;
 #ifdef X86_ASM
   if ((iCpuFlag & WELS_CPU_SSE2) == WELS_CPU_SSE2) {
-    /* sVaaFuncs.pfVAACalcSad			= VAACalcSad_sse2;
-     sVaaFuncs.pfVAACalcSadBgd		= VAACalcSadBgd_sse2;
-     sVaaFuncs.pfVAACalcSadSsd		= VAACalcSadSsd_sse2;
-     sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_sse2;
-     sVaaFuncs.pfVAACalcSadVar		= VAACalcSadVar_sse2;*/
+    sVaaFuncs.pfVAACalcSad			= VAACalcSad_sse2;
+    sVaaFuncs.pfVAACalcSadBgd		= VAACalcSadBgd_sse2;
+    sVaaFuncs.pfVAACalcSadSsd		= VAACalcSadSsd_sse2;
+    sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_sse2;
+    sVaaFuncs.pfVAACalcSadVar		= VAACalcSadVar_sse2;
   }
 #endif//X86_ASM
 }
--- a/codec/processing/targets.mk
+++ b/codec/processing/targets.mk
@@ -3,7 +3,6 @@
 PROCESSING_CPP_SRCS=\
 	$(PROCESSING_SRCDIR)/./src/adaptivequantization/AdaptiveQuantization.cpp\
 	$(PROCESSING_SRCDIR)/./src/backgounddetection/BackgroundDetection.cpp\
-	$(PROCESSING_SRCDIR)/./src/common/cpu.cpp\
 	$(PROCESSING_SRCDIR)/./src/common/memory.cpp\
 	$(PROCESSING_SRCDIR)/./src/common/thread.cpp\
 	$(PROCESSING_SRCDIR)/./src/common/util.cpp\