shithub: openh264

ref: 501c77f66b73f207c7b0776aba8991b06861cc78
parent: 94cabe10d54021c8269d51ba3fa5d88c4a0607fe
parent: 90fc914b6c7193223a7705ac5ca36e2558321751
author: volvet <qizh@cisco.com>
date: Mon Apr 7 17:47:23 EDT 2014

Merge pull request #637 from zhilwang/ruby-merge

Ruby merge

--- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
@@ -81,6 +81,7 @@
 		4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = "<group>"; };
 		4C34066A18C57D0400DFA14A /* pixel_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_neon.S; sourceTree = "<group>"; };
 		4C34066B18C57D0400DFA14A /* reconstruct_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = reconstruct_neon.S; sourceTree = "<group>"; };
+		4CDBFB9D18E5068D0025A767 /* wels_transpose_matrix.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_transpose_matrix.h; sourceTree = "<group>"; };
 		4CE4431118B6FFA00017DF25 /* libwelsenc.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libwelsenc.a; sourceTree = BUILT_PRODUCTS_DIR; };
 		4CE4431418B6FFA00017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
 		4CE4432118B6FFA00017DF25 /* welsencTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = welsencTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
@@ -281,6 +282,7 @@
 		4CE446A918BC605C0017DF25 /* inc */ = {
 			isa = PBXGroup;
 			children = (
+				4CDBFB9D18E5068D0025A767 /* wels_transpose_matrix.h */,
 				4CE446AA18BC605C0017DF25 /* as264_common.h */,
 				4CE446AB18BC605C0017DF25 /* au_set.h */,
 				4CE446AC18BC605C0017DF25 /* bit_stream.h */,
--- a/codec/encoder/core/inc/svc_motion_estimate.h
+++ b/codec/encoder/core/inc/svc_motion_estimate.h
@@ -199,11 +199,24 @@
                         const int32_t kiEncStride, const int32_t kiRefStride,
                         const int32_t kiMinPos, const int32_t kiMaxPos,
                         const bool bVerticalSearch );
+#ifdef X86_ASM
+extern "C"
+{
+uint32_t SampleSad8x8Hor8_sse41 (uint8_t*, int32_t, uint8_t*, int32_t, uint16_t*, int32_t*);
+uint32_t SampleSad16x16Hor8_sse41 (uint8_t*, int32_t, uint8_t*, int32_t, uint16_t*, int32_t*);
+}
+
 void VerticalFullSearchUsingSSE41( void *pFunc, void *vpMe,
                             uint16_t* pMvdTable, const int32_t kiFixedMvd,
                             const int32_t kiEncStride, const int32_t kiRefStride,
                           const int32_t kiMinPos, const int32_t kiMaxPos,
                           const bool bVerticalSearch );
+void HorizontalFullSearchUsingSSE41( void *pFunc, void *vpMe,
+                                      uint16_t* pMvdTable, const int32_t kiFixedMvd,
+                                      const int32_t kiEncStride, const int32_t kiRefStride,
+                                      const int32_t kiMinPos, const int32_t kiMaxPos,
+                                      const bool bVerticalSearch );
+#endif
 void WelsMotionCrossSearch(SWelsFuncPtrList *pFuncList,  SDqLayer* pCurLayer, SWelsME * pMe, const SSlice* pSlice);
 
 // Feature Search Basics
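Note: the two SampleSad*Hor8_sse41 routines declared above evaluate eight horizontally adjacent candidate positions per call; each returns the minimum of (block SAD + per-position MVD cost) over the eight offsets and writes the winning offset index through the last argument. A minimal scalar sketch of that contract for the 8x8 case (hypothetical reference code, not part of the patch):

#include <stdint.h>
#include <stdlib.h>

// Hypothetical scalar model of the SampleSad8x8Hor8 contract: try 8 horizontal
// offsets, add the precomputed MVD cost for each, return the minimum and its index.
static uint32_t SampleSad8x8Hor8_ref (uint8_t* pEnc, int32_t iEncStride,
                                      uint8_t* pRef, int32_t iRefStride,
                                      uint16_t* pBaseCost, int32_t* pIndexMinCost) {
  uint32_t uiBest = 0xFFFFFFFF;
  for (int32_t iOffset = 0; iOffset < 8; ++iOffset) {
    uint32_t uiCost = pBaseCost[iOffset];                 // per-position MVD cost
    for (int32_t y = 0; y < 8; ++y)
      for (int32_t x = 0; x < 8; ++x)
        uiCost += (uint32_t) abs (pEnc[y * iEncStride + x] - pRef[y * iRefStride + x + iOffset]);
    if (uiCost < uiBest) {
      uiBest = uiCost;
      *pIndexMinCost = iOffset;                           // winning offset index
    }
  }
  return uiBest;
}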
--- a/codec/encoder/core/inc/wels_const.h
+++ b/codec/encoder/core/inc/wels_const.h
@@ -87,6 +87,7 @@
 #define PARA_SET_TYPE_SUBSETSPS	1
 #define PARA_SET_TYPE_PPS		2
 
+#define MAX_VERTICAL_MV_RANGE   1024  //TODO: make sure enough memory is allocated for the transpose
 #define MAX_FRAME_RATE			30	// maximal frame rate to support
 #define MIN_FRAME_RATE			1	// minimal frame rate need support
 
--- a/codec/encoder/core/inc/wels_func_ptr_def.h
+++ b/codec/encoder/core/inc/wels_func_ptr_def.h
@@ -134,6 +134,7 @@
 typedef int32_t (*PIntraPred8x8Combined3Func) (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*,
     uint8_t*, uint8_t*);
 
+typedef uint32_t (*PSampleSadHor8Func)( uint8_t*, int32_t, uint8_t*, int32_t, uint16_t*, int32_t* );
 typedef void (*PMotionSearchFunc) (SWelsFuncPtrList* pFuncList, void* pCurDqLayer, void* pMe,
                                    void* pSlice);
 typedef void (*PSearchMethodFunc) (SWelsFuncPtrList* pFuncList, void* pMe, void* pSlice, const int32_t kiEncStride, const int32_t kiRefStride);
@@ -202,14 +203,16 @@
   PGetIntraPredFunc     pfGetLumaI4x4Pred[I4_PRED_A];
   PGetIntraPredFunc     pfGetChromaPred[C_PRED_A];
 
+  PSampleSadHor8Func	pfSampleSadHor8[2];	// 1: for 16x16 square; 0: for 8x8 square
   PMotionSearchFunc
   pfMotionSearch[BLOCK_STATIC_IDC_ALL]; //svc_encode_slice.c svc_mode_decision.c svc_enhance_layer_md.c svc_base_layer_md.c
   PSearchMethodFunc pfSearchMethod[BLOCK_SIZE_ALL];
   PCalculateSatdFunc pfCalculateSatd;
   PCheckDirectionalMv pfCheckDirectionalMv;
-  PLineFullSearchFunc pfLineFullSearch;
   PCalculateBlockFeatureOfFrame pfCalculateBlockFeatureOfFrame[2];//0 - for 8x8, 1 for 16x16
   PCalculateSingleBlockFeature pfCalculateSingleBlockFeature[2];//0 - for 8x8, 1 for 16x16
+  PLineFullSearchFunc pfVerticalFullSearch;
+  PLineFullSearchFunc pfHorizontalFullSearch;
 
   PCopyFunc      pfCopy16x16Aligned;    //svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c
   PCopyFunc      pfCopy16x16NotAligned;  //md.c
--- /dev/null
+++ b/codec/encoder/core/inc/wels_transpose_matrix.h
@@ -1,0 +1,54 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef WELS_RUBY_ENCODER_TRANSPOSE_MATRIX_H__
+#define WELS_RUBY_ENCODER_TRANSPOSE_MATRIX_H__
+
+#include "typedefs.h"
+
+namespace WelsSVCEnc {
+
+#ifdef X86_ASM
+extern "C"
+{
+void TransposeMatrixBlocksx16_sse2( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride, const int32_t kiBlocksNum );
+void TransposeMatrixBlock16x16_sse2( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride );
+void TransposeMatrixBlocksx8_mmx( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride, const int32_t kiBlocksNum );
+void TransposeMatrixBlock8x8_mmx( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride );
+}
+#endif
+
+typedef void (*PTransposeMatrixBlockFunc)( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride );
+typedef void (*PTransposeMatrixBlocksFunc)( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride, const int32_t kiBlocksNum );
+
+}// end of namespace declaration
+
+#endif//WELS_RUBY_ENCODER_TRANSPOSE_MATRIX_H__
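Note: the transpose helpers exist so that the vertical full search can reuse the same eight-offsets-at-a-time SAD kernel: once the encode block and the reference column are transposed into row-major buffers, stepping down the reference column becomes stepping right along a row. A plain scalar sketch of what the SSE2/MMX routines compute (hypothetical reference code; the real routines are fixed to 8x8/16x16 tiles):

#include <stdint.h>

// Hypothetical scalar model of the block transpose declared above:
// write the transpose of a kiSize x kiSize block, i.e. dst[x][y] = src[y][x].
static void TransposeMatrixBlock_ref (uint8_t* pDst, const int32_t kiDstStride,
                                      const uint8_t* pSrc, const int32_t kiSrcStride,
                                      const int32_t kiSize) {
  for (int32_t y = 0; y < kiSize; ++y)
    for (int32_t x = 0; x < kiSize; ++x)
      pDst[x * kiDstStride + y] = pSrc[y * kiSrcStride + x];
}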
--- a/codec/encoder/core/src/svc_motion_estimate.cpp
+++ b/codec/encoder/core/src/svc_motion_estimate.cpp
@@ -41,6 +41,7 @@
 #include "cpu_core.h"
 #include "ls_defines.h"
 #include "svc_motion_estimate.h"
+#include "wels_transpose_matrix.h"
 
 namespace WelsSVCEnc {
 
@@ -65,8 +66,14 @@
     pFuncList->pfCheckDirectionalMv = CheckDirectionalMv;
 
     //for cross search
-    pFuncList->pfLineFullSearch = LineFullSearch_c;
+    pFuncList->pfVerticalFullSearch = LineFullSearch_c;
+    pFuncList->pfHorizontalFullSearch = LineFullSearch_c;
+#if defined (X86_ASM)
     if ( uiCpuFlag & WELS_CPU_SSE41 ) {
+      pFuncList->pfSampleSadHor8[0] = SampleSad8x8Hor8_sse41;
+      pFuncList->pfSampleSadHor8[1] = SampleSad16x16Hor8_sse41;
+      pFuncList->pfVerticalFullSearch = VerticalFullSearchUsingSSE41;
+      pFuncList->pfHorizontalFullSearch = HorizontalFullSearchUsingSSE41;
     }
 
     //for feature search
@@ -75,6 +82,7 @@
     //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
     pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_c;
     pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_c;
+#endif
   }
 }
 
@@ -302,6 +310,17 @@
 /////////////////////////
 // Cross Search Basics
 /////////////////////////
+#if defined (X86_ASM)
+void CalcMvdCostx8_c( uint16_t *pMvdCost, const int32_t kiStartMv, uint16_t* pMvdTable, const uint16_t kiFixedCost )
+{
+  uint16_t *pBaseCost		= pMvdCost;
+  const int32_t kiOffset	= (kiStartMv<<2);
+  uint16_t *pMvd		= pMvdTable+kiOffset;
+  for (int32_t i = 0; i < 8; ++ i) {
+    pBaseCost[i] = ((*pMvd) + kiFixedCost);
+    pMvd += 4;
+  }
+}
 void VerticalFullSearchUsingSSE41( void *pFunc, void *vpMe,
                             uint16_t* pMvdTable, const int32_t kiFixedMvd,
                             const int32_t kiEncStride, const int32_t kiRefStride,
@@ -308,12 +327,130 @@
                           const int32_t kiMinPos, const int32_t kiMaxPos,
                           const bool bVerticalSearch ) {
   SWelsFuncPtrList *pFuncList      = static_cast<SWelsFuncPtrList *>(pFunc);
-  SWelsME *pMe                            = static_cast<SWelsME *>(vpMe);
+  SWelsME *pMe				                    = static_cast<SWelsME *>(vpMe);
+  uint8_t*  kpEncMb	= pMe->pEncMb;
+  const int32_t kiCurMeBlockPix	= pMe->iCurMeBlockPixY;
+  uint8_t* pRef			      = &pMe->pColoRefMb[(kiMinPos - kiCurMeBlockPix)*kiRefStride];
+  const int32_t kIsBlock16x16	= pMe->uiBlockSize == BLOCK_16x16;
+  const int32_t kiEdgeBlocks	= kIsBlock16x16 ? 16 : 8;
+  PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16];
+  PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
+  PTransposeMatrixBlockFunc	TransposeMatrixBlock = kIsBlock16x16 ? TransposeMatrixBlock16x16_sse2 : TransposeMatrixBlock8x8_mmx;
+  PTransposeMatrixBlocksFunc	TransposeMatrixBlocks= kIsBlock16x16 ? TransposeMatrixBlocksx16_sse2 : TransposeMatrixBlocksx8_mmx;
+
+  const int32_t kiDiff			= kiMaxPos - kiMinPos;
+  const int32_t kiRowNum		= WELS_ALIGN((kiDiff - kiEdgeBlocks + 1), kiEdgeBlocks);
+  const int32_t kiBlocksNum		= kIsBlock16x16 ? (kiRowNum>>4) : (kiRowNum>>3);
+  int32_t iCountLoop8		= (kiRowNum-kiEdgeBlocks) >> 3;
+  const int32_t kiRemainingVectors		= kiDiff - (iCountLoop8<<3);
+  const int32_t kiMatrixStride		= MAX_VERTICAL_MV_RANGE;
+  ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixRef, 16, kiMatrixStride, 16 );	// transpose matrix result for ref
+  ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixEnc, 16, 16, 16 );				// transpose matrix result for enc
+  assert(kiRowNum <= kiMatrixStride);	// make sure the stack buffer covers the search range
+
+  TransposeMatrixBlock( &uiMatrixEnc[0][0], 16, kpEncMb, kiEncStride );
+  TransposeMatrixBlocks( &uiMatrixRef[0][0], kiMatrixStride, pRef, kiRefStride, kiBlocksNum );
+  ENFORCE_STACK_ALIGN_1D( uint16_t, uiBaseCost, 8, 16 );
+  int32_t iTargetPos			= kiMinPos;
+  int16_t iBestPos				= pMe->sMv.iMvX;
+  uint32_t uiBestCost			= pMe->uiSadCost;
+  uint32_t uiCostMin;
+  int32_t iIndexMinPos;
+  kpEncMb	= &uiMatrixEnc[0][0];
+  pRef	= &uiMatrixRef[0][0];
+
+  while(iCountLoop8 > 0) {
+    CalcMvdCostx8_c(uiBaseCost, iTargetPos, pMvdTable, kiFixedMvd);
+    uiCostMin = pSampleSadHor8( kpEncMb, 16, pRef, kiMatrixStride, uiBaseCost, &iIndexMinPos );
+    if (uiCostMin < uiBestCost) {
+      uiBestCost	= uiCostMin;
+      iBestPos		= iTargetPos+iIndexMinPos;
+    }
+    iTargetPos	+= 8;
+    pRef += 8;
+    -- iCountLoop8;
+  }
+  if (kiRemainingVectors > 0) {
+    kpEncMb	= pMe->pEncMb;
+    pRef	= &pMe->pColoRefMb[(iTargetPos - kiCurMeBlockPix)*kiRefStride];
+    while (iTargetPos < kiMaxPos) {
+      const uint16_t pMvdCost	= pMvdTable[iTargetPos<<2];
+      uint32_t uiSadCost	= pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + pMvdCost);
+      if (uiSadCost < uiBestCost) {
+        uiBestCost	= uiSadCost;
+        iBestPos	= iTargetPos;
+      }
+      pRef += kiRefStride;
+      ++iTargetPos;
+    }
+  }
+  if (uiBestCost < pMe->uiSadCost) {
+    SMVUnitXY sBestMv;
+    sBestMv.iMvX = 0;
+    sBestMv.iMvY = iBestPos - kiCurMeBlockPix;
+    UpdateMeResults( sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvY*kiRefStride], pMe );
+  }
 }
-void LineFullSearch_c(  void *pFunc, void *vpMe,
-                          uint16_t* pMvdTable, const int32_t kiFixedMvd,
-                          const int32_t kiEncStride, const int32_t kiRefStride,
-                          const int32_t kiMinPos, const int32_t kiMaxPos,
+
+void HorizontalFullSearchUsingSSE41( void *pFunc, void *vpMe,
+                                      uint16_t* pMvdTable, const int32_t kiFixedMvd,
+                                      const int32_t kiEncStride, const int32_t kiRefStride,
+                                      const int32_t kiMinPos, const int32_t kiMaxPos,
+                                      const bool bVerticalSearch )
+{
+  SWelsFuncPtrList *pFuncList      = static_cast<SWelsFuncPtrList *>(pFunc);
+  SWelsME *pMe				                    = static_cast<SWelsME *>(vpMe);
+  uint8_t *kpEncMb	= pMe->pEncMb;
+  const int32_t kiCurMeBlockPix	= pMe->iCurMeBlockPixX;
+  uint8_t *pRef			      = &pMe->pColoRefMb[kiMinPos - kiCurMeBlockPix];
+  const int32_t kIsBlock16x16	= pMe->uiBlockSize == BLOCK_16x16;
+  PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16];
+  PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
+  ENFORCE_STACK_ALIGN_1D( uint16_t, uiBaseCost, 8, 16 );
+  const int32_t kiNumVector	= kiMaxPos - kiMinPos;
+  int32_t iCountLoop8	= kiNumVector >> 3;
+  const int32_t kiRemainingLoop8	= kiNumVector & 7;
+  int32_t iTargetPos			= kiMinPos;
+  int16_t iBestPos				= pMe->sMv.iMvX;
+  uint32_t uiBestCost			= pMe->uiSadCost;
+  uint32_t uiCostMin;
+  int32_t iIndexMinPos;
+
+  while(iCountLoop8 > 0) {
+    CalcMvdCostx8_c(uiBaseCost, iTargetPos, pMvdTable, kiFixedMvd);
+    uiCostMin = pSampleSadHor8( kpEncMb, kiEncStride, pRef, kiRefStride, uiBaseCost, &iIndexMinPos );
+    if (uiCostMin < uiBestCost) {
+      uiBestCost	= uiCostMin;
+      iBestPos		= iTargetPos+iIndexMinPos;
+    }
+    iTargetPos	+= 8;
+    pRef += 8;
+    -- iCountLoop8;
+  }
+  if ( kiRemainingLoop8 > 0 ) {
+    while (iTargetPos < kiMaxPos) {
+      const uint16_t pMvdCost	= pMvdTable[iTargetPos<<2];
+      uint32_t uiSadCost	= pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + pMvdCost);
+      if (uiSadCost < uiBestCost) {
+        uiBestCost	= uiSadCost;
+        iBestPos	= iTargetPos;
+      }
+      ++pRef;
+      ++iTargetPos;
+    }
+  }
+  if (uiBestCost < pMe->uiSadCost) {
+    SMVUnitXY sBestMv;
+    sBestMv.iMvX = iBestPos - kiCurMeBlockPix;
+    sBestMv.iMvY = 0;
+    UpdateMeResults( sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvY], pMe );
+  }
+}
+#endif
+void LineFullSearch_c(	void *pFunc, void *vpMe,
+													uint16_t* pMvdTable, const int32_t kiFixedMvd,
+													const int32_t kiEncStride, const int32_t kiRefStride,
+													const int32_t kiMinPos, const int32_t kiMaxPos,
                           const bool bVerticalSearch ) {
   SWelsFuncPtrList *pFuncList      = static_cast<SWelsFuncPtrList *>(pFunc);
   SWelsME *pMe                            = static_cast<SWelsME *>(vpMe);
@@ -346,8 +483,8 @@
 
 void WelsMotionCrossSearch(SWelsFuncPtrList *pFuncList,  SWelsME * pMe,
 											const SSlice* pSlice, const int32_t kiEncStride,  const int32_t kiRefStride) {
-  PLineFullSearchFunc pfVerticalFullSearchFunc	= pFuncList->pfLineFullSearch;
-  PLineFullSearchFunc pfHorizontalFullSearchFunc	= pFuncList->pfLineFullSearch;
+  PLineFullSearchFunc pfVerticalFullSearchFunc	= pFuncList->pfVerticalFullSearch;
+  PLineFullSearchFunc pfHorizontalFullSearchFunc	= pFuncList->pfHorizontalFullSearch;
 
   const int32_t iCurMeBlockPixX = pMe->iCurMeBlockPixX;
   const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
--- /dev/null
+++ b/codec/encoder/core/x86/matrix_transpose.asm
@@ -1,0 +1,395 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;in:  m0, m1, m2, m3, m4, m5, m6, m7
+;out: m0, m3, m5, m2, m7, m1, m6, m4
+%macro TRANSPOSE_8x8B_MMX 10
+	MMX_XSwap bw,  %1, %2, %8
+	MMX_XSwap bw,  %3, %4, %2
+	MMX_XSwap bw,  %5, %6, %4
+	movq	%6, %9
+	movq	%10, %4
+	MMX_XSwap bw,  %7, %6, %4
+
+	MMX_XSwap wd,  %1, %3, %6
+	MMX_XSwap wd,  %8, %2, %3
+	MMX_XSwap wd,  %5, %7, %2
+	movq	%7, %10
+	movq	%10, %3
+	MMX_XSwap wd,  %7, %4, %3
+
+	MMX_XSwap dq,  %1, %5, %4
+	MMX_XSwap dq,  %6, %2, %5
+	MMX_XSwap dq,  %8, %7, %2
+	movq	%7, %10
+	movq	%10, %5
+	MMX_XSwap dq,  %7, %3, %5
+
+	movq	%3, %10
+%endmacro
+
+;in: m0, m3, m5, m2, m7, m1, m6, m4
+%macro TRANSPOSE8x8_WRITE_MMX 2	; dst, dst_stride
+	movq [%1], mm0			; result of line 1, x8 bytes
+	movq [%1+%2], mm3		; result of line 2
+	lea %1, [%1+2*%2]
+	movq [%1], mm5			; result of line 3
+	movq [%1+%2], mm2		; result of line 4
+	lea %1, [%1+2*%2]
+	movq [%1], mm7			; result of line 5
+	movq [%1+%2], mm1		; result of line 6
+	lea %1, [%1+2*%2]
+	movq [%1], mm6			; result of line 7
+	movq [%1+%2], mm4		; result of line 8
+%endmacro
+
+;in: m0, m3, m5, m2, m7, m1, m6, m4
+%macro TRANSPOSE8x8_WRITE_ALT_MMX 3	; dst, dst_stride, reg32
+	movq [%1], mm0			; result of line 1, x8 bytes
+	movq [%1+%2], mm3		; result of line 2
+	lea %3, [%1+2*%2]
+	movq [%3], mm5			; result of line 3
+	movq [%3+%2], mm2		; result of line 4
+	lea %3, [%3+2*%2]
+	movq [%3], mm7			; result of line 5
+	movq [%3+%2], mm1		; result of line 6
+	lea %3, [%3+2*%2]
+	movq [%3], mm6			; result of line 7
+	movq [%3+%2], mm4		; result of line 8
+%endmacro	; end of TRANSPOSE8x8_WRITE_ALT_MMX
+
+; for transpose 16x8
+
+;in:  m0, m1, m2, m3, m4, m5, m6, m7
+;out: m4, m2, m3, m7, m5, m1, m6, m0
+%macro TRANSPOSE_8x16B_SSE2		10
+	SSE2_XSawp bw,  %1, %2, %8
+	SSE2_XSawp bw,  %3, %4, %2
+	SSE2_XSawp bw,  %5, %6, %4
+	movdqa	%6, %9
+	movdqa	%10, %4
+	SSE2_XSawp bw,  %7, %6, %4
+
+	SSE2_XSawp wd,  %1, %3, %6
+	SSE2_XSawp wd,  %8, %2, %3
+	SSE2_XSawp wd,  %5, %7, %2
+	movdqa	%7, %10
+	movdqa	%10, %3
+	SSE2_XSawp wd,  %7, %4, %3
+
+	SSE2_XSawp dq,  %1, %5, %4
+	SSE2_XSawp dq,  %6, %2, %5
+	SSE2_XSawp dq,  %8, %7, %2
+	movdqa	%7, %10
+	movdqa	%10, %5
+	SSE2_XSawp dq,  %7, %3, %5
+
+	SSE2_XSawp qdq,  %1, %8, %3
+	SSE2_XSawp qdq,  %4, %2, %8
+	SSE2_XSawp qdq,  %6, %7, %2
+	movdqa	%7, %10
+	movdqa	%10, %1
+	SSE2_XSawp qdq,  %7, %5, %1
+	movdqa	%5, %10
+%endmacro	; end of TRANSPOSE_8x16B_SSE2
+
+
+%macro TRANSPOSE8x16_WRITE_SSE2	2	; dst, dst_stride
+	movq [%1], xmm4			; result of line 1, x8 bytes
+	movq [%1+%2], xmm2		; result of line 2
+	lea %1, [%1+2*%2]
+	movq [%1], xmm3			; result of line 3
+	movq [%1+%2], xmm7		; result of line 4
+
+	lea %1, [%1+2*%2]
+	movq [%1], xmm5			; result of line 5
+	movq [%1+%2], xmm1		; result of line 6
+	lea %1, [%1+2*%2]
+	movq [%1], xmm6			; result of line 7
+	movq [%1+%2], xmm0		; result of line 8
+
+	lea %1, [%1+2*%2]
+	movhpd [%1], xmm4		; result of line 9
+	movhpd [%1+%2], xmm2	; result of line 10
+	lea %1, [%1+2*%2]
+	movhpd [%1], xmm3		; result of line 11
+	movhpd [%1+%2], xmm7	; result of line 12
+
+	lea %1, [%1+2*%2]
+	movhpd [%1], xmm5		; result of line 13
+	movhpd [%1+%2], xmm1	; result of line 14
+	lea %1, [%1+2*%2]
+	movhpd [%1], xmm6		; result of line 15
+	movhpd [%1+%2], xmm0	; result of line 16
+%endmacro	; end of TRANSPOSE_WRITE_RESULT_SSE2
+
+%macro TRANSPOSE8x16_WRITE_ALT_SSE2	3	; dst, dst_stride, reg32
+	movq [%1], xmm4			; result of line 1, x8 bytes
+	movq [%1+%2], xmm2		; result of line 2
+	lea %3, [%1+2*%2]
+	movq [%3], xmm3			; result of line 3
+	movq [%3+%2], xmm7		; result of line 4
+
+	lea %3, [%3+2*%2]
+	movq [%3], xmm5			; result of line 5
+	movq [%3+%2], xmm1		; result of line 6
+	lea %3, [%3+2*%2]
+	movq [%3], xmm6			; result of line 7
+	movq [%3+%2], xmm0		; result of line 8
+
+	lea %3, [%3+2*%2]
+	movhpd [%3], xmm4		; result of line 9
+	movhpd [%3+%2], xmm2	; result of line 10
+	lea %3, [%3+2*%2]
+	movhpd [%3], xmm3		; result of line 11
+	movhpd [%3+%2], xmm7	; result of line 12
+
+	lea %3, [%3+2*%2]
+	movhpd [%3], xmm5		; result of line 13
+	movhpd [%3+%2], xmm1	; result of line 14
+	lea %3, [%3+2*%2]
+	movhpd [%3], xmm6		; result of line 15
+	movhpd [%3+%2], xmm0	; result of line 16
+%endmacro	; end of TRANSPOSE8x16_WRITE_ALT_SSE2
+
+
+SECTION .text
+
+WELS_EXTERN TransposeMatrixBlock16x16_sse2
+; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride );
+	push r4
+	push r5
+	%assign push_num 2
+	LOAD_4_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+
+	mov r4, r7
+	and r4, 0Fh
+	sub r7, 10h
+	sub r7, r4
+	lea r5, [r3+r3*2]
+	; top 8x16 block
+	movdqa xmm0, [r2]
+	movdqa xmm1, [r2+r3]
+	movdqa xmm2, [r2+r3*2]
+	movdqa xmm3, [r2+r5]
+	lea r2, [r2+r3*4]
+	movdqa xmm4, [r2]
+	movdqa xmm5, [r2+r3]
+	movdqa xmm6, [r2+r3*2]
+
+	;in:  m0, m1, m2, m3, m4, m5, m6, m7
+	;out: m4, m2, m3, m7, m5, m1, m6, m0
+	TRANSPOSE_8x16B_SSE2	xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
+
+	TRANSPOSE8x16_WRITE_SSE2		r0, r1
+
+	; bottom 8x16 block
+	lea	r2, [r2+r3*4]
+	movdqa xmm0, [r2]
+	movdqa xmm1, [r2+r3]
+	movdqa xmm2, [r2+r3*2]
+	movdqa xmm3, [r2+r5]
+	lea r2, [r2+r3*4]
+	movdqa xmm4, [r2]
+	movdqa xmm5, [r2+r3]
+	movdqa xmm6, [r2+r3*2]
+
+	;in:  m0, m1, m2, m3, m4, m5, m6, m7
+	;out: m4, m2, m3, m7, m5, m1, m6, m0
+	TRANSPOSE_8x16B_SSE2	xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
+
+	mov r5, r1
+	sal r5, 4
+	sub r0, r5
+	lea r0, [r0+r1*2+8]
+	TRANSPOSE8x16_WRITE_SSE2		r0, r1
+
+	add r7, r4
+	add r7, 10h
+	POP_XMM
+	LOAD_4_PARA_POP
+	pop r5
+	pop r4
+	ret
+
+WELS_EXTERN TransposeMatrixBlocksx16_sse2
+; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks );
+	push r5
+	push r6
+	%assign push_num 2
+	LOAD_5_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION  r1, r1d
+	SIGN_EXTENSION  r3, r3d
+	SIGN_EXTENSION  r4, r4d
+	mov r5, r7
+	and r5, 0Fh
+	sub r7, 10h
+	sub r7, r5
+TRANSPOSE_LOOP_SSE2:
+	; explicitly loading next loop data
+	lea	r6, [r2+r3*8]
+	push r4
+%rep 8
+	mov	r4, [r6]
+	mov	r4, [r6+r3]
+	lea	r6, [r6+r3*2]
+%endrep
+	pop r4
+	; top 8x16 block
+	movdqa xmm0, [r2]
+	movdqa xmm1, [r2+r3]
+	lea r2, [r2+r3*2]
+	movdqa xmm2, [r2]
+	movdqa xmm3, [r2+r3]
+	lea r2, [r2+r3*2]
+	movdqa xmm4, [r2]
+	movdqa xmm5, [r2+r3]
+	lea r2, [r2+r3*2]
+	movdqa xmm6, [r2]
+
+	;in:  m0, m1, m2, m3, m4, m5, m6, m7
+	;out: m4, m2, m3, m7, m5, m1, m6, m0
+	TRANSPOSE_8x16B_SSE2	xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
+	TRANSPOSE8x16_WRITE_ALT_SSE2		r0, r1, r6
+	lea	r2, [r2+r3*2]
+
+	; bottom 8x16 block
+	movdqa xmm0, [r2]
+	movdqa xmm1, [r2+r3]
+	lea	r2, [r2+r3*2]
+	movdqa xmm2, [r2]
+	movdqa xmm3, [r2+r3]
+	lea r2, [r2+r3*2]
+	movdqa xmm4, [r2]
+	movdqa xmm5, [r2+r3]
+	lea	r2, [r2+r3*2]
+	movdqa xmm6, [r2]
+
+	;in:  m0, m1, m2, m3, m4, m5, m6, m7
+	;out: m4, m2, m3, m7, m5, m1, m6, m0
+	TRANSPOSE_8x16B_SSE2	xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
+	TRANSPOSE8x16_WRITE_ALT_SSE2		r0+8, r1, r6
+	lea	r2, [r2+r3*2]
+	lea r0, [r0+16]
+	dec r4
+	jg near TRANSPOSE_LOOP_SSE2
+
+	add r7, r5
+	add r7, 10h
+	POP_XMM
+	LOAD_5_PARA_POP
+	pop r6
+	pop r5
+	ret
+
+WELS_EXTERN TransposeMatrixBlock8x8_mmx
+; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride );
+	%assign push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENSION  r1, r1d
+	SIGN_EXTENSION  r3, r3d
+	sub	r7, 8
+
+	movq mm0, [r2]
+	movq mm1, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm2, [r2]
+	movq mm3, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm4, [r2]
+	movq mm5, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm6, [r2]
+
+	;in:  m0, m1, m2, m3, m4, m5, m6, m7
+	;out: m0, m3, m5, m2, m7, m1, m6, m4
+	TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
+
+	TRANSPOSE8x8_WRITE_MMX r0, r1
+
+	emms
+	add r7, 8
+	LOAD_4_PARA_POP
+	ret
+
+WELS_EXTERN TransposeMatrixBlocksx8_mmx
+; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks );
+	push r5
+	push r6
+	%assign push_num 2
+	LOAD_5_PARA
+	SIGN_EXTENSION  r1, r1d
+	SIGN_EXTENSION  r3, r3d
+	SIGN_EXTENSION  r4, r4d
+	sub	r7, 8
+
+	lea	r5, [r2+r3*8]
+
+TRANSPOSE_BLOCKS_X8_LOOP_MMX:
+	; explicitly loading next loop data
+%rep 4
+	mov r6, [r5]
+	mov r6, [r5+r3]
+	lea	r5, [r5+r3*2]
+%endrep
+	movq mm0, [r2]
+	movq mm1, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm2, [r2]
+	movq mm3, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm4, [r2]
+	movq mm5, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm6, [r2]
+
+	;in:  m0, m1, m2, m3, m4, m5, m6, m7
+	;out: m0, m3, m5, m2, m7, m1, m6, m4
+	TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
+
+	TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
+	lea r0, [r0+8]
+	lea r2, [r2+2*r3]
+	dec r4
+	jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX
+
+	emms
+	add r7, 8
+	LOAD_5_PARA_POP
+	pop r6
+	pop r5
+	ret
--- /dev/null
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -1,0 +1,225 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+SECTION .text
+
+;**********************************************************************************************************************************
+;
+;	uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
+;
+;	\note:
+;		src must be 16-byte aligned; alignment of ref is optional
+;	\return value:
+;		returns the minimal SAD cost; the corresponding index is written to index_min_cost
+;**********************************************************************************************************************************
+; try 8 mv via offset
+; xmm7 stores the sad costs
+%macro   SAD_16x16_LINE_SSE41  4	; src, ref, stride_src, stride_ref
+    movdqa		xmm0, [%1]
+    movdqu		xmm1, [%2]
+    movdqu		xmm2, [%2+8h]
+    movdqa		xmm3, xmm1
+    movdqa		xmm4, xmm2
+
+    mpsadbw		xmm1, xmm0, 0	; 000 B
+    paddw		xmm7, xmm1		; accumulate cost
+
+    mpsadbw		xmm3, xmm0, 5	; 101 B
+    paddw		xmm7, xmm3		; accumulate cost
+
+    mpsadbw		xmm2, xmm0, 2	; 010 B
+    paddw		xmm7, xmm2		; accumulate cost
+
+    mpsadbw		xmm4, xmm0, 7	; 111 B
+    paddw		xmm7, xmm4		; accumulate cost
+
+    add			%1, %3
+    add			%2, %4
+%endmacro	; end of SAD_16x16_LINE_SSE41
+%macro   SAD_16x16_LINE_SSE41E  4	; src, ref, stride_src, stride_ref
+    movdqa		xmm0, [%1]
+    movdqu		xmm1, [%2]
+    movdqu		xmm2, [%2+8h]
+    movdqa		xmm3, xmm1
+    movdqa		xmm4, xmm2
+
+    mpsadbw		xmm1, xmm0, 0	; 000 B
+    paddw		xmm7, xmm1		; accumulate cost
+
+    mpsadbw		xmm3, xmm0, 5	; 101 B
+    paddw		xmm7, xmm3		; accumulate cost
+
+    mpsadbw		xmm2, xmm0, 2	; 010 B
+    paddw		xmm7, xmm2		; accumulate cost
+
+    mpsadbw		xmm4, xmm0, 7	; 111 B
+    paddw		xmm7, xmm4		; accumulate cost
+%endmacro	; end of SAD_16x16_LINE_SSE41E
+
+WELS_EXTERN SampleSad16x16Hor8_sse41
+    ;push ebx
+    ;push esi
+    ;mov eax, [esp+12]	;   src
+    ;mov ecx, [esp+16]	;   stride_src
+    ;mov ebx, [esp+20]	;   ref
+    ;mov edx, [esp+24]	;   stride_ref
+    ;mov esi, [esp+28]	;   base_cost
+    %assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION	r1, r1d
+    SIGN_EXTENSION	r3, r3d
+    pxor	xmm7,	xmm7
+
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41E	r0, r2, r1, r3
+
+    pxor	xmm0,	xmm0
+    movdqa	xmm6,	xmm7
+    punpcklwd	xmm6,	xmm0
+    punpckhwd	xmm7,	xmm0
+
+    movdqa	xmm5,	[r4]
+    movdqa	xmm4,	xmm5
+    punpcklwd	xmm4,	xmm0
+    punpckhwd	xmm5,	xmm0
+
+    paddd	xmm4,	xmm6
+    paddd	xmm5,	xmm7
+    movdqa	xmm3,	xmm4
+    pminud	xmm3,	xmm5
+    pshufd	xmm2,	xmm3,	01001110B
+    pminud	xmm2,	xmm3
+    pshufd	xmm3,	xmm2,	10110001B
+    pminud	xmm2,	xmm3
+    movd	retrd,	xmm2
+    pcmpeqd	xmm4,	xmm2
+    movmskps	r2d, xmm4
+    bsf		r1d,	r2d
+    jnz	near WRITE_INDEX
+
+    pcmpeqd	xmm5,	xmm2
+    movmskps	r2d, xmm5
+    bsf		r1d,	r2d
+    add		r1d,	4
+
+WRITE_INDEX:
+    mov		[r5],	r1d
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+
+;**********************************************************************************************************************************
+;
+;	uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
+;
+;	\note:
+;		src and ref do not need to be 16-byte aligned, since inter 8x8 blocks may be unaligned
+;	\return value:
+;		returns the minimal SAD cost; the corresponding index is written to index_min_cost
+;
+;**********************************************************************************************************************************
+; try 8 mv via offset
+; xmm7 stores the sad costs
+%macro   SAD_8x8_LINE_SSE41  4	; src, ref, stride_src, stride_ref
+    movdqu		xmm0, [%1]
+    movdqu		xmm1, [%2]
+    movdqa		xmm2, xmm1
+
+    mpsadbw		xmm1, xmm0, 0	; 000 B
+    paddw		xmm7, xmm1		; accumulate cost
+
+    mpsadbw		xmm2, xmm0, 5	; 101 B
+    paddw		xmm7, xmm2		; accumulate cost
+
+    add			%1, %3
+    add			%2, %4
+%endmacro	; end of SAD_8x8_LINE_SSE41
+%macro   SAD_8x8_LINE_SSE41E  4	; src, ref, stride_src, stride_ref
+    movdqu		xmm0, [%1]
+    movdqu		xmm1, [%2]
+    movdqa		xmm2, xmm1
+
+    mpsadbw		xmm1, xmm0, 0	; 000 B
+    paddw		xmm7, xmm1		; accumulate cost
+
+    mpsadbw		xmm2, xmm0, 5	; 101 B
+    paddw		xmm7, xmm2		; accumulate cost
+%endmacro	; end of SAD_8x8_LINE_SSE41E
+
+WELS_EXTERN SampleSad8x8Hor8_sse41
+    %assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION	r1, r1d
+    SIGN_EXTENSION	r3, r3d
+    movdqa xmm7, [r4]	;	load base cost list
+
+    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
+
+    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41E	r0, r2, r1, r3
+
+    phminposuw	xmm0, xmm7	; horizontal search for the minimal sad cost and its index
+    movd	retrd, xmm0	; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX
+    mov		r1d, retrd
+    and		retrd, 0xFFFF
+    sar		r1d, 16
+    mov		[r5], r1d
+
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+
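Note: both kernels above are built on SSE4.1 mpsadbw, which compares one 4-byte group of the encode row against eight successive 4-byte windows of the reference row and produces eight 16-bit SADs; accumulating four such results per 16-byte row (the immediates 0, 5, 2, 7 in SAD_16x16_LINE_SSE41) gives the full-row SAD at eight horizontal offsets in parallel. A scalar model of a single step, under those assumptions (hypothetical helper, not part of the patch):

#include <stdint.h>
#include <stdlib.h>

// Hypothetical scalar model of one mpsadbw step: SAD of the 4 encode bytes
// starting at iEncOffset against 8 successive 4-byte windows of the reference
// row, accumulated into pCost[0..7] the way the asm accumulates into xmm7.
static void MpsadbwStep_ref (const uint8_t* pEnc, int32_t iEncOffset,
                             const uint8_t* pRef, int32_t iRefOffset,
                             uint16_t pCost[8]) {
  for (int32_t j = 0; j < 8; ++j)
    for (int32_t k = 0; k < 4; ++k)
      pCost[j] += (uint16_t) abs (pEnc[iEncOffset + k] - pRef[iRefOffset + j + k]);
}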
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -40,8 +40,10 @@
 	$(ENCODER_SRCDIR)/core/x86/coeff.asm\
 	$(ENCODER_SRCDIR)/core/x86/dct.asm\
 	$(ENCODER_SRCDIR)/core/x86/intra_pred.asm\
+	$(ENCODER_SRCDIR)/core/x86/matrix_transpose.asm\
 	$(ENCODER_SRCDIR)/core/x86/memzero.asm\
 	$(ENCODER_SRCDIR)/core/x86/quant.asm\
+	$(ENCODER_SRCDIR)/core/x86/sample_sc.asm\
 	$(ENCODER_SRCDIR)/core/x86/score.asm\
 
 ENCODER_OBJS += $(ENCODER_ASM_SRCS:.asm=.$(OBJ))
--- a/test/encoder/EncUT_MotionEstimate.cpp
+++ b/test/encoder/EncUT_MotionEstimate.cpp
@@ -5,6 +5,7 @@
 #include "sample.h"
 #include "svc_motion_estimate.h"
 #include "wels_func_ptr_def.h"
+#include "cpu.h"
 
 
 using namespace WelsSVCEnc;
@@ -43,11 +44,12 @@
     m_iMaxSearchBlock = 16;
     m_uiMvdTableSize	=  (1 + (648 << 1));
 
+    pMa = new CMemoryAlign(0);
     m_pRefPic = static_cast<uint8_t *>
-    (malloc(m_iWidth*m_iHeight));
+    (pMa->WelsMalloc(m_iWidth*m_iHeight, "RefPic"));
     ASSERT_TRUE( NULL != m_pRefPic );
     m_pSrcBlock = static_cast<uint8_t *>
-    (malloc(m_iMaxSearchBlock*m_iMaxSearchBlock));
+    (pMa->WelsMalloc(m_iMaxSearchBlock*m_iMaxSearchBlock, "SrcBlock"));
     ASSERT_TRUE( NULL != m_pSrcBlock );
     m_pMvdCostTable=new uint16_t[52*m_uiMvdTableSize];
     ASSERT_TRUE( NULL != m_pMvdCostTable );
@@ -54,8 +56,9 @@
   }
   virtual void TearDown() {
     delete [] m_pMvdCostTable;
-    free( m_pRefPic );
-    free( m_pSrcBlock );
+    pMa->WelsFree( m_pRefPic, "RefPic");
+    pMa->WelsFree( m_pSrcBlock, "SrcBlock");
+    delete pMa;
   }
 public:
   uint8_t *m_pRefPic;
@@ -66,6 +69,7 @@
   int32_t m_iWidth;
   int32_t m_iHeight;
   int32_t m_iMaxSearchBlock;
+  CMemoryAlign *pMa;
 };
 
 
@@ -243,4 +247,134 @@
     ASSERT_TRUE(iTryTimes > 0);
     //it is possible that ref at a different position is identical, but that should have a low probability
   }
-}
\ No newline at end of file
+}
+
+#ifdef X86_ASM
+TEST_F(MotionEstimateTest, TestVerticalSearch_SSE41)
+{
+  const int32_t kiMaxBlock16Sad = 72000;//a rough number
+  SWelsFuncPtrList sFuncList;
+  SWelsME sMe;
+
+  srand((uint32_t)time(NULL));
+  const uint8_t kuiQp = rand()%52;
+  InitMe(kuiQp, 648, m_uiMvdTableSize, m_pMvdCostTable, &sMe);
+
+  SMVUnitXY sTargetMv;
+  WelsInitSampleSadFunc( &sFuncList, 0 );//test c functions
+  WelsInitMeFunc(&sFuncList, WELS_CPU_SSE41, 1);
+
+  uint8_t *pRefPicCenter = m_pRefPic+(m_iHeight/2)*m_iWidth+(m_iWidth/2);
+  sMe.iCurMeBlockPixX = (m_iWidth/2);
+  sMe.iCurMeBlockPixY = (m_iHeight/2);
+
+  bool bDataGeneratorSucceed = false;
+  bool bFoundMatch = false;
+  int32_t iTryTimes=100;
+
+  sTargetMv.iMvX = 0;
+  sTargetMv.iMvY = WELS_MAX(INTPEL_NEEDED_MARGIN, rand()%m_iHeight-INTPEL_NEEDED_MARGIN);
+  bDataGeneratorSucceed = false;
+  bFoundMatch = false;
+  while (!bFoundMatch && (iTryTimes--)>0) {
+    if (!YUVPixelDataGenerator( m_pRefPic, m_iWidth, m_iHeight, m_iWidth ))
+      continue;
+
+    bDataGeneratorSucceed = true;
+    CopyTargetBlock( m_pSrcBlock, 16, sTargetMv, m_iWidth, pRefPicCenter);
+
+    //clean the sMe status
+    sMe.uiBlockSize = rand()%5;
+    sMe.pEncMb = m_pSrcBlock;
+    sMe.pRefMb = pRefPicCenter;
+    sMe.pColoRefMb = pRefPicCenter;
+    sMe.sMv.iMvX = sMe.sMv.iMvY = 0;
+    sMe.uiSadCost = sMe.uiSatdCost = kiMaxBlock16Sad;
+    const int32_t iCurMeBlockPixX = sMe.iCurMeBlockPixX;
+    const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
+    const int32_t iCurMeBlockPixY = sMe.iCurMeBlockPixY;
+    const int32_t iCurMeBlockQpelPixY = ((iCurMeBlockPixY)<<2);
+    uint16_t* pMvdCostX = sMe.pMvdCost - iCurMeBlockQpelPixX - sMe.sMvp.iMvX;	//do the offset here
+    uint16_t* pMvdCostY = sMe.pMvdCost - iCurMeBlockQpelPixY - sMe.sMvp.iMvY;
+    VerticalFullSearchUsingSSE41 ( &sFuncList, &sMe,
+                      pMvdCostY, pMvdCostX[ iCurMeBlockQpelPixX ],
+                      m_iMaxSearchBlock, m_iWidth,
+                      INTPEL_NEEDED_MARGIN,
+                      m_iHeight-INTPEL_NEEDED_MARGIN, true );
+
+    //the final selection may be affected by the MVD cost, i.e. a smaller MvY may be preferred
+    bFoundMatch = (sMe.sMv.iMvX==0
+                   &&(sMe.sMv.iMvY==sTargetMv.iMvY||abs(sMe.sMv.iMvY)<abs(sTargetMv.iMvY)));
+    //printf("TestVerticalSearch Target: %d,%d\n", sTargetMv.iMvX, sTargetMv.iMvY);
+  }
+  if (bDataGeneratorSucceed) {
+    //if DataGenerator never succeeds, there is no point in checking iTryTimes
+    ASSERT_TRUE(iTryTimes > 0);
+    //it is possible that ref at a different position is identical, but that should have a low probability
+  }
+}
+
+TEST_F(MotionEstimateTest, TestHorizontalSearch_SSE41)
+{
+  const int32_t kiMaxBlock16Sad = 72000;//a rough number
+  SWelsFuncPtrList sFuncList;
+  SWelsME sMe;
+
+  srand((uint32_t)time(NULL));
+  const uint8_t kuiQp = rand()%52;
+  InitMe(kuiQp, 648, m_uiMvdTableSize, m_pMvdCostTable, &sMe);
+
+  SMVUnitXY sTargetMv;
+  WelsInitSampleSadFunc( &sFuncList, 0 );//test c functions
+  WelsInitMeFunc(&sFuncList, WELS_CPU_SSE41, 1);
+
+  uint8_t *pRefPicCenter = m_pRefPic+(m_iHeight/2)*m_iWidth+(m_iWidth/2);
+  sMe.iCurMeBlockPixX = (m_iWidth/2);
+  sMe.iCurMeBlockPixY = (m_iHeight/2);
+
+  bool bDataGeneratorSucceed = false;
+  bool bFoundMatch = false;
+  int32_t iTryTimes=100;
+
+  sTargetMv.iMvX = WELS_MAX(INTPEL_NEEDED_MARGIN, rand()%m_iWidth-INTPEL_NEEDED_MARGIN);
+  sTargetMv.iMvY = 0;
+  bDataGeneratorSucceed = false;
+  bFoundMatch = false;
+  while (!bFoundMatch && (iTryTimes--)>0) {
+    if (!YUVPixelDataGenerator( m_pRefPic, m_iWidth, m_iHeight, m_iWidth ))
+      continue;
+
+    bDataGeneratorSucceed = true;
+    CopyTargetBlock( m_pSrcBlock, 16, sTargetMv, m_iWidth, pRefPicCenter);
+
+    //clean the sMe status
+    sMe.uiBlockSize = rand()%5;
+    sMe.pEncMb = m_pSrcBlock;
+    sMe.pRefMb = pRefPicCenter;
+    sMe.pColoRefMb = pRefPicCenter;
+    sMe.sMv.iMvX = sMe.sMv.iMvY = 0;
+    sMe.uiSadCost = sMe.uiSatdCost = kiMaxBlock16Sad;
+    const int32_t iCurMeBlockPixX = sMe.iCurMeBlockPixX;
+    const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
+    const int32_t iCurMeBlockPixY = sMe.iCurMeBlockPixY;
+    const int32_t iCurMeBlockQpelPixY = ((iCurMeBlockPixY)<<2);
+    uint16_t* pMvdCostX = sMe.pMvdCost - iCurMeBlockQpelPixX - sMe.sMvp.iMvX;	//do the offset here
+    uint16_t* pMvdCostY = sMe.pMvdCost - iCurMeBlockQpelPixY - sMe.sMvp.iMvY;
+    HorizontalFullSearchUsingSSE41 ( &sFuncList, &sMe,
+                      pMvdCostX, pMvdCostY[ iCurMeBlockQpelPixY ],
+                      m_iMaxSearchBlock, m_iWidth,
+                      INTPEL_NEEDED_MARGIN,
+                      m_iWidth-INTPEL_NEEDED_MARGIN, false );
+
+    //the final selection may be affected by the MVD cost, i.e. a smaller MvX may be preferred
+    bFoundMatch = (sMe.sMv.iMvY==0
+                   &&(sMe.sMv.iMvX==sTargetMv.iMvX||abs(sMe.sMv.iMvX)<abs(sTargetMv.iMvX)));
+    //printf("TestHorizontalSearch Target: %d,%d\n", sTargetMv.iMvX, sTargetMv.iMvY);
+  }
+  if (bDataGeneratorSucceed) {
+    //if DataGenerator never succeeds, there is no point in checking iTryTimes
+    ASSERT_TRUE(iTryTimes > 0);
+    //it is possible that ref at a different position is identical, but that should have a low probability
+  }
+}
+#endif
\ No newline at end of file