shithub: openh264

ref: 5c60e8f868eacd4747239dca5518e7761f2e570f
parent: 94cabe10d54021c8269d51ba3fa5d88c4a0607fe
author: Licai Guo <guolicai@gmail.com>
date: Fri Mar 28 06:22:11 EDT 2014

Add ASM-related functions for ME cross search

Add asm-level functions

Add asm code for ME

Modify format

Add unit test for asm code.

Modify function name and format.

Remove unused comment

Modify targets file

Add macro protection for SSE41 function test

Modify according to review request.
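
In outline, the patch replaces the single pfLineFullSearch pointer with separate
vertical and horizontal entries and routes both to SSE4.1 implementations when
the CPU supports them; condensed from the svc_motion_estimate.cpp hunk below:

    // C fallbacks for both directions; SSE4.1 kernels when available.
    pFuncList->pfVerticalFullSearch   = LineFullSearch_c;
    pFuncList->pfHorizontalFullSearch = LineFullSearch_c;
    #if defined (X86_ASM)
    if (uiCpuFlag & WELS_CPU_SSE41) {
      pFuncList->pfSampleSadHor8[0]     = SampleSad8x8Hor8_sse41;   // 8x8 blocks
      pFuncList->pfSampleSadHor8[1]     = SampleSad16x16Hor8_sse41; // 16x16 blocks
      pFuncList->pfVerticalFullSearch   = VerticalFullSearchUsingSSE41;
      pFuncList->pfHorizontalFullSearch = HorizontalFullSearchUsingSSE41;
    }
    #endif

The SAD kernels test 8 horizontally consecutive candidates per call; the
vertical search cannot feed them directly, so it first transposes the encode
block and the reference column (the new matrix_transpose.asm routines),
turning the vertical scan into a horizontal one.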

--- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
@@ -81,6 +81,7 @@
 		4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = "<group>"; };
 		4C34066A18C57D0400DFA14A /* pixel_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_neon.S; sourceTree = "<group>"; };
 		4C34066B18C57D0400DFA14A /* reconstruct_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = reconstruct_neon.S; sourceTree = "<group>"; };
+		4CDBFB9D18E5068D0025A767 /* wels_transpose_matrix.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_transpose_matrix.h; sourceTree = "<group>"; };
 		4CE4431118B6FFA00017DF25 /* libwelsenc.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libwelsenc.a; sourceTree = BUILT_PRODUCTS_DIR; };
 		4CE4431418B6FFA00017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
 		4CE4432118B6FFA00017DF25 /* welsencTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = welsencTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
@@ -281,6 +282,7 @@
 		4CE446A918BC605C0017DF25 /* inc */ = {
 			isa = PBXGroup;
 			children = (
+				4CDBFB9D18E5068D0025A767 /* wels_transpose_matrix.h */,
 				4CE446AA18BC605C0017DF25 /* as264_common.h */,
 				4CE446AB18BC605C0017DF25 /* au_set.h */,
 				4CE446AC18BC605C0017DF25 /* bit_stream.h */,
--- a/codec/encoder/core/inc/svc_motion_estimate.h
+++ b/codec/encoder/core/inc/svc_motion_estimate.h
@@ -199,11 +199,24 @@
                         const int32_t kiEncStride, const int32_t kiRefStride,
                         const int32_t kiMinPos, const int32_t kiMaxPos,
                         const bool bVerticalSearch );
+#ifdef X86_ASM
+extern "C"
+{
+uint32_t SampleSad8x8Hor8_sse41 (uint8_t*, int32_t, uint8_t*, int32_t, uint16_t*, int32_t*);
+uint32_t SampleSad16x16Hor8_sse41 (uint8_t*, int32_t, uint8_t*, int32_t, uint16_t*, int32_t*);
+}
+
 void VerticalFullSearchUsingSSE41( void *pFunc, void *vpMe,
                             uint16_t* pMvdTable, const int32_t kiFixedMvd,
                             const int32_t kiEncStride, const int32_t kiRefStride,
                           const int32_t kiMinPos, const int32_t kiMaxPos,
                           const bool bVerticalSearch );
+void HorizontalFullSearchUsingSSE41( void *pFunc, void *vpMe,
+                                      uint16_t* pMvdTable, const int32_t kiFixedMvd,
+                                      const int32_t kiEncStride, const int32_t kiRefStride,
+                                      const int32_t kiMinPos, const int32_t kiMaxPos,
+                                      const bool bVerticalSearch );
+#endif
 void WelsMotionCrossSearch(SWelsFuncPtrList *pFuncList,  SDqLayer* pCurLayer, SWelsME * pMe, const SSlice* pSlice);
 
 // Feature Search Basics
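
The Hor8 kernels evaluate eight consecutive candidate positions per call: they
take the encode block and a reference pointer with their strides, a table of
eight per-candidate base (MVD) costs, and an out-parameter receiving the index
of the cheapest candidate; the return value is that minimal total cost. A
hypothetical call, with variable names chosen for illustration:

    uint16_t uiBaseCost[8];     // MVD costs of 8 consecutive candidates
    int32_t  iIndexMinPos = 0;
    uint32_t uiMinCost = SampleSad16x16Hor8_sse41 (
        pEncMb, kiEncStride,    // 16x16 encode block (16-byte aligned)
        pRef,   kiRefStride,    // reference; candidates at pRef+0 .. pRef+7
        uiBaseCost, &iIndexMinPos );
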
--- a/codec/encoder/core/inc/wels_const.h
+++ b/codec/encoder/core/inc/wels_const.h
@@ -87,6 +87,7 @@
 #define PARA_SET_TYPE_SUBSETSPS	1
 #define PARA_SET_TYPE_PPS		2
 
+#define MAX_VERTICAL_MV_RANGE   1024  //TODO: allocate enough memory for the transpose
 #define MAX_FRAME_RATE			30	// maximal frame rate to support
 #define MIN_FRAME_RATE			1	// minimal frame rate need support
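
MAX_VERTICAL_MV_RANGE sizes the on-stack transpose buffer used by the vertical
search; the new code asserts that the candidate range fits, as in the
svc_motion_estimate.cpp hunk below:

    const int32_t kiMatrixStride = MAX_VERTICAL_MV_RANGE;
    ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixRef, 16, kiMatrixStride, 16 );
    assert(kiRowNum <= kiMatrixStride);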
 
--- a/codec/encoder/core/inc/wels_func_ptr_def.h
+++ b/codec/encoder/core/inc/wels_func_ptr_def.h
@@ -134,6 +134,7 @@
 typedef int32_t (*PIntraPred8x8Combined3Func) (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*,
     uint8_t*, uint8_t*);
 
+typedef uint32_t (*PSampleSadHor8Func)( uint8_t*, int32_t, uint8_t*, int32_t, uint16_t*, int32_t* );
 typedef void (*PMotionSearchFunc) (SWelsFuncPtrList* pFuncList, void* pCurDqLayer, void* pMe,
                                    void* pSlice);
 typedef void (*PSearchMethodFunc) (SWelsFuncPtrList* pFuncList, void* pMe, void* pSlice, const int32_t kiEncStride, const int32_t kiRefStride);
@@ -202,14 +203,16 @@
   PGetIntraPredFunc     pfGetLumaI4x4Pred[I4_PRED_A];
   PGetIntraPredFunc     pfGetChromaPred[C_PRED_A];
 
+  PSampleSadHor8Func	pfSampleSadHor8[2];	// 0: for 8x8 square; 1: for 16x16 square
   PMotionSearchFunc
   pfMotionSearch[BLOCK_STATIC_IDC_ALL]; //svc_encode_slice.c svc_mode_decision.c svc_enhance_layer_md.c svc_base_layer_md.c
   PSearchMethodFunc pfSearchMethod[BLOCK_SIZE_ALL];
   PCalculateSatdFunc pfCalculateSatd;
   PCheckDirectionalMv pfCheckDirectionalMv;
-  PLineFullSearchFunc pfLineFullSearch;
   PCalculateBlockFeatureOfFrame pfCalculateBlockFeatureOfFrame[2];//0 - for 8x8, 1 for 16x16
   PCalculateSingleBlockFeature pfCalculateSingleBlockFeature[2];//0 - for 8x8, 1 for 16x16
+  PLineFullSearchFunc pfVerticalFullSearch;
+  PLineFullSearchFunc pfHorizontalFullSearch;
 
   PCopyFunc      pfCopy16x16Aligned;    //svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c
   PCopyFunc      pfCopy16x16NotAligned;  //md.c
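
Dispatch indexes pfSampleSadHor8 by block size, as the new search functions in
svc_motion_estimate.cpp do:

    const int32_t kIsBlock16x16 = pMe->uiBlockSize == BLOCK_16x16;  // 0 or 1
    PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16];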
--- /dev/null
+++ b/codec/encoder/core/inc/wels_transpose_matrix.h
@@ -1,0 +1,54 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef WELS_RUBY_ENCODER_TRANSPOSE_MATRIX_H__
+#define WELS_RUBY_ENCODER_TRANSPOSE_MATRIX_H__
+
+#include "typedefs.h"
+
+namespace WelsSVCEnc {
+
+#ifdef X86_ASM
+extern "C"
+{
+void TransposeMatrixBlocksx16_sse2( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride, const int32_t kiBlocksNum );
+void TransposeMatrixBlock16x16_sse2( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride );
+void TransposeMatrixBlocksx8_mmx( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride, const int32_t kiBlocksNum );
+void TransposeMatrixBlock8x8_mmx( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride );
+}
+#endif
+
+typedef void (*PTransposeMatrixBlockFunc)( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride );
+typedef void (*PTransposeMatrixBlocksFunc)( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride, const int32_t kiBlocksNum );
+
+}// end of namespace declaration
+
+#endif//WELS_RUBY_ENCODER_TRANSPOSE_MATRIX_H__
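
The Block variants transpose a single square block; the Blocks variants
transpose kiBlocksNum vertically stacked blocks in one call (used to transpose
a whole column of the reference picture). Usage as in
VerticalFullSearchUsingSSE41 further down:

    ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixEnc, 16, 16, 16 );  // 16x16, 16-byte aligned
    TransposeMatrixBlock16x16_sse2( &uiMatrixEnc[0][0], 16, kpEncMb, kiEncStride );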
--- a/codec/encoder/core/src/svc_motion_estimate.cpp
+++ b/codec/encoder/core/src/svc_motion_estimate.cpp
@@ -41,6 +41,7 @@
 #include "cpu_core.h"
 #include "ls_defines.h"
 #include "svc_motion_estimate.h"
+#include "wels_transpose_matrix.h"
 
 namespace WelsSVCEnc {
 
@@ -65,8 +66,14 @@
     pFuncList->pfCheckDirectionalMv = CheckDirectionalMv;
 
    //for cross search
-    pFuncList->pfLineFullSearch = LineFullSearch_c;
+    pFuncList->pfVerticalFullSearch = LineFullSearch_c;
+    pFuncList->pfHorizontalFullSearch = LineFullSearch_c;
+#if defined (X86_ASM)
     if ( uiCpuFlag & WELS_CPU_SSE41 ) {
+      pFuncList->pfSampleSadHor8[0] = SampleSad8x8Hor8_sse41;
+      pFuncList->pfSampleSadHor8[1] = SampleSad16x16Hor8_sse41;
+      pFuncList->pfVerticalFullSearch = VerticalFullSearchUsingSSE41;
+      pFuncList->pfHorizontalFullSearch = HorizontalFullSearchUsingSSE41;
     }
 
     //for feature search
@@ -75,6 +82,7 @@
     //TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
     pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_c;
     pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_c;
+#endif
   }
 }
 
@@ -302,6 +310,17 @@
 /////////////////////////
 // Cross Search Basics
 /////////////////////////
+#if defined (X86_ASM)
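+// Gather the MVD costs of 8 consecutive integer-pel candidates: the cost
+// table is indexed in quarter-pel units (hence the stride of 4), and the
+// fixed cost component is added to each entry.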
+void CalcMvdCostx8_c( uint16_t *pMvdCost, const int32_t kiStartMv, uint16_t* pMvdTable, const uint16_t kiFixedCost )
+{
+  uint16_t *pBaseCost		= pMvdCost;
+  const int32_t kiOffset	= (kiStartMv<<2);
+  uint16_t *pMvd		= pMvdTable+kiOffset;
+  for (int32_t i = 0; i < 8; ++ i) {
+    pBaseCost[i] = ((*pMvd) + kiFixedCost);
+    pMvd += 4;
+  }
+}
 void VerticalFullSearchUsingSSE41( void *pFunc, void *vpMe,
                             uint16_t* pMvdTable, const int32_t kiFixedMvd,
                             const int32_t kiEncStride, const int32_t kiRefStride,
@@ -308,12 +327,130 @@
                           const int32_t kiMinPos, const int32_t kiMaxPos,
                           const bool bVerticalSearch ) {
   SWelsFuncPtrList *pFuncList      = static_cast<SWelsFuncPtrList *>(pFunc);
-  SWelsME *pMe                            = static_cast<SWelsME *>(vpMe);
+  SWelsME *pMe				                    = static_cast<SWelsME *>(vpMe);
+  uint8_t*  kpEncMb	= pMe->pEncMb;
+  const int32_t kiCurMeBlockPix	= pMe->iCurMeBlockPixY;
+  uint8_t* pRef			      = &pMe->pColoRefMb[(kiMinPos - kiCurMeBlockPix)*kiRefStride];
+  const int32_t kIsBlock16x16	= pMe->uiBlockSize == BLOCK_16x16;
+  const int32_t kiEdgeBlocks	= kIsBlock16x16 ? 16 : 8;
+  PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16];
+  PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
+  PTransposeMatrixBlockFunc	TransposeMatrixBlock = kIsBlock16x16 ? TransposeMatrixBlock16x16_sse2 : TransposeMatrixBlock8x8_mmx;
+  PTransposeMatrixBlocksFunc	TransposeMatrixBlocks= kIsBlock16x16 ? TransposeMatrixBlocksx16_sse2 : TransposeMatrixBlocksx8_mmx;
+
+  const int32_t kiDiff			= kiMaxPos - kiMinPos;
+  const int32_t kiRowNum		= WELS_ALIGN((kiDiff - kiEdgeBlocks + 1), kiEdgeBlocks);
+  const int32_t kiBlocksNum		= kIsBlock16x16 ? (kiRowNum>>4) : (kiRowNum>>3);
+  int32_t iCountLoop8		= (kiRowNum-kiEdgeBlocks) >> 3;
+  const int32_t kiRemainingVectors		= kiDiff - (iCountLoop8<<3);
+  const int32_t kiMatrixStride		= MAX_VERTICAL_MV_RANGE;
+  ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixRef, 16, kiMatrixStride, 16 );	// transpose matrix result for ref
+  ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixEnc, 16, 16, 16 );				// transpose matrix result for enc
+  assert(kiRowNum <= kiMatrixStride);	// make sure the matrix buffer is large enough
+
+  TransposeMatrixBlock( &uiMatrixEnc[0][0], 16, kpEncMb, kiEncStride );
+  TransposeMatrixBlocks( &uiMatrixRef[0][0], kiMatrixStride, pRef, kiRefStride, kiBlocksNum );
+  ENFORCE_STACK_ALIGN_1D( uint16_t, uiBaseCost, 8, 16 );
+  int32_t iTargetPos			= kiMinPos;
+  int16_t iBestPos				= pMe->sMv.iMvX;
+  uint32_t uiBestCost			= pMe->uiSadCost;
+  uint32_t uiCostMin;
+  int32_t iIndexMinPos;
+  kpEncMb	= &uiMatrixEnc[0][0];
+  pRef	= &uiMatrixRef[0][0];
+
+  while(iCountLoop8 > 0) {
+    CalcMvdCostx8_c(uiBaseCost, iTargetPos, pMvdTable, kiFixedMvd);
+    uiCostMin = pSampleSadHor8( kpEncMb, 16, pRef, kiMatrixStride, uiBaseCost, &iIndexMinPos );
+    if (uiCostMin < uiBestCost) {
+      uiBestCost	= uiCostMin;
+      iBestPos		= iTargetPos+iIndexMinPos;
+    }
+    iTargetPos	+= 8;
+    pRef += 8;
+    -- iCountLoop8;
+  }
+  if (kiRemainingVectors > 0) {
+    kpEncMb	= pMe->pEncMb;
+    pRef	= &pMe->pColoRefMb[(iTargetPos - kiCurMeBlockPix)*kiRefStride];
+    while (iTargetPos < kiMaxPos) {
+      const uint16_t pMvdCost	= pMvdTable[iTargetPos<<2];
+      uint32_t uiSadCost	= pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + pMvdCost);
+      if (uiSadCost < uiBestCost) {
+        uiBestCost	= uiSadCost;
+        iBestPos	= iTargetPos;
+      }
+      pRef += kiRefStride;
+      ++iTargetPos;
+    }
+  }
+  if (uiBestCost < pMe->uiSadCost) {
+    SMVUnitXY sBestMv;
+    sBestMv.iMvX = 0;
+    sBestMv.iMvY = iBestPos - kiCurMeBlockPix;
+    UpdateMeResults( sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvY*kiRefStride], pMe );
+  }
 }
-void LineFullSearch_c(  void *pFunc, void *vpMe,
-                          uint16_t* pMvdTable, const int32_t kiFixedMvd,
-                          const int32_t kiEncStride, const int32_t kiRefStride,
-                          const int32_t kiMinPos, const int32_t kiMaxPos,
+
+void HorizontalFullSearchUsingSSE41( void *pFunc, void *vpMe,
+                                      uint16_t* pMvdTable, const int32_t kiFixedMvd,
+                                      const int32_t kiEncStride, const int32_t kiRefStride,
+                                      const int32_t kiMinPos, const int32_t kiMaxPos,
+                                      const bool bVerticalSearch )
+{
+  SWelsFuncPtrList *pFuncList      = static_cast<SWelsFuncPtrList *>(pFunc);
+  SWelsME *pMe				                    = static_cast<SWelsME *>(vpMe);
+  uint8_t *kpEncMb	= pMe->pEncMb;
+  const int32_t kiCurMeBlockPix	= pMe->iCurMeBlockPixX;
+  uint8_t *pRef			      = &pMe->pColoRefMb[kiMinPos - kiCurMeBlockPix];
+  const int32_t kIsBlock16x16	= pMe->uiBlockSize == BLOCK_16x16;
+  PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16];
+  PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
+  ENFORCE_STACK_ALIGN_1D( uint16_t, uiBaseCost, 8, 16 );
+  const int32_t kiNumVector	= kiMaxPos - kiMinPos;
+  int32_t iCountLoop8	= kiNumVector >> 3;
+  const int32_t kiRemainingLoop8	= kiNumVector & 7;
+  int32_t iTargetPos			= kiMinPos;
+  int16_t iBestPos				= pMe->sMv.iMvX;
+  uint32_t uiBestCost			= pMe->uiSadCost;
+  uint32_t uiCostMin;
+  int32_t iIndexMinPos;
+
+  while(iCountLoop8 > 0) {
+    CalcMvdCostx8_c(uiBaseCost, iTargetPos, pMvdTable, kiFixedMvd);
+    uiCostMin = pSampleSadHor8( kpEncMb, kiEncStride, pRef, kiRefStride, uiBaseCost, &iIndexMinPos );
+    if (uiCostMin < uiBestCost) {
+      uiBestCost	= uiCostMin;
+      iBestPos		= iTargetPos+iIndexMinPos;
+    }
+    iTargetPos	+= 8;
+    pRef += 8;
+    -- iCountLoop8;
+  }
+  if ( kiRemainingLoop8 > 0 ) {
+    while (iTargetPos < kiMaxPos) {
+      const uint16_t pMvdCost	= pMvdTable[iTargetPos<<2];
+      uint32_t uiSadCost	= pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + pMvdCost);
+      if (uiSadCost < uiBestCost) {
+        uiBestCost	= uiSadCost;
+        iBestPos	= iTargetPos;
+      }
+      ++pRef;
+      ++iTargetPos;
+    }
+  }
+  if (uiBestCost < pMe->uiSadCost) {
+    SMVUnitXY sBestMv;
+    sBestMv.iMvX = iBestPos - kiCurMeBlockPix;
+    sBestMv.iMvY = 0;
+    UpdateMeResults( sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvY], pMe );
+  }
+}
+#endif
+void LineFullSearch_c(	void *pFunc, void *vpMe,
+													uint16_t* pMvdTable, const int32_t kiFixedMvd,
+													const int32_t kiEncStride, const int32_t kiRefStride,
+													const int32_t kiMinPos, const int32_t kiMaxPos,
                           const bool bVerticalSearch ) {
   SWelsFuncPtrList *pFuncList      = static_cast<SWelsFuncPtrList *>(pFunc);
   SWelsME *pMe                            = static_cast<SWelsME *>(vpMe);
@@ -346,8 +483,8 @@
 
 void WelsMotionCrossSearch(SWelsFuncPtrList *pFuncList,  SWelsME * pMe,
 											const SSlice* pSlice, const int32_t kiEncStride,  const int32_t kiRefStride) {
-  PLineFullSearchFunc pfVerticalFullSearchFunc	= pFuncList->pfLineFullSearch;
-  PLineFullSearchFunc pfHorizontalFullSearchFunc	= pFuncList->pfLineFullSearch;
+  PLineFullSearchFunc pfVerticalFullSearchFunc	= pFuncList->pfVerticalFullSearch;
+  PLineFullSearchFunc pfHorizontalFullSearchFunc	= pFuncList->pfHorizontalFullSearch;
 
   const int32_t iCurMeBlockPixX = pMe->iCurMeBlockPixX;
   const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
--- /dev/null
+++ b/codec/encoder/core/x86/matrix_transpose.asm
@@ -1,0 +1,395 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;in:  m0, m1, m2, m3, m4, m5, m6, m7
+;out: m0, m3, m5, m2, m7, m1, m6, m4
+%macro TRANSPOSE_8x8B_MMX 10
+	MMX_XSwap bw,  %1, %2, %8
+	MMX_XSwap bw,  %3, %4, %2
+	MMX_XSwap bw,  %5, %6, %4
+	movq	%6, %9
+	movq	%10, %4
+	MMX_XSwap bw,  %7, %6, %4
+
+	MMX_XSwap wd,  %1, %3, %6
+	MMX_XSwap wd,  %8, %2, %3
+	MMX_XSwap wd,  %5, %7, %2
+	movq	%7, %10
+	movq	%10, %3
+	MMX_XSwap wd,  %7, %4, %3
+
+	MMX_XSwap dq,  %1, %5, %4
+	MMX_XSwap dq,  %6, %2, %5
+	MMX_XSwap dq,  %8, %7, %2
+	movq	%7, %10
+	movq	%10, %5
+	MMX_XSwap dq,  %7, %3, %5
+
+	movq	%3, %10
+%endmacro
+
+;in: m0, m3, m5, m2, m7, m1, m6, m4
+%macro TRANSPOSE8x8_WRITE_MMX 2	; dst, dst_stride
+	movq [%1], mm0			; result of line 1, x8 bytes
+	movq [%1+%2], mm3		; result of line 2
+	lea %1, [%1+2*%2]
+	movq [%1], mm5			; result of line 3
+	movq [%1+%2], mm2		; result of line 4
+	lea %1, [%1+2*%2]
+	movq [%1], mm7			; result of line 5
+	movq [%1+%2], mm1		; result of line 6
+	lea %1, [%1+2*%2]
+	movq [%1], mm6			; result of line 7
+	movq [%1+%2], mm4		; result of line 8
+%endmacro
+
+;in: m0, m3, m5, m2, m7, m1, m6, m4
+%macro TRANSPOSE8x8_WRITE_ALT_MMX 3	; dst, dst_stride, reg32
+	movq [%1], mm0			; result of line 1, x8 bytes
+	movq [%1+%2], mm3		; result of line 2
+	lea %3, [%1+2*%2]
+	movq [%3], mm5			; result of line 3
+	movq [%3+%2], mm2		; result of line 4
+	lea %3, [%3+2*%2]
+	movq [%3], mm7			; result of line 5
+	movq [%3+%2], mm1		; result of line 6
+	lea %3, [%3+2*%2]
+	movq [%3], mm6			; result of line 7
+	movq [%3+%2], mm4		; result of line 8
+%endmacro	; end of TRANSPOSE8x8_WRITE_ALT_MMX
+
+; for transpose 16x8
+
+;in:  m0, m1, m2, m3, m4, m5, m6, m7
+;out: m4, m2, m3, m7, m5, m1, m6, m0
+%macro TRANSPOSE_8x16B_SSE2		10
+	SSE2_XSawp bw,  %1, %2, %8
+	SSE2_XSawp bw,  %3, %4, %2
+	SSE2_XSawp bw,  %5, %6, %4
+	movdqa	%6, %9
+	movdqa	%10, %4
+	SSE2_XSawp bw,  %7, %6, %4
+
+	SSE2_XSawp wd,  %1, %3, %6
+	SSE2_XSawp wd,  %8, %2, %3
+	SSE2_XSawp wd,  %5, %7, %2
+	movdqa	%7, %10
+	movdqa	%10, %3
+	SSE2_XSawp wd,  %7, %4, %3
+
+	SSE2_XSawp dq,  %1, %5, %4
+	SSE2_XSawp dq,  %6, %2, %5
+	SSE2_XSawp dq,  %8, %7, %2
+	movdqa	%7, %10
+	movdqa	%10, %5
+	SSE2_XSawp dq,  %7, %3, %5
+
+	SSE2_XSawp qdq,  %1, %8, %3
+	SSE2_XSawp qdq,  %4, %2, %8
+	SSE2_XSawp qdq,  %6, %7, %2
+	movdqa	%7, %10
+	movdqa	%10, %1
+	SSE2_XSawp qdq,  %7, %5, %1
+	movdqa	%5, %10
+%endmacro	; end of TRANSPOSE_8x16B_SSE2
+
+
+%macro TRANSPOSE8x16_WRITE_SSE2	2	; dst, dst_stride
+	movq [%1], xmm4			; result of line 1, x8 bytes
+	movq [%1+%2], xmm2		; result of line 2
+	lea %1, [%1+2*%2]
+	movq [%1], xmm3			; result of line 3
+	movq [%1+%2], xmm7		; result of line 4
+
+	lea %1, [%1+2*%2]
+	movq [%1], xmm5			; result of line 5
+	movq [%1+%2], xmm1		; result of line 6
+	lea %1, [%1+2*%2]
+	movq [%1], xmm6			; result of line 7
+	movq [%1+%2], xmm0		; result of line 8
+
+	lea %1, [%1+2*%2]
+	movhpd [%1], xmm4		; result of line 9
+	movhpd [%1+%2], xmm2	; result of line 10
+	lea %1, [%1+2*%2]
+	movhpd [%1], xmm3		; result of line 11
+	movhpd [%1+%2], xmm7	; result of line 12
+
+	lea %1, [%1+2*%2]
+	movhpd [%1], xmm5		; result of line 13
+	movhpd [%1+%2], xmm1	; result of line 14
+	lea %1, [%1+2*%2]
+	movhpd [%1], xmm6		; result of line 15
+	movhpd [%1+%2], xmm0	; result of line 16
+%endmacro	; end of TRANSPOSE8x16_WRITE_SSE2
+
+%macro TRANSPOSE8x16_WRITE_ALT_SSE2	3	; dst, dst_stride, reg32
+	movq [%1], xmm4			; result of line 1, x8 bytes
+	movq [%1+%2], xmm2		; result of line 2
+	lea %3, [%1+2*%2]
+	movq [%3], xmm3			; result of line 3
+	movq [%3+%2], xmm7		; result of line 4
+
+	lea %3, [%3+2*%2]
+	movq [%3], xmm5			; result of line 5
+	movq [%3+%2], xmm1		; result of line 6
+	lea %3, [%3+2*%2]
+	movq [%3], xmm6			; result of line 7
+	movq [%3+%2], xmm0		; result of line 8
+
+	lea %3, [%3+2*%2]
+	movhpd [%3], xmm4		; result of line 9
+	movhpd [%3+%2], xmm2	; result of line 10
+	lea %3, [%3+2*%2]
+	movhpd [%3], xmm3		; result of line 11
+	movhpd [%3+%2], xmm7	; result of line 12
+
+	lea %3, [%3+2*%2]
+	movhpd [%3], xmm5		; result of line 13
+	movhpd [%3+%2], xmm1	; result of line 14
+	lea %3, [%3+2*%2]
+	movhpd [%3], xmm6		; result of line 15
+	movhpd [%3+%2], xmm0	; result of line 16
+%endmacro	; end of TRANSPOSE8x16_WRITE_ALT_SSE2
+
+
+SECTION .text
+
+WELS_EXTERN TransposeMatrixBlock16x16_sse2
+; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride );
+	push r4
+	push r5
+	%assign push_num 2
+	LOAD_4_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+
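+	; carve a 16-byte-aligned scratch slot below the stack pointer (r7 aliases
+	; rsp/esp); the transpose macro spills one register there via its [r7] argument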
+	mov r4, r7
+	and r4, 0Fh
+	sub r7, 10h
+	sub r7, r4
+	lea r5, [r3+r3*2]
+	; top 8x16 block
+	movdqa xmm0, [r2]
+	movdqa xmm1, [r2+r3]
+	movdqa xmm2, [r2+r3*2]
+	movdqa xmm3, [r2+r5]
+	lea r2, [r2+r3*4]
+	movdqa xmm4, [r2]
+	movdqa xmm5, [r2+r3]
+	movdqa xmm6, [r2+r3*2]
+
+	;in:  m0, m1, m2, m3, m4, m5, m6, m7
+	;out: m4, m2, m3, m7, m5, m1, m6, m0
+	TRANSPOSE_8x16B_SSE2	xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
+
+	TRANSPOSE8x16_WRITE_SSE2		r0, r1
+
+	; bottom 8x16 block
+	lea	r2, [r2+r3*4]
+	movdqa xmm0, [r2]
+	movdqa xmm1, [r2+r3]
+	movdqa xmm2, [r2+r3*2]
+	movdqa xmm3, [r2+r5]
+	lea r2, [r2+r3*4]
+	movdqa xmm4, [r2]
+	movdqa xmm5, [r2+r3]
+	movdqa xmm6, [r2+r3*2]
+
+	;in:  m0, m1, m2, m3, m4, m5, m6, m7
+	;out: m4, m2, m3, m7, m5, m1, m6, m0
+	TRANSPOSE_8x16B_SSE2	xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
+
+	mov r5, r1
+	sal r5, 4
+	sub r0, r5
+	lea r0, [r0+r1*2+8]
+	TRANSPOSE8x16_WRITE_SSE2		r0, r1
+
+	add r7, r4
+	add r7, 10h
+	POP_XMM
+	LOAD_4_PARA_POP
+	pop r5
+	pop r4
+	ret
+
+WELS_EXTERN TransposeMatrixBlocksx16_sse2
+; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks );
+	push r5
+	push r6
+	%assign push_num 2
+	LOAD_5_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION  r1, r1d
+	SIGN_EXTENSION  r3, r3d
+	SIGN_EXTENSION  r4, r4d
+	mov r5, r7
+	and r5, 0Fh
+	sub r7, 10h
+	sub r7, r5
+TRANSPOSE_LOOP_SSE2:
+	; explicitly load the next loop's data ahead of time (cache warm-up)
+	lea	r6, [r2+r3*8]
+	push r4
+%rep 8
+	mov	r4, [r6]
+	mov	r4, [r6+r3]
+	lea	r6, [r6+r3*2]
+%endrep
+	pop r4
+	; top 8x16 block
+	movdqa xmm0, [r2]
+	movdqa xmm1, [r2+r3]
+	lea r2, [r2+r3*2]
+	movdqa xmm2, [r2]
+	movdqa xmm3, [r2+r3]
+	lea r2, [r2+r3*2]
+	movdqa xmm4, [r2]
+	movdqa xmm5, [r2+r3]
+	lea r2, [r2+r3*2]
+	movdqa xmm6, [r2]
+
+	;in:  m0, m1, m2, m3, m4, m5, m6, m7
+	;out: m4, m2, m3, m7, m5, m1, m6, m0
+	TRANSPOSE_8x16B_SSE2	xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
+	TRANSPOSE8x16_WRITE_ALT_SSE2		r0, r1, r6
+	lea	r2, [r2+r3*2]
+
+	; bottom 8x16 block
+	movdqa xmm0, [r2]
+	movdqa xmm1, [r2+r3]
+	lea	r2, [r2+r3*2]
+	movdqa xmm2, [r2]
+	movdqa xmm3, [r2+r3]
+	lea r2, [r2+r3*2]
+	movdqa xmm4, [r2]
+	movdqa xmm5, [r2+r3]
+	lea	r2, [r2+r3*2]
+	movdqa xmm6, [r2]
+
+	;in:  m0, m1, m2, m3, m4, m5, m6, m7
+	;out: m4, m2, m3, m7, m5, m1, m6, m0
+	TRANSPOSE_8x16B_SSE2	xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
+	TRANSPOSE8x16_WRITE_ALT_SSE2		r0+8, r1, r6
+	lea	r2, [r2+r3*2]
+	lea r0, [r0+16]
+	dec r4
+	jg near TRANSPOSE_LOOP_SSE2
+
+	add r7, r5
+	add r7, 10h
+	POP_XMM
+	LOAD_5_PARA_POP
+	pop r6
+	pop r5
+	ret
+
+WELS_EXTERN TransposeMatrixBlock8x8_mmx
+; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride );
+	%assign push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENSION  r1, r1d
+	SIGN_EXTENSION  r3, r3d
+	sub	r7, 8
+
+	movq mm0, [r2]
+	movq mm1, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm2, [r2]
+	movq mm3, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm4, [r2]
+	movq mm5, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm6, [r2]
+
+	;in:  m0, m1, m2, m3, m4, m5, m6, m7
+	;out: m0, m3, m5, m2, m7, m1, m6, m4
+	TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
+
+	TRANSPOSE8x8_WRITE_MMX r0, r1
+
+	emms
+	add r7, 8
+	LOAD_4_PARA_POP
+	ret
+
+WELS_EXTERN TransposeMatrixBlocksx8_mmx
+; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks );
+	push r5
+	push r6
+	%assign push_num 2
+	LOAD_5_PARA
+	SIGN_EXTENSION  r1, r1d
+	SIGN_EXTENSION  r3, r3d
+	SIGN_EXTENSION  r4, r4d
+	sub	r7, 8
+
+	lea	r5, [r2+r3*8]
+
+TRANSPOSE_BLOCKS_X8_LOOP_MMX:
+	; explicitly load the next loop's data ahead of time (cache warm-up)
+%rep 4
+	mov r6, [r5]
+	mov r6, [r5+r3]
+	lea	r5, [r5+r3*2]
+%endrep
+	movq mm0, [r2]
+	movq mm1, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm2, [r2]
+	movq mm3, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm4, [r2]
+	movq mm5, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm6, [r2]
+
+	;in:  m0, m1, m2, m3, m4, m5, m6, m7
+	;out: m0, m3, m5, m2, m7, m1, m6, m4
+	TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
+
+	TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
+	lea r0, [r0+8]
+	lea r2, [r2+2*r3]
+	dec r4
+	jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX
+
+	emms
+	add r7, 8
+	LOAD_5_PARA_POP
+	pop r6
+	pop r5
+	ret
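
For reference, every routine in this file computes a plain byte-wise
transpose; a scalar model of TransposeMatrixBlock8x8_mmx (illustration only,
not part of the patch):

    #include <cstdint>

    // Scalar model: dst[x][y] = src[y][x] over an 8x8 block of bytes.
    static void TransposeBlock8x8_ref (uint8_t* pDst, int32_t kiDstStride,
                                       const uint8_t* pSrc, int32_t kiSrcStride) {
      for (int32_t y = 0; y < 8; ++y)
        for (int32_t x = 0; x < 8; ++x)
          pDst[x * kiDstStride + y] = pSrc[y * kiSrcStride + x];
    }
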
--- /dev/null
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -1,0 +1,225 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+SECTION .text
+
+;**********************************************************************************************************************************
+;
+;	uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
+;
+;	\note:
+;		src must be 16-byte aligned; alignment of ref is optional
+;	\return value:
+;		returns the minimal SAD cost; the corresponding index is returned through index_min_cost
+;**********************************************************************************************************************************
+; try 8 mv via offset
+; xmm7 store sad costs
+%macro   SAD_16x16_LINE_SSE41  4	; src, ref, stride_src, stride_ref
+    movdqa		xmm0, [%1]
+    movdqu		xmm1, [%2]
+    movdqu		xmm2, [%2+8h]
+    movdqa		xmm3, xmm1
+    movdqa		xmm4, xmm2
+
+    mpsadbw		xmm1, xmm0, 0	; 000 B
+    paddw		xmm7, xmm1		; accumulate cost
+
+    mpsadbw		xmm3, xmm0, 5	; 101 B
+    paddw		xmm7, xmm3		; accumulate cost
+
+    mpsadbw		xmm2, xmm0, 2	; 010 B
+    paddw		xmm7, xmm2		; accumulate cost
+
+    mpsadbw		xmm4, xmm0, 7	; 111 B
+    paddw		xmm7, xmm4		; accumulate cost
+
+    add			%1, %3
+    add			%2, %4
+%endmacro	; end of SAD_16x16_LINE_SSE41
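+; Each mpsadbw above produces eight word SADs of one 4-byte group of the
+; source row (imm bits 1:0 select the group) against eight consecutive
+; reference offsets starting at byte 0 or byte 4 of the reference register
+; (imm bit 2). Selectors 0/5 pair with [ref] for source bytes 0-7 and
+; selectors 2/7 pair with [ref+8] for source bytes 8-15, so after the paddw
+; chain each word lane i of xmm7 holds the full 16-byte-row SAD at offset i.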
+%macro   SAD_16x16_LINE_SSE41E  4	; src, ref, stride_src, stride_ref
+    movdqa		xmm0, [%1]
+    movdqu		xmm1, [%2]
+    movdqu		xmm2, [%2+8h]
+    movdqa		xmm3, xmm1
+    movdqa		xmm4, xmm2
+
+    mpsadbw		xmm1, xmm0, 0	; 000 B
+    paddw		xmm7, xmm1		; accumulate cost
+
+    mpsadbw		xmm3, xmm0, 5	; 101 B
+    paddw		xmm7, xmm3		; accumulate cost
+
+    mpsadbw		xmm2, xmm0, 2	; 010 B
+    paddw		xmm7, xmm2		; accumulate cost
+
+    mpsadbw		xmm4, xmm0, 7	; 111 B
+    paddw		xmm7, xmm4		; accumulate cost
+%endmacro	; end of SAD_16x16_LINE_SSE41E
+
+WELS_EXTERN SampleSad16x16Hor8_sse41
+    ;push ebx
+    ;push esi
+    ;mov eax, [esp+12]	;   src
+    ;mov ecx, [esp+16]	;   stride_src
+    ;mov ebx, [esp+20]	;   ref
+    ;mov edx, [esp+24]	;   stride_ref
+    ;mov esi, [esp+28]	;   base_cost
+    %assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION	r1, r1d
+    SIGN_EXTENSION	r3, r3d
+    pxor	xmm7,	xmm7
+
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41E	r0, r2, r1, r3
+
+    pxor	xmm0,	xmm0
+    movdqa	xmm6,	xmm7
+    punpcklwd	xmm6,	xmm0
+    punpckhwd	xmm7,	xmm0
+
+    movdqa	xmm5,	[r4]
+    movdqa	xmm4,	xmm5
+    punpcklwd	xmm4,	xmm0
+    punpckhwd	xmm5,	xmm0
+
+    paddd	xmm4,	xmm6
+    paddd	xmm5,	xmm7
+    movdqa	xmm3,	xmm4
+    pminud	xmm3,	xmm5
+    pshufd	xmm2,	xmm3,	01001110B
+    pminud	xmm2,	xmm3
+    pshufd	xmm3,	xmm2,	10110001B
+    pminud	xmm2,	xmm3
+    movd	retrd,	xmm2
+    pcmpeqd	xmm4,	xmm2
+    movmskps	r2d, xmm4
+    bsf		r1d,	r2d
+    jnz	near WRITE_INDEX
+
+    pcmpeqd	xmm5,	xmm2
+    movmskps	r2d, xmm5
+    bsf		r1d,	r2d
+    add		r1d,	4
+
+WRITE_INDEX:
+    mov		[r5],	r1d
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+
+;**********************************************************************************************************************************
+;
+;	uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
+;
+;	\note:
+;		16-byte alignment of src and ref is optional, since inter 8x8 blocks need not be aligned
+;	\return value:
+;		returns the minimal SAD cost; the corresponding index is returned through index_min_cost
+;
+;**********************************************************************************************************************************
+; try 8 mv via offset
+; xmm7 store sad costs
+%macro   SAD_8x8_LINE_SSE41  4	; src, ref, stride_src, stride_ref
+    movdqu		xmm0, [%1]
+    movdqu		xmm1, [%2]
+    movdqa		xmm2, xmm1
+
+    mpsadbw		xmm1, xmm0, 0	; 000 B
+    paddw		xmm7, xmm1		; accumulate cost
+
+    mpsadbw		xmm2, xmm0, 5	; 101 B
+    paddw		xmm7, xmm2		; accumulate cost
+
+    add			%1, %3
+    add			%2, %4
+%endmacro	; end of SAD_8x8_LINE_SSE41
+%macro   SAD_8x8_LINE_SSE41E  4	; src, ref, stride_src, stride_ref
+    movdqu		xmm0, [%1]
+    movdqu		xmm1, [%2]
+    movdqa		xmm2, xmm1
+
+    mpsadbw		xmm1, xmm0, 0	; 000 B
+    paddw		xmm7, xmm1		; accumulate cost
+
+    mpsadbw		xmm2, xmm0, 5	; 101 B
+    paddw		xmm7, xmm2		; accumulate cost
+%endmacro	; end of SAD_8x8_LINE_SSE41E
+
+WELS_EXTERN SampleSad8x8Hor8_sse41
+    %assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION	r1, r1d
+    SIGN_EXTENSION	r3, r3d
+    movdqa xmm7, [r4]	;	load base cost list
+
+    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
+
+    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41E	r0, r2, r1, r3
+
+    phminposuw	xmm0, xmm7	; horizontal search for the minimal sad cost and its index
+    movd	retrd, xmm0	; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX
+    mov		r1d, retrd
+    and		retrd, 0xFFFF
+    sar		r1d, 16
+    mov		[r5], r1d
+
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
+
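
A scalar model of the Hor8 kernel contract (what both routines compute, minus
the SIMD mechanics; reference only, not part of the patch):

    #include <cstdint>

    // SAD of a WxH block at 8 consecutive horizontal offsets, plus the
    // per-offset base cost; returns the minimum and stores its offset index.
    static uint32_t SampleSadHor8_ref (const uint8_t* pEnc, int32_t iEncStride,
                                       const uint8_t* pRef, int32_t iRefStride,
                                       const uint16_t pBaseCost[8],
                                       int32_t* pIndexMinCost,
                                       int32_t iWidth, int32_t iHeight) {
      uint32_t uiBestCost = 0xFFFFFFFF;
      for (int32_t iOffset = 0; iOffset < 8; ++iOffset) {
        uint32_t uiSad = pBaseCost[iOffset];
        for (int32_t y = 0; y < iHeight; ++y)
          for (int32_t x = 0; x < iWidth; ++x) {
            const int32_t iDiff = pEnc[y * iEncStride + x]
                                - pRef[y * iRefStride + x + iOffset];
            uiSad += (iDiff >= 0) ? iDiff : -iDiff;
          }
        if (uiSad < uiBestCost) {
          uiBestCost = uiSad;
          *pIndexMinCost = iOffset;
        }
      }
      return uiBestCost;
    }
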
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -40,8 +40,10 @@
 	$(ENCODER_SRCDIR)/core/x86/coeff.asm\
 	$(ENCODER_SRCDIR)/core/x86/dct.asm\
 	$(ENCODER_SRCDIR)/core/x86/intra_pred.asm\
+	$(ENCODER_SRCDIR)/core/x86/matrix_transpose.asm\
 	$(ENCODER_SRCDIR)/core/x86/memzero.asm\
 	$(ENCODER_SRCDIR)/core/x86/quant.asm\
+	$(ENCODER_SRCDIR)/core/x86/sample_sc.asm\
 	$(ENCODER_SRCDIR)/core/x86/score.asm\
 
 ENCODER_OBJS += $(ENCODER_ASM_SRCS:.asm=.$(OBJ))
--- a/test/encoder/EncUT_MotionEstimate.cpp
+++ b/test/encoder/EncUT_MotionEstimate.cpp
@@ -5,6 +5,7 @@
 #include "sample.h"
 #include "svc_motion_estimate.h"
 #include "wels_func_ptr_def.h"
+#include "cpu.h"
 
 
 using namespace WelsSVCEnc;
@@ -43,11 +44,12 @@
     m_iMaxSearchBlock = 16;
     m_uiMvdTableSize	=  (1 + (648 << 1));
 
+    pMa = new CMemoryAlign(0);
     m_pRefPic = static_cast<uint8_t *>
-    (malloc(m_iWidth*m_iHeight));
+    (pMa->WelsMalloc(m_iWidth*m_iHeight, "RefPic"));
     ASSERT_TRUE( NULL != m_pRefPic );
     m_pSrcBlock = static_cast<uint8_t *>
-    (malloc(m_iMaxSearchBlock*m_iMaxSearchBlock));
+    (pMa->WelsMalloc(m_iMaxSearchBlock*m_iMaxSearchBlock, "SrcBlock"));
     ASSERT_TRUE( NULL != m_pSrcBlock );
     m_pMvdCostTable=new uint16_t[52*m_uiMvdTableSize];
     ASSERT_TRUE( NULL != m_pMvdCostTable );
@@ -54,8 +56,9 @@
   }
   virtual void TearDown() {
     delete [] m_pMvdCostTable;
-    free( m_pRefPic );
-    free( m_pSrcBlock );
+    pMa->WelsFree( m_pRefPic, "RefPic");
+    pMa->WelsFree( m_pSrcBlock, "SrcBlock");
+    delete pMa;
   }
 public:
   uint8_t *m_pRefPic;
@@ -66,6 +69,7 @@
   int32_t m_iWidth;
   int32_t m_iHeight;
   int32_t m_iMaxSearchBlock;
+  CMemoryAlign *pMa;
 };
 
 
@@ -243,4 +247,134 @@
     ASSERT_TRUE(iTryTimes > 0);
    //it is possible that the ref at a different position is identical, but that should have low probability
   }
-}
\ No newline at end of file
+}
+
+#ifdef X86_ASM
+TEST_F(MotionEstimateTest, TestVerticalSearch_SSE41)
+{
+  const int32_t kiMaxBlock16Sad = 72000;//a rough number
+  SWelsFuncPtrList sFuncList;
+  SWelsME sMe;
+
+  srand((uint32_t)time(NULL));
+  const uint8_t kuiQp = rand()%52;
+  InitMe(kuiQp, 648, m_uiMvdTableSize, m_pMvdCostTable, &sMe);
+
+  SMVUnitXY sTargetMv;
+  WelsInitSampleSadFunc( &sFuncList, 0 );//test c functions
+  WelsInitMeFunc(&sFuncList, WELS_CPU_SSE41, 1);
+
+  uint8_t *pRefPicCenter = m_pRefPic+(m_iHeight/2)*m_iWidth+(m_iWidth/2);
+  sMe.iCurMeBlockPixX = (m_iWidth/2);
+  sMe.iCurMeBlockPixY = (m_iHeight/2);
+
+  bool bDataGeneratorSucceed = false;
+  bool bFoundMatch = false;
+  int32_t iTryTimes=100;
+
+  sTargetMv.iMvX = 0;
+  sTargetMv.iMvY = WELS_MAX(INTPEL_NEEDED_MARGIN, rand()%m_iHeight-INTPEL_NEEDED_MARGIN);
+  bDataGeneratorSucceed = false;
+  bFoundMatch = false;
+  while (!bFoundMatch && (iTryTimes--)>0) {
+    if (!YUVPixelDataGenerator( m_pRefPic, m_iWidth, m_iHeight, m_iWidth ))
+      continue;
+
+    bDataGeneratorSucceed = true;
+    CopyTargetBlock( m_pSrcBlock, 16, sTargetMv, m_iWidth, pRefPicCenter);
+
+    //clean the sMe status
+    sMe.uiBlockSize = rand()%5;
+    sMe.pEncMb = m_pSrcBlock;
+    sMe.pRefMb = pRefPicCenter;
+    sMe.pColoRefMb = pRefPicCenter;
+    sMe.sMv.iMvX = sMe.sMv.iMvY = 0;
+    sMe.uiSadCost = sMe.uiSatdCost = kiMaxBlock16Sad;
+    const int32_t iCurMeBlockPixX = sMe.iCurMeBlockPixX;
+    const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
+    const int32_t iCurMeBlockPixY = sMe.iCurMeBlockPixY;
+    const int32_t iCurMeBlockQpelPixY = ((iCurMeBlockPixY)<<2);
+    uint16_t* pMvdCostX = sMe.pMvdCost - iCurMeBlockQpelPixX - sMe.sMvp.iMvX;	//do the offset here
+    uint16_t* pMvdCostY = sMe.pMvdCost - iCurMeBlockQpelPixY - sMe.sMvp.iMvY;
+    VerticalFullSearchUsingSSE41 ( &sFuncList, &sMe,
+                      pMvdCostY, pMvdCostX[ iCurMeBlockQpelPixX ],
+                      m_iMaxSearchBlock, m_iWidth,
+                      INTPEL_NEEDED_MARGIN,
+                      m_iHeight-INTPEL_NEEDED_MARGIN, true );
+
+    //the final selection may be affected by the MVD cost, i.e. a smaller MvY may be preferred
+    bFoundMatch = (sMe.sMv.iMvX==0
+                   &&(sMe.sMv.iMvY==sTargetMv.iMvY||abs(sMe.sMv.iMvY)<abs(sTargetMv.iMvY)));
+    //printf("TestVerticalSearch Target: %d,%d\n", sTargetMv.iMvX, sTargetMv.iMvY);
+  }
+  if (bDataGeneratorSucceed) {
+    //if DataGenerator never succeeded, there is no point in checking iTryTimes
+    ASSERT_TRUE(iTryTimes > 0);
+    //it is possible that the ref at a different position is identical, but that should have low probability
+  }
+}
+
+TEST_F(MotionEstimateTest, TestHorizontalSearch_SSE41)
+{
+  const int32_t kiMaxBlock16Sad = 72000;//a rough number
+  SWelsFuncPtrList sFuncList;
+  SWelsME sMe;
+
+  srand((uint32_t)time(NULL));
+  const uint8_t kuiQp = rand()%52;
+  InitMe(kuiQp, 648, m_uiMvdTableSize, m_pMvdCostTable, &sMe);
+
+  SMVUnitXY sTargetMv;
+  WelsInitSampleSadFunc( &sFuncList, 0 );//test c functions
+  WelsInitMeFunc(&sFuncList, WELS_CPU_SSE41, 1);
+
+  uint8_t *pRefPicCenter = m_pRefPic+(m_iHeight/2)*m_iWidth+(m_iWidth/2);
+  sMe.iCurMeBlockPixX = (m_iWidth/2);
+  sMe.iCurMeBlockPixY = (m_iHeight/2);
+
+  bool bDataGeneratorSucceed = false;
+  bool bFoundMatch = false;
+  int32_t iTryTimes=100;
+
+  sTargetMv.iMvX = WELS_MAX(INTPEL_NEEDED_MARGIN, rand()%m_iWidth-INTPEL_NEEDED_MARGIN);
+  sTargetMv.iMvY = 0;
+  bDataGeneratorSucceed = false;
+  bFoundMatch = false;
+  while (!bFoundMatch && (iTryTimes--)>0) {
+    if (!YUVPixelDataGenerator( m_pRefPic, m_iWidth, m_iHeight, m_iWidth ))
+      continue;
+
+    bDataGeneratorSucceed = true;
+    CopyTargetBlock( m_pSrcBlock, 16, sTargetMv, m_iWidth, pRefPicCenter);
+
+    //clean the sMe status
+    sMe.uiBlockSize = rand()%5;
+    sMe.pEncMb = m_pSrcBlock;
+    sMe.pRefMb = pRefPicCenter;
+    sMe.pColoRefMb = pRefPicCenter;
+    sMe.sMv.iMvX = sMe.sMv.iMvY = 0;
+    sMe.uiSadCost = sMe.uiSatdCost = kiMaxBlock16Sad;
+    const int32_t iCurMeBlockPixX = sMe.iCurMeBlockPixX;
+    const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
+    const int32_t iCurMeBlockPixY = sMe.iCurMeBlockPixY;
+    const int32_t iCurMeBlockQpelPixY = ((iCurMeBlockPixY)<<2);
+    uint16_t* pMvdCostX = sMe.pMvdCost - iCurMeBlockQpelPixX - sMe.sMvp.iMvX;	//do the offset here
+    uint16_t* pMvdCostY = sMe.pMvdCost - iCurMeBlockQpelPixY - sMe.sMvp.iMvY;
+    HorizontalFullSearchUsingSSE41 ( &sFuncList, &sMe,
+                      pMvdCostX, pMvdCostY[ iCurMeBlockQpelPixY ],
+                      m_iMaxSearchBlock, m_iWidth,
+                      INTPEL_NEEDED_MARGIN,
+                      m_iWidth-INTPEL_NEEDED_MARGIN, false );
+
+    //the final selection may be affected by the MVD cost, i.e. a smaller MvX may be preferred
+    bFoundMatch = (sMe.sMv.iMvY==0
+                   &&(sMe.sMv.iMvX==sTargetMv.iMvX||abs(sMe.sMv.iMvX)<abs(sTargetMv.iMvX)));
+    //printf("TestHorizontalSearch Target: %d,%d\n", sTargetMv.iMvX, sTargetMv.iMvY);
+  }
+  if (bDataGeneratorSucceed) {
+    //if DataGenerator never succeeded, there is no point in checking iTryTimes
+    ASSERT_TRUE(iTryTimes > 0);
+    //it is possible that the ref at a different position is identical, but that should have low probability
+  }
+}
+#endif
\ No newline at end of file