shithub: openh264

Download patch

ref: 6cb48fc54709b7e9a72b3218a17958c3787c10bf
parent: bbc0cb2b2cd7e6ad1421477c374b98a4dda3918b
parent: f41424183406efe176b9fcb52fbc92e3fc7923e8
author: volvet <qizh@cisco.com>
date: Wed Apr 16 13:41:17 EDT 2014

Merge pull request #690 from sijchen/fme_merge65

[Encoder ME] Add calling of FME preprocess calculation

Approved by Xiaolin. 

--- a/codec/encoder/core/inc/picture.h
+++ b/codec/encoder/core/inc/picture.h
@@ -56,18 +56,6 @@
   bool      bRefBlockFeatureCalculated; // flag of whether pre-process is done
 } SScreenBlockFeatureStorage; //should be stored with RefPic, one for each frame
 
-typedef struct TagFeatureSearchPreparation{
-  SScreenBlockFeatureStorage*  pRefBlockFeature;//point the the ref frame storage
-
-  uint16_t*  pFeatureOfBlock;    // Feature of every block (8x8), begin with the point
-  uint8_t      uiFeatureStrategyIndex;// index of hash strategy
-
-  /* for FME frame-level switch */
-  bool bFMESwitchFlag;
-  uint8_t uiFMEGoodFrameCount;
-  int32_t iHighFreMbCount;
-}SFeatureSearchPreparation;//maintain only one
-
 /*
  *  Reconstructed Picture definition
  *  It is used to express reference picture, also consequent reconstruction picture for output
--- a/codec/encoder/core/inc/svc_enc_frame.h
+++ b/codec/encoder/core/inc/svc_enc_frame.h
@@ -56,8 +56,20 @@
 ///////////////////////////////////DQ Layer level///////////////////////////////////
 
 typedef struct TagDqLayer	SDqLayer;
-typedef SDqLayer* 			pDqLayer;
+typedef SDqLayer*            pDqLayer;
 
+typedef struct TagFeatureSearchPreparation{
+  SScreenBlockFeatureStorage*	pRefBlockFeature;//points to the ref frame storage
+
+  uint16_t*	pFeatureOfBlock;		// Feature of every block (8x8), begin with the point
+	uint8_t      uiFeatureStrategyIndex;// index of hash strategy
+
+	/* for FME frame-level switch */
+	bool bFMESwitchFlag;
+	uint8_t uiFMEGoodFrameCount;
+	int32_t iHighFreMbCount;
+}SFeatureSearchPreparation;//maintain only one
+
 typedef struct TagLayerInfo {
   SNalUnitHeaderExt		sNalHeaderExt;
   SSlice*
@@ -97,6 +109,8 @@
   int32_t*					pNumSliceCodedOfPartition;		// for dynamic slicing mode
   int32_t*					pLastCodedMbIdxOfPartition;	// for dynamic slicing mode
   int32_t*					pLastMbIdxOfPartition;			// for dynamic slicing mode
+
+  SFeatureSearchPreparation* pFeatureSearchPreparation;
 
   SDqLayer*				pRefLayer;		// pointer to referencing dq_layer of current layer to be decoded
 
--- a/codec/encoder/core/inc/svc_motion_estimate.h
+++ b/codec/encoder/core/inc/svc_motion_estimate.h
@@ -222,6 +222,8 @@
 #define LIST_SIZE_SUM_16x16  0x0FF01    //(256*255+1)
 #define LIST_SIZE_SUM_8x8      0x03FC1    //(64*255+1)
 #define LIST_SIZE_MSE_16x16  0x00878    //(avg+mse)/2, max= (255+16*255)/2
+#define FMESWITCH_MBSAD_THRESHOLD   30 // empirically set.
+
 int32_t SumOf8x8SingleBlock_c(uint8_t* pRef, const int32_t kiRefStride);
 int32_t SumOf16x16SingleBlock_c(uint8_t* pRef, const int32_t kiRefStride);
 void SumOf8x8BlockOfFrame_c(uint8_t *pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
@@ -231,6 +233,13 @@
 int32_t RequestScreenBlockFeatureStorage( CMemoryAlign *pMa, const int32_t kiFrameWidth,  const int32_t kiFrameHeight, const int32_t iNeedFeatureStorage,
                                          SScreenBlockFeatureStorage* pScreenBlockFeatureStorage);
 int32_t ReleaseScreenBlockFeatureStorage( CMemoryAlign *pMa, SScreenBlockFeatureStorage* pScreenBlockFeatureStorage );
+int32_t RequestFeatureSearchPreparation( CMemoryAlign *pMa, const int32_t kiFrameWidth,  const int32_t kiFrameHeight, const int32_t iNeedFeatureStorage,
+                                         SFeatureSearchPreparation* pFeatureSearchPreparation);
+int32_t ReleaseFeatureSearchPreparation( CMemoryAlign *pMa, uint16_t*& pFeatureOfBlock);
+#define FME_DEFAULT_GOOD_FRAME_NUM (2)
+#define FME_DEFAULT_FEATURE_INDEX (0)
+void PerformFMEPreprocess( SWelsFuncPtrList *pFunc, SPicture* pRef,
+                          SScreenBlockFeatureStorage* pScreenBlockFeatureStorage);
 //inline functions
 inline void SetMvWithinIntegerMvRange( const int32_t kiMbWidth, const int32_t kiMbHeight, const int32_t kiMbX, const int32_t kiMbY,
                         const int32_t kiMaxMvRange,
@@ -250,6 +259,15 @@
 {
   return (CheckMvInRange(ksCurrentMv.iMvX, ksMinMv.iMvX, ksMaxMv.iMvX)
     && CheckMvInRange(ksCurrentMv.iMvY, ksMinMv.iMvY, ksMaxMv.iMvY));
+}
+//FME switch related
+inline bool CalcFMESwitchFlag(const uint8_t uiFMEGoodFrameCount, const int32_t iHighFreMbPrecentage,
+       const int32_t iAvgMbSAD, const bool bScrollingDetected ) {
+  return ( bScrollingDetected ||( uiFMEGoodFrameCount>0 && iAvgMbSAD > FMESWITCH_MBSAD_THRESHOLD ) );
+  //TODO: add the logic of iHighFreMbPrecentage
+  //return ( iHighFreMbPrecentage > 2
+  //            && ( bScrollingDetected || iHighFreMbPrecentage >15
+  //            ||( uiFMEGoodFrameCount>0 && iFrameSAD > FMESWITCH_FRAMESAD_THRESHOLD ) ) );
 }
 }
 #endif
--- a/codec/encoder/core/src/encoder_ext.cpp
+++ b/codec/encoder/core/src/encoder_ext.cpp
@@ -708,7 +708,7 @@
   iDlayerCount	= pParam->iSpatialLayerNum;
   iNumRef	= pParam->iNumRefFrame;
 
-  const int32_t kiFeatureStrategyIndex = 0;
+  const int32_t kiFeatureStrategyIndex = FME_DEFAULT_FEATURE_INDEX;
   const int32_t kiMe16x16 = ME_DIA_CROSS;
   const int32_t kiMe8x8 = ME_DIA_CROSS_FME;
   const int32_t kiNeedFeatureStorage = (pParam->iUsageType != SCREEN_CONTENT_REAL_TIME)?0:
@@ -733,7 +733,7 @@
     pRefList		= (SRefList*)pMa->WelsMallocz (sizeof (SRefList), "pRefList");
     WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pRefList), FreeMemorySvc (ppCtx))
     do {
-      pRefList->pRef[i]	= AllocPicture (pMa, kiWidth, kiHeight, true, kiNeedFeatureStorage);	// to use actual size of current layer
+      pRefList->pRef[i]	= AllocPicture (pMa, kiWidth, kiHeight, true, (iDlayerIndex == iDlayerCount-1)?kiNeedFeatureStorage:0);	// to use actual size of current layer
       WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pRefList->pRef[i]), FreeMemorySvc (ppCtx))
       ++ i;
     } while (i < 1 + iNumRef);
@@ -830,6 +830,18 @@
       }
     }
 
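+    //allocate the FME search preparation only for the last spatial layer, and only when feature storage is needed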
+    if (kiNeedFeatureStorage && iDlayerIndex==iDlayerCount-1)
+    {
+      pDqLayer->pFeatureSearchPreparation	= static_cast<SFeatureSearchPreparation*> (pMa->WelsMallocz (sizeof (SFeatureSearchPreparation), "pFeatureSearchPreparation"));
+      WELS_VERIFY_RETURN_PROC_IF (1, NULL==pDqLayer->pFeatureSearchPreparation, FreeMemorySvc (ppCtx));
+      int32_t iReturn = RequestFeatureSearchPreparation(pMa, pDlayer->iFrameWidth, pDlayer->iFrameHeight, kiNeedFeatureStorage,
+        pDqLayer->pFeatureSearchPreparation);
+      WELS_VERIFY_RETURN_PROC_IF (1, ENC_RETURN_SUCCESS!=iReturn, FreeMemorySvc (ppCtx));
+    } else {
+      pDqLayer->pFeatureSearchPreparation = NULL;
+    }
+
     (*ppCtx)->ppDqLayerList[iDlayerIndex]	= pDqLayer;
 
     ++ iDlayerIndex;
@@ -1572,6 +1584,12 @@
             pDq->pLastMbIdxOfPartition = NULL;
           }
 
+          if (pDq->pFeatureSearchPreparation) {
+            ReleaseFeatureSearchPreparation(pMa, pDq->pFeatureSearchPreparation->pFeatureOfBlock);
+            pMa->WelsFree (pDq->pFeatureSearchPreparation, "pFeatureSearchPreparation");
+            pDq->pFeatureSearchPreparation = NULL;
+          }
+
           pMa->WelsFree (pDq, "pDq");
           pDq = NULL;
           pCtx->ppDqLayerList[ilayer] = NULL;
@@ -2427,8 +2445,41 @@
       pFuncList->pfCalculateSatd = CalculateSatdCost;
       pFuncList->pfInterFineMd = WelsMdInterFinePartition;
     }
+  }
 
+  //initializing at each frame will be needed when dealing with hybrid content (camera+screen)
+  if (pCtx->pSvcParam->iUsageType == SCREEN_CONTENT_REAL_TIME) {
+    SFeatureSearchPreparation* pFeatureSearchPreparation = pCurLayer->pFeatureSearchPreparation;
+    if (pFeatureSearchPreparation) {
+      pFeatureSearchPreparation->iHighFreMbCount = 0;
 
+      if (P_SLICE == pCtx->eSliceType) {
+        //calculate bFMESwitchFlag
+        SVAAFrameInfoExt *pVaaExt		= static_cast<SVAAFrameInfoExt *>(pCtx->pVaa);
+        const int32_t kiMbSize = pCurLayer->iMbHeight*pCurLayer->iMbWidth;
+        pFeatureSearchPreparation->bFMESwitchFlag = CalcFMESwitchFlag( pFeatureSearchPreparation->uiFMEGoodFrameCount,
+          pFeatureSearchPreparation->iHighFreMbCount*100/kiMbSize, pCtx->pVaa->sVaaCalcInfo.iFrameSad/kiMbSize,
+          pVaaExt->sScrollDetectInfo.bScrollDetectFlag);
+
+        //PerformFMEPreprocess
+        SScreenBlockFeatureStorage* pScreenBlockFeatureStorage = pCurLayer->pRefPic->pScreenBlockFeatureStorage;
+        pFeatureSearchPreparation->pRefBlockFeature = pScreenBlockFeatureStorage;
+        if (pFeatureSearchPreparation->bFMESwitchFlag
+          && !pScreenBlockFeatureStorage->bRefBlockFeatureCalculated) {
+            pScreenBlockFeatureStorage->pFeatureOfBlockPointer = pFeatureSearchPreparation->pFeatureOfBlock;
+            PerformFMEPreprocess( pFuncList, pCurLayer->pRefPic, pScreenBlockFeatureStorage );
+        }
+
+        //assign ME pointer
+        if (pScreenBlockFeatureStorage->bRefBlockFeatureCalculated) {
+          //TBC int32_t iIs16x16 = pScreenBlockFeatureStorage->iIs16x16;
+        }
+      } else {
+        //reset the FME switch status when the slice is not a P_SLICE (e.g. I_SLICE)
+        pFeatureSearchPreparation->bFMESwitchFlag = true;
+        pFeatureSearchPreparation->uiFMEGoodFrameCount = FME_DEFAULT_GOOD_FRAME_NUM;
+      }
+    }
   }
 }
 
--- a/codec/encoder/core/src/picture_handle.cpp
+++ b/codec/encoder/core/src/picture_handle.cpp
@@ -113,7 +113,6 @@
     pPic->pScreenBlockFeatureStorage = static_cast<SScreenBlockFeatureStorage*> (pMa->WelsMallocz (sizeof (SScreenBlockFeatureStorage), "pScreenBlockFeatureStorage"));
     int32_t iReturn = RequestScreenBlockFeatureStorage(pMa, kiWidth,  kiHeight, iNeedFeatureStorage,
       pPic->pScreenBlockFeatureStorage );
-
     WELS_VERIFY_RETURN_PROC_IF (NULL, ENC_RETURN_SUCCESS != iReturn, FreePicture (pMa, &pPic));
   } else {
     pPic->pScreenBlockFeatureStorage = NULL;
--- a/codec/encoder/core/src/svc_motion_estimate.cpp
+++ b/codec/encoder/core/src/svc_motion_estimate.cpp
@@ -45,6 +45,18 @@
 
 namespace WelsSVCEnc {
 
+const int32_t QStepx16ByQp[52] = {  /* save QStep<<4 for int32_t */
+    10,  11,  13,  14,  16,  18,  /* 0~5   */
+    20,  22,  26,  28,  32,  36,  /* 6~11  */
+    40,  44,  52,  56,  64,  72,  /* 12~17 */
+    80,  88,  104, 112, 128, 144, /* 18~23 */
+    160, 176, 208, 224, 256, 288, /* 24~29 */
+    320, 352, 416, 448, 512, 576, /* 30~35 */
+    640, 704, 832, 896, 1024, 1152, /* 36~41 */
+    1280, 1408, 1664, 1792, 2048, 2304, /* 42~47 */
+    2560, 2816, 3328, 3584     /* 48~51 */
+};
+
 static inline void UpdateMeResults( const SMVUnitXY ksBestMv, const uint32_t kiBestSadCost, uint8_t* pRef, SWelsME * pMe )
 {
   pMe->sMv = ksBestMv;
@@ -313,9 +325,9 @@
 #if defined (X86_ASM)
 void CalcMvdCostx8_c( uint16_t *pMvdCost, const int32_t kiStartMv, uint16_t* pMvdTable, const uint16_t kiFixedCost )
 {
-  uint16_t *pBaseCost		= pMvdCost;
-  const int32_t kiOffset	= (kiStartMv<<2);
-  uint16_t *pMvd		= pMvdTable+kiOffset;
+  uint16_t *pBaseCost  = pMvdCost;
+  const int32_t kiOffset = (kiStartMv<<2);
+  uint16_t *pMvd  = pMvdTable+kiOffset;
   for (int32_t i = 0; i < 8; ++ i) {
     pBaseCost[i] = ((*pMvd) + kiFixedCost);
     pMvd += 4;
@@ -327,58 +339,58 @@
                           const int32_t kiMinPos, const int32_t kiMaxPos,
                           const bool bVerticalSearch ) {
   SWelsFuncPtrList *pFuncList      = static_cast<SWelsFuncPtrList *>(pFunc);
-  SWelsME *pMe				                    = static_cast<SWelsME *>(vpMe);
-  uint8_t*  kpEncMb	= pMe->pEncMb;
-  const int32_t kiCurMeBlockPix	= pMe->iCurMeBlockPixY;
-  uint8_t* pRef			      = &pMe->pColoRefMb[(kiMinPos - kiCurMeBlockPix)*kiRefStride];
-  const int32_t kIsBlock16x16	= pMe->uiBlockSize == BLOCK_16x16;
-  const int32_t kiEdgeBlocks	= kIsBlock16x16 ? 16 : 8;
+  SWelsME *pMe                        = static_cast<SWelsME *>(vpMe);
+  uint8_t*  kpEncMb = pMe->pEncMb;
+  const int32_t kiCurMeBlockPix = pMe->iCurMeBlockPixY;
+  uint8_t* pRef         = &pMe->pColoRefMb[(kiMinPos - kiCurMeBlockPix)*kiRefStride];
+  const int32_t kIsBlock16x16 = pMe->uiBlockSize == BLOCK_16x16;
+  const int32_t kiEdgeBlocks = kIsBlock16x16 ? 16 : 8;
   PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16];
   PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
-  PTransposeMatrixBlockFunc	TransposeMatrixBlock = kIsBlock16x16 ? TransposeMatrixBlock16x16_sse2 : TransposeMatrixBlock8x8_mmx;
-  PTransposeMatrixBlocksFunc	TransposeMatrixBlocks= kIsBlock16x16 ? TransposeMatrixBlocksx16_sse2 : TransposeMatrixBlocksx8_mmx;
+  PTransposeMatrixBlockFunc TransposeMatrixBlock = kIsBlock16x16 ? TransposeMatrixBlock16x16_sse2 : TransposeMatrixBlock8x8_mmx;
+  PTransposeMatrixBlocksFunc TransposeMatrixBlocks= kIsBlock16x16 ? TransposeMatrixBlocksx16_sse2 : TransposeMatrixBlocksx8_mmx;
 
-  const int32_t kiDiff			= kiMaxPos - kiMinPos;
-  const int32_t kiRowNum		= WELS_ALIGN((kiDiff - kiEdgeBlocks + 1), kiEdgeBlocks);
-  const int32_t kiBlocksNum		= kIsBlock16x16 ? (kiRowNum>>4) : (kiRowNum>>3);
-  int32_t iCountLoop8		= (kiRowNum-kiEdgeBlocks) >> 3;
-  const int32_t kiRemainingVectors		= kiDiff - (iCountLoop8<<3);
-  const int32_t kiMatrixStride		= MAX_VERTICAL_MV_RANGE;
-  ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixRef, 16, kiMatrixStride, 16 );	// transpose matrix result for ref
-  ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixEnc, 16, 16, 16 );				// transpose matrix result for enc
-  assert(kiRowNum <= kiMatrixStride);	// make sure effective memory
+  const int32_t kiDiff   = kiMaxPos - kiMinPos;
+  const int32_t kiRowNum  = WELS_ALIGN((kiDiff - kiEdgeBlocks + 1), kiEdgeBlocks);
+  const int32_t kiBlocksNum  = kIsBlock16x16 ? (kiRowNum>>4) : (kiRowNum>>3);
+  int32_t iCountLoop8  = (kiRowNum-kiEdgeBlocks) >> 3;
+  const int32_t kiRemainingVectors  = kiDiff - (iCountLoop8<<3);
+  const int32_t kiMatrixStride  = MAX_VERTICAL_MV_RANGE;
+  ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixRef, 16, kiMatrixStride, 16 ); // transpose matrix result for ref
+  ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixEnc, 16, 16, 16 );    // transpose matrix result for enc
+  assert(kiRowNum <= kiMatrixStride); // make sure effective memory
 
   TransposeMatrixBlock( &uiMatrixEnc[0][0], 16, kpEncMb, kiEncStride );
   TransposeMatrixBlocks( &uiMatrixRef[0][0], kiMatrixStride, pRef, kiRefStride, kiBlocksNum );
   ENFORCE_STACK_ALIGN_1D( uint16_t, uiBaseCost, 8, 16 );
-  int32_t iTargetPos			= kiMinPos;
-  int16_t iBestPos				= pMe->sMv.iMvX;
-  uint32_t uiBestCost			= pMe->uiSadCost;
+  int32_t iTargetPos   = kiMinPos;
+  int16_t iBestPos    = pMe->sMv.iMvX;
+  uint32_t uiBestCost   = pMe->uiSadCost;
   uint32_t uiCostMin;
   int32_t iIndexMinPos;
-  kpEncMb	= &uiMatrixEnc[0][0];
-  pRef	= &uiMatrixRef[0][0];
+  kpEncMb = &uiMatrixEnc[0][0];
+  pRef = &uiMatrixRef[0][0];
 
   while(iCountLoop8 > 0) {
     CalcMvdCostx8_c(uiBaseCost, iTargetPos, pMvdTable, kiFixedMvd);
     uiCostMin = pSampleSadHor8( kpEncMb, 16, pRef, kiMatrixStride, uiBaseCost, &iIndexMinPos );
     if (uiCostMin < uiBestCost) {
-      uiBestCost	= uiCostMin;
-      iBestPos		= iTargetPos+iIndexMinPos;
+      uiBestCost = uiCostMin;
+      iBestPos  = iTargetPos+iIndexMinPos;
     }
-    iTargetPos	+= 8;
+    iTargetPos += 8;
     pRef += 8;
     -- iCountLoop8;
   }
   if (kiRemainingVectors > 0) {
-    kpEncMb	= pMe->pEncMb;
-    pRef	= &pMe->pColoRefMb[(iTargetPos - kiCurMeBlockPix)*kiRefStride];
+    kpEncMb = pMe->pEncMb;
+    pRef = &pMe->pColoRefMb[(iTargetPos - kiCurMeBlockPix)*kiRefStride];
     while (iTargetPos < kiMaxPos) {
-      const uint16_t pMvdCost	= pMvdTable[iTargetPos<<2];
-      uint32_t uiSadCost	= pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + pMvdCost);
+      const uint16_t pMvdCost = pMvdTable[iTargetPos<<2];
+      uint32_t uiSadCost = pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + pMvdCost);
       if (uiSadCost < uiBestCost) {
-        uiBestCost	= uiSadCost;
-        iBestPos	= iTargetPos;
+        uiBestCost = uiSadCost;
+        iBestPos = iTargetPos;
       }
       pRef += kiRefStride;
       ++iTargetPos;
@@ -399,20 +411,20 @@
                                       const bool bVerticalSearch )
 {
   SWelsFuncPtrList *pFuncList      = static_cast<SWelsFuncPtrList *>(pFunc);
-  SWelsME *pMe				                    = static_cast<SWelsME *>(vpMe);
-  uint8_t *kpEncMb	= pMe->pEncMb;
-  const int32_t kiCurMeBlockPix	= pMe->iCurMeBlockPixX;
-  uint8_t *pRef			      = &pMe->pColoRefMb[kiMinPos - kiCurMeBlockPix];
-  const int32_t kIsBlock16x16	= pMe->uiBlockSize == BLOCK_16x16;
+  SWelsME *pMe                        = static_cast<SWelsME *>(vpMe);
+  uint8_t *kpEncMb = pMe->pEncMb;
+  const int32_t kiCurMeBlockPix = pMe->iCurMeBlockPixX;
+  uint8_t *pRef         = &pMe->pColoRefMb[kiMinPos - kiCurMeBlockPix];
+  const int32_t kIsBlock16x16 = pMe->uiBlockSize == BLOCK_16x16;
   PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16];
   PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
   ENFORCE_STACK_ALIGN_1D( uint16_t, uiBaseCost, 8, 16 );
-  const int32_t kiNumVector	= kiMaxPos - kiMinPos;
-  int32_t iCountLoop8	= kiNumVector >> 3;
-  const int32_t kiRemainingLoop8	= kiNumVector & 7;
-  int32_t iTargetPos			= kiMinPos;
-  int16_t iBestPos				= pMe->sMv.iMvX;
-  uint32_t uiBestCost			= pMe->uiSadCost;
+  const int32_t kiNumVector = kiMaxPos - kiMinPos;
+  int32_t iCountLoop8 = kiNumVector >> 3;
+  const int32_t kiRemainingLoop8 = kiNumVector & 7;
+  int32_t iTargetPos   = kiMinPos;
+  int16_t iBestPos    = pMe->sMv.iMvX;
+  uint32_t uiBestCost   = pMe->uiSadCost;
   uint32_t uiCostMin;
   int32_t iIndexMinPos;
 
@@ -420,20 +432,20 @@
     CalcMvdCostx8_c(uiBaseCost, iTargetPos, pMvdTable, kiFixedMvd);
     uiCostMin = pSampleSadHor8( kpEncMb, kiEncStride, pRef, kiRefStride, uiBaseCost, &iIndexMinPos );
     if (uiCostMin < uiBestCost) {
-      uiBestCost	= uiCostMin;
-      iBestPos		= iTargetPos+iIndexMinPos;
+      uiBestCost = uiCostMin;
+      iBestPos  = iTargetPos+iIndexMinPos;
     }
-    iTargetPos	+= 8;
+    iTargetPos += 8;
     pRef += 8;
     -- iCountLoop8;
   }
   if ( kiRemainingLoop8 > 0 ) {
     while (iTargetPos < kiMaxPos) {
-      const uint16_t pMvdCost	= pMvdTable[iTargetPos<<2];
-      uint32_t uiSadCost	= pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + pMvdCost);
+      const uint16_t pMvdCost = pMvdTable[iTargetPos<<2];
+      uint32_t uiSadCost = pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + pMvdCost);
       if (uiSadCost < uiBestCost) {
-        uiBestCost	= uiSadCost;
-        iBestPos	= iTargetPos;
+        uiBestCost = uiSadCost;
+        iBestPos = iTargetPos;
       }
       ++pRef;
       ++iTargetPos;
@@ -447,10 +459,10 @@
   }
 }
 #endif
-void LineFullSearch_c(	void *pFunc, void *vpMe,
-													uint16_t* pMvdTable, const int32_t kiFixedMvd,
-													const int32_t kiEncStride, const int32_t kiRefStride,
-													const int32_t kiMinPos, const int32_t kiMaxPos,
+void LineFullSearch_c( void *pFunc, void *vpMe,
+             uint16_t* pMvdTable, const int32_t kiFixedMvd,
+             const int32_t kiEncStride, const int32_t kiRefStride,
+             const int32_t kiMinPos, const int32_t kiMaxPos,
                           const bool bVerticalSearch ) {
   SWelsFuncPtrList *pFuncList      = static_cast<SWelsFuncPtrList *>(pFunc);
   SWelsME *pMe                            = static_cast<SWelsME *>(vpMe);
@@ -482,9 +494,9 @@
 }
 
 void WelsMotionCrossSearch(SWelsFuncPtrList *pFuncList,  SWelsME * pMe,
-											const SSlice* pSlice, const int32_t kiEncStride,  const int32_t kiRefStride) {
-  PLineFullSearchFunc pfVerticalFullSearchFunc	= pFuncList->pfVerticalFullSearch;
-  PLineFullSearchFunc pfHorizontalFullSearchFunc	= pFuncList->pfHorizontalFullSearch;
+           const SSlice* pSlice, const int32_t kiEncStride,  const int32_t kiRefStride) {
+  PLineFullSearchFunc pfVerticalFullSearchFunc = pFuncList->pfVerticalFullSearch;
+  PLineFullSearchFunc pfHorizontalFullSearchFunc = pFuncList->pfHorizontalFullSearch;
 
   const int32_t iCurMeBlockPixX = pMe->iCurMeBlockPixX;
   const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
@@ -515,9 +527,10 @@
 // Feature Search Basics
 /////////////////////////
 //memory related
-int32_t RequestFeatureSearchPreparation( CMemoryAlign *pMa, const int32_t kiFeatureStrategyIndex,
-                                         const int32_t kiFrameWidth,  const int32_t kiFrameHeight, const bool bFme8x8,
-                                         uint16_t*& pFeatureOfBlock) {
+int32_t RequestFeatureSearchPreparation( CMemoryAlign *pMa, const int32_t kiFrameWidth,  const int32_t kiFrameHeight, const int32_t iNeedFeatureStorage,
+                                        SFeatureSearchPreparation* pFeatureSearchPreparation) {
+  const int32_t kiFeatureStrategyIndex = iNeedFeatureStorage>>16;
+  const bool bFme8x8 = ((iNeedFeatureStorage & 0x0000FF & ME_FME)==ME_FME);
   const int32_t kiMarginSize = bFme8x8?8:16;
   const int32_t kiFrameSize = (kiFrameWidth-kiMarginSize) * (kiFrameHeight-kiMarginSize);
   int32_t iListOfFeatureOfBlock;
@@ -528,10 +541,15 @@
     iListOfFeatureOfBlock = sizeof(uint16_t) * kiFrameSize +
       (kiFrameWidth-kiMarginSize) * sizeof(uint32_t) + kiFrameWidth * 8 * sizeof(uint8_t);
   }
-  pFeatureOfBlock =
+  pFeatureSearchPreparation->pFeatureOfBlock =
     (uint16_t *)pMa->WelsMalloc(iListOfFeatureOfBlock, "pFeatureOfBlock");
-  WELS_VERIFY_RETURN_IF(ENC_RETURN_MEMALLOCERR, NULL == pFeatureOfBlock)
+  WELS_VERIFY_RETURN_IF(ENC_RETURN_MEMALLOCERR, NULL == (pFeatureSearchPreparation->pFeatureOfBlock) )
 
+  pFeatureSearchPreparation->uiFeatureStrategyIndex = kiFeatureStrategyIndex;
+  pFeatureSearchPreparation->bFMESwitchFlag = true;
+  pFeatureSearchPreparation->uiFMEGoodFrameCount = FME_DEFAULT_GOOD_FRAME_NUM;
+  pFeatureSearchPreparation->iHighFreMbCount = 0;
+
   return ENC_RETURN_SUCCESS;
 }
 int32_t ReleaseFeatureSearchPreparation( CMemoryAlign *pMa, uint16_t*& pFeatureOfBlock) {
@@ -568,7 +586,13 @@
   pScreenBlockFeatureStorage->pLocationPointer = (uint16_t*)pMa->WelsMalloc(2*kiFrameSize*sizeof(uint16_t), "pScreenBlockFeatureStorage->pLocationPointer");
   WELS_VERIFY_RETURN_IF(ENC_RETURN_MEMALLOCERR, NULL == pScreenBlockFeatureStorage->pLocationPointer)
 
-  pScreenBlockFeatureStorage->iActualListSize  = kiListSize;
+  pScreenBlockFeatureStorage->pFeatureOfBlockPointer = NULL;
+  pScreenBlockFeatureStorage->iIs16x16 = !bIsBlock8x8;
+  pScreenBlockFeatureStorage->uiFeatureStrategyIndex = kiFeatureStrategyIndex;
+  pScreenBlockFeatureStorage->iActualListSize = kiListSize;
+  memset(pScreenBlockFeatureStorage->uiSadCostThreshold, UINT_MAX, BLOCK_SIZE_ALL*sizeof(uint32_t));
+  pScreenBlockFeatureStorage->bRefBlockFeatureCalculated = false;
+
   return ENC_RETURN_SUCCESS;
 }
 int32_t ReleaseScreenBlockFeatureStorage( CMemoryAlign *pMa, SScreenBlockFeatureStorage* pScreenBlockFeatureStorage ) {
@@ -588,11 +612,9 @@
 }
 
 //preprocess related
-int32_t SumOf8x8SingleBlock_c(uint8_t* pRef, const int32_t kiRefStride)
-{
+int32_t SumOf8x8SingleBlock_c(uint8_t* pRef, const int32_t kiRefStride) {
   int32_t iSum = 0, i;
-  for(i = 0; i < 8; i++)
-  {
+  for(i = 0; i < 8; i++) {
     iSum +=  pRef[0]    + pRef[1]  + pRef[2]  + pRef[3];
     iSum +=  pRef[4]    + pRef[5]  + pRef[6]  + pRef[7];
     pRef += kiRefStride;
@@ -599,11 +621,9 @@
   }
   return iSum;
 }
-int32_t SumOf16x16SingleBlock_c(uint8_t* pRef, const int32_t kiRefStride)
-{
+int32_t SumOf16x16SingleBlock_c(uint8_t* pRef, const int32_t kiRefStride) {
   int32_t iSum = 0, i;
-  for(i = 0; i < 16; i++)
-  {
+  for(i = 0; i < 16; i++) {
     iSum +=  pRef[0]    + pRef[1]  + pRef[2]  + pRef[3];
     iSum +=  pRef[4]    + pRef[5]  + pRef[6]  + pRef[7];
     iSum    +=  pRef[8]    + pRef[9]  + pRef[10]  + pRef[11];
@@ -681,6 +701,7 @@
     pSrcPointer += kiWidth;
   }
 }
+
 void CalculateFeatureOfBlock( SWelsFuncPtrList *pFunc, SPicture* pRef,
                          SScreenBlockFeatureStorage* pScreenBlockFeatureStorage)
 {
@@ -710,10 +731,17 @@
 }
 
 void PerformFMEPreprocess( SWelsFuncPtrList *pFunc, SPicture* pRef,
-                          SScreenBlockFeatureStorage* pScreenBlockFeatureStorage)
-{
+                          SScreenBlockFeatureStorage* pScreenBlockFeatureStorage) {
     CalculateFeatureOfBlock(pFunc, pRef, pScreenBlockFeatureStorage );
     pScreenBlockFeatureStorage->bRefBlockFeatureCalculated = true;
+
+    uint32_t uiRefPictureAvgQstepx16 = QStepx16ByQp[WelsMedian(0, pRef->iFrameAverageQp, 51)];
+    uint32_t uiSadCostThreshold16x16 = ((30 * (uiRefPictureAvgQstepx16 + 160))>>3);
+    pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_16x16] = uiSadCostThreshold16x16;
+    pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_8x8] = (uiSadCostThreshold16x16>>2);
+    pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_16x8]
+    = pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_8x16]
+    = pScreenBlockFeatureStorage->uiSadCostThreshold[BLOCK_4x4] = UINT_MAX;
 }
 
 //search related