shithub: openh264

Download patch

ref: 29f300dde9262c4ea9784ea90649c836eeec7089
parent: c12edefcd371fcd64d6985d17404ac600e4f2d39
author: sijchen <sijchen@cisco.com>
date: Wed Apr 2 10:34:23 EDT 2014

[Encoder ME] add Preprocess functions for FME

--- a/codec/encoder/core/inc/picture.h
+++ b/codec/encoder/core/inc/picture.h
@@ -30,7 +30,7 @@
  *
  */
 
-//picture.h	-	reconstruction picture/ reference picture/ residual picture are declared here
+//picture.h  -  reconstruction picture/ reference picture/ residual picture are declared here
 #ifndef WELS_PICTURE_H__
 #define WELS_PICTURE_H__
 
@@ -39,84 +39,89 @@
 #include "wels_common_basis.h"
 
 namespace WelsSVCEnc {
-#define LIST_SIZE			0x10000		//(256*256)
+#define LIST_SIZE      0x10000    //(256*256)
 typedef struct TagScreenBlockFeatureStorage
 {
-	uint32_t*	pTimesOfFeatureValue;		// times of every value in Feature
-	uint16_t**	pLocationOfFeature;			// uint16_t *pLocationOfFeature[LIST_SIZE], pLocationOfFeature[i] saves all the location(x,y) whose Feature = i;
-	uint16_t*	pLocationPointer;	// buffer of position array
-  int32_t		iActualListSize;			// actual list size
+  //Input
+  uint16_t*  pFeatureOfBlockPointer;    // Pointer to pFeatureOfBlock
+  int32_t    iIs16x16;      //Feature block size
+  uint8_t      uiFeatureStrategyIndex;// index of hash strategy
 
+  //Modify
+  uint32_t*  pTimesOfFeatureValue;    // times of every value in Feature
+  uint16_t**  pLocationOfFeature;      // uint16_t *pLocationOfFeature[LIST_SIZE], pLocationOfFeature[i] saves all the location(x,y) whose Feature = i;
+  uint16_t*  pLocationPointer;  // buffer of position array
+  int32_t    iActualListSize;      // actual list size
   uint32_t uiSadCostThreshold[BLOCK_SIZE_ALL];
-  bool						bRefBlockFeatureCalculated; // flag of whether pre-process is done
+  bool      bRefBlockFeatureCalculated; // flag of whether pre-process is done
 } SScreenBlockFeatureStorage; //should be stored with RefPic, one for each frame
 
 typedef struct TagFeatureSearchPreparation{
-  SScreenBlockFeatureStorage*	pRefBlockFeature;//point the the ref frame storage
+  SScreenBlockFeatureStorage*  pRefBlockFeature;//points to the ref frame storage
 
-  uint16_t*	pFeatureOfBlock;		// Feature of every block (8x8), begin with the point
-	uint8_t      uiFeatureStrategyIndex;// index of hash strategy
+  uint16_t*  pFeatureOfBlock;    // Feature of every block (8x8), begin with the point
+  uint8_t      uiFeatureStrategyIndex;// index of hash strategy
 
-	/* for FME frame-level switch */
-	bool bFMESwitchFlag;
-	uint8_t uiFMEGoodFrameCount;
-	int32_t iHighFreMbCount;
+  /* for FME frame-level switch */
+  bool bFMESwitchFlag;
+  uint8_t uiFMEGoodFrameCount;
+  int32_t iHighFreMbCount;
 }SFeatureSearchPreparation;//maintain only one
 
 
 /*
- *	Reconstructed Picture definition
- *	It is used to express reference picture, also consequent reconstruction picture for output
+ *  Reconstructed Picture definition
+ *  It is used to express reference picture, also consequent reconstruction picture for output
  */
 typedef struct TagPicture {
   /************************************payload pData*********************************/
-  uint8_t*		pBuffer;		// pointer to the first allocated byte, basical offset of pBuffer, dimension:
-  uint8_t*		pData[3];		// pointer to picture planes respectively
-  int32_t		iLineSize[3];	// iLineSize of picture planes respectively
+  uint8_t*    pBuffer;    // pointer to the first allocated byte, basical offset of pBuffer, dimension:
+  uint8_t*    pData[3];    // pointer to picture planes respectively
+  int32_t    iLineSize[3];  // iLineSize of picture planes respectively
 
   // picture information
   /*******************************from other standard syntax****************************/
   /*from pSps*/
-  int32_t		iWidthInPixel;	// picture width in pixel
-  int32_t		iHeightInPixel;// picture height in pixel
-  int32_t		iPictureType;	// got from sSliceHeader(): eSliceType
-  int32_t		iFramePoc;		// frame POC
+  int32_t    iWidthInPixel;  // picture width in pixel
+  int32_t    iHeightInPixel;// picture height in pixel
+  int32_t    iPictureType;  // got from sSliceHeader(): eSliceType
+  int32_t    iFramePoc;    // frame POC
 
-  float			fFrameRate;   // MOVE
-  int32_t		iFrameNum;		// frame number			//for pRef pic management
+  float      fFrameRate;   // MOVE
+  int32_t    iFrameNum;    // frame number      //for pRef pic management
 
-  uint32_t*	uiRefMbType;	// for iMbWidth*iMbHeight
-  uint8_t*		pRefMbQp;		// for iMbWidth*iMbHeight
+  uint32_t*  uiRefMbType;  // for iMbWidth*iMbHeight
+  uint8_t*    pRefMbQp;    // for iMbWidth*iMbHeight
 
   int32_t*     pMbSkipSad;   //for iMbWidth*iMbHeight
 
-  SMVUnitXY*	sMvList;
+  SMVUnitXY*  sMvList;
 
   /*******************************sef_definition for misc use****************************/
-  int32_t		iMarkFrameNum;
-  int32_t		iLongTermPicNum;
+  int32_t    iMarkFrameNum;
+  int32_t    iLongTermPicNum;
 
-  bool		bUsedAsRef;						//for pRef pic management
-  bool		bIsLongRef;	// long term reference frame flag	//for pRef pic management
+  bool    bUsedAsRef;            //for pRef pic management
+  bool    bIsLongRef;  // long term reference frame flag  //for pRef pic management
   bool    bIsSceneLTR;  //long term reference & large scene change
-  uint8_t		uiRecieveConfirmed;
-  uint8_t		uiTemporalId;
-  uint8_t		uiSpatialId;
+  uint8_t    uiRecieveConfirmed;
+  uint8_t    uiTemporalId;
+  uint8_t    uiSpatialId;
   int32_t   iFrameAverageQp;
 } SPicture;
 
 /*
- *	Residual Picture
+ *  Residual Picture
  */
 //typedef struct Rs_Picture_s{
-//	int16_t		*pBuffer[4];		// base pBuffer
-//	int16_t		*pData[4];		// pData pBuffer
-//	int32_t		real_linesize[4];// actual iLineSize of picture planes respectively
-//	int32_t		used_linesize[4];// iLineSize of picture planes respectively used currently
-//	int32_t		planes;			// planes of YUV
+//  int16_t    *pBuffer[4];    // base pBuffer
+//  int16_t    *pData[4];    // pData pBuffer
+//  int32_t    real_linesize[4];// actual iLineSize of picture planes respectively
+//  int32_t    used_linesize[4];// iLineSize of picture planes respectively used currently
+//  int32_t    planes;      // planes of YUV
 //}Rs_Picture_t;
 
-}	// end of namespace WelsSVCEnc {
+}  // end of namespace WelsSVCEnc {
 
 #endif//WELS_PICTURE_H__
 
--- a/codec/encoder/core/inc/svc_motion_estimate.h
+++ b/codec/encoder/core/inc/svc_motion_estimate.h
@@ -29,11 +29,11 @@
  *     POSSIBILITY OF SUCH DAMAGE.
  *
  *
- * \file	svc motion estimate.h
+ * \file  svc motion estimate.h
  *
- * \brief	Interfaces introduced in svc mb motion estimation
+ * \brief  Interfaces introduced in svc mb motion estimation
  *
- * \date	08/11/2009 Created
+ * \date  08/11/2009 Created
  *
  *************************************************************************************
  */
@@ -46,10 +46,10 @@
 
 namespace WelsSVCEnc {
 #define CAMERA_STARTMV_RANGE (64)
-#define	ITERATIVE_TIMES	(16)
+#define  ITERATIVE_TIMES  (16)
 #define CAMERA_MV_RANGE (CAMERA_STARTMV_RANGE+ITERATIVE_TIMES)
 #define CAMERA_MVD_RANGE  ((CAMERA_MV_RANGE+1)<<1) //mvd=mv_range*2;
-#define	BASE_MV_MB_NMB	((2*CAMERA_MV_RANGE/MB_WIDTH_LUMA)-1)
+#define  BASE_MV_MB_NMB  ((2*CAMERA_MV_RANGE/MB_WIDTH_LUMA)-1)
 #define CAMERA_HIGHLAYER_MVD_RANGE (243)//mvd range;
 #define EXPANDED_MV_RANGE (504) //=512-8 rather than 511 to sacrifice same edge point but save complexity in assemblys
 #define EXPANDED_MVD_RANGE ((504+1)<<1)
@@ -56,42 +56,42 @@
 
 enum
 {
-  ME_DIA		= 0x01,	// LITTLE DIAMOND= 0x01
-  ME_CROSS	= 0x02,	// CROSS=  0x02
-  ME_FME		= 0x04,	// FME = 0x04
-  ME_FULL		= 0x10,	// FULL
+  ME_DIA    = 0x01,  // LITTLE DIAMOND= 0x01
+  ME_CROSS  = 0x02,  // CROSS=  0x02
+  ME_FME    = 0x04,  // FME = 0x04
+  ME_FULL    = 0x10,  // FULL
 
   // derived ME methods combination
-  ME_DIA_CROSS		=	(ME_DIA|ME_CROSS),		// DIA+CROSS
-  ME_DIA_CROSS_FME	=	(ME_DIA_CROSS|ME_FME),	// DIA+CROSS+FME
+  ME_DIA_CROSS    =  (ME_DIA|ME_CROSS),    // DIA+CROSS
+  ME_DIA_CROSS_FME  =  (ME_DIA_CROSS|ME_FME),  // DIA+CROSS+FME
 };
 
 union SadPredISatdUnit {
-uint32_t	uiSadPred;
-uint32_t	uiSatd;    //reuse the sad_pred as a temp satd pData
+uint32_t  uiSadPred;
+uint32_t  uiSatd;    //reuse the sad_pred as a temp satd pData
 };
 typedef struct TagWelsME {
 /* input */
-uint16_t*					pMvdCost;
-union SadPredISatdUnit	uSadPredISatd; //reuse the sad_pred as a temp pData
-uint32_t					uiSadCost;  //used by ME and RC //max SAD should be max_delta*size+lambda*mvdsize = 255*256+91*33*2 = 65280 + 6006 = 71286 > (2^16)-1 = 65535
-uint32_t					uiSatdCost; /* satd + lm * nbits */
-uint32_t					uiSadCostThreshold;
-int32_t						iCurMeBlockPixX;
-int32_t						iCurMeBlockPixY;
-uint8_t						uiBlockSize;   /* BLOCK_WxH */
-uint8_t						uiReserved;
+uint16_t*          pMvdCost;
+union SadPredISatdUnit  uSadPredISatd; //reuse the sad_pred as a temp pData
+uint32_t          uiSadCost;  //used by ME and RC //max SAD should be max_delta*size+lambda*mvdsize = 255*256+91*33*2 = 65280 + 6006 = 71286 > (2^16)-1 = 65535
+uint32_t          uiSatdCost; /* satd + lm * nbits */
+uint32_t          uiSadCostThreshold;
+int32_t            iCurMeBlockPixX;
+int32_t            iCurMeBlockPixY;
+uint8_t            uiBlockSize;   /* BLOCK_WxH */
+uint8_t            uiReserved;
 
-uint8_t*						pEncMb;
-uint8_t*						pRefMb;
-uint8_t*						pColoRefMb;
+uint8_t*            pEncMb;
+uint8_t*            pRefMb;
+uint8_t*            pColoRefMb;
 
-SMVUnitXY					sMvp;
-SMVUnitXY					sMvBase;
-SMVUnitXY					sDirectionalMv;
+SMVUnitXY          sMvp;
+SMVUnitXY          sMvBase;
+SMVUnitXY          sDirectionalMv;
 
 /* output */
-SMVUnitXY					sMv;
+SMVUnitXY          sMv;
 } SWelsME;
 
 typedef struct TagFeatureSearchIn{
@@ -134,37 +134,37 @@
 void WelsInitMeFunc( SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag, bool bScreenContent );
 
 /*!
- * \brief	BL mb motion estimate search
+ * \brief  BL mb motion estimate search
  *
- * \param	enc			Wels encoder context
- * \param	m	        Wels me information
+ * \param  enc      Wels encoder context
+ * \param  m          Wels me information
  *
- * \return	NONE
+ * \return  NONE
  */
 void WelsMotionEstimateSearch (SWelsFuncPtrList* pFuncList, void* pLplayer, void* pLpme, void* pLpslice);
 
 
 /*!
- * \brief	BL mb motion estimate initial point testing
+ * \brief  BL mb motion estimate initial point testing
  *
- * \param	enc			Wels encoder context
- * \param	m	        Wels me information
- * \param	mv_range	search range in motion estimate
- * \param	point	    the best match point in motion estimation
+ * \param  enc      Wels encoder context
+ * \param  m          Wels me information
+ * \param  mv_range  search range in motion estimate
+ * \param  point      the best match point in motion estimation
  *
- * \return	NONE
+ * \return  NONE
  */
 
 
 /*!
- * \brief	EL mb motion estimate initial point testing
+ * \brief  EL mb motion estimate initial point testing
  *
- * \param	pix_func	SSampleDealingFunc
- * \param	m	        Wels me information
- * \param	mv_range	search range in motion estimate
- * \param	point	    the best match point in motion estimation
+ * \param  pix_func  SSampleDealingFunc
+ * \param  m          Wels me information
+ * \param  mv_range  search range in motion estimate
+ * \param  point      the best match point in motion estimation
  *
- * \return	NONE
+ * \return  NONE
  */
 
 bool WelsMotionEstimateInitialPoint (SWelsFuncPtrList* pFuncList, SWelsME* pMe, SSlice* pSlice,
@@ -171,13 +171,13 @@
                                      const int32_t kiStrideEnc, const int32_t kiStrideRef);
 
 /*!
- * \brief	mb iterative motion estimate search
+ * \brief  mb iterative motion estimate search
  *
- * \param	enc			Wels encoder context
- * \param	m	        Wels me information
- * \param	point	    the best match point in motion estimation
+ * \param  enc      Wels encoder context
+ * \param  m          Wels me information
+ * \param  point      the best match point in motion estimation
  *
- * \return	NONE
+ * \return  NONE
  */
 void WelsDiamondSearch (SWelsFuncPtrList* pFuncList, void* pLpme, void* pLpslice, const int32_t kiEncStride, const int32_t kiRefStride);
 
@@ -193,18 +193,30 @@
                       const SMVUnitXY ksMinMv, const SMVUnitXY ksMaxMv, const int32_t kiEncStride, const int32_t kiRefStride,
                       int32_t& iBestSadCost);
 
-void LineFullSearch_c(	 void *pFunc, void *vpMe,
+// Cross Search Basics
+void LineFullSearch_c(   void *pFunc, void *vpMe,
                         uint16_t* pMvdTable, const int32_t kiFixedMvd,
                         const int32_t kiEncStride, const int32_t kiRefStride,
                         const int32_t kiMinPos, const int32_t kiMaxPos,
                         const bool bVerticalSearch );
 void VerticalFullSearchUsingSSE41( void *pFunc, void *vpMe,
-														uint16_t* pMvdTable, const int32_t kiFixedMvd,
-														const int32_t kiEncStride, const int32_t kiRefStride,
-													const int32_t kiMinPos, const int32_t kiMaxPos,
+                            uint16_t* pMvdTable, const int32_t kiFixedMvd,
+                            const int32_t kiEncStride, const int32_t kiRefStride,
+                          const int32_t kiMinPos, const int32_t kiMaxPos,
                           const bool bVerticalSearch );
 void WelsMotionCrossSearch(SWelsFuncPtrList *pFuncList,  SDqLayer* pCurLayer, SWelsME * pMe, const SSlice* pSlice);
 
+// Feature Search Basics
+#define LIST_SIZE_SUM_16x16  0x0FF01    //(256*255+1)
+#define LIST_SIZE_SUM_8x8      0x03FC1    //(64*255+1)
+#define LIST_SIZE_MSE_16x16  0x00878    //(avg+mse)/2, max= (255+16*255)/2
+int32_t SumOf8x8SingleBlock_c(uint8_t* pRef, const int32_t kiRefStride);
+int32_t SumOf16x16SingleBlock_c(uint8_t* pRef, const int32_t kiRefStride);
+void SumOf8x8BlockOfFrame_c(uint8_t *pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+                                              uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+void SumOf16x16BlockOfFrame_c(uint8_t *pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+                                              uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+//inline functions
 inline void SetMvWithinIntegerMvRange( const int32_t kiMbWidth, const int32_t kiMbHeight, const int32_t kiMbX, const int32_t kiMbY,
                         const int32_t kiMaxMvRange,
                         SMVUnitXY* pMvMin, SMVUnitXY* pMvMax)
--- a/codec/encoder/core/inc/wels_func_ptr_def.h
+++ b/codec/encoder/core/inc/wels_func_ptr_def.h
@@ -141,11 +141,14 @@
 typedef bool (*PCheckDirectionalMv) (PSampleSadSatdCostFunc pSad, void * vpMe,
                       const SMVUnitXY ksMinMv, const SMVUnitXY ksMaxMv, const int32_t kiEncStride, const int32_t kiRefStride,
                       int32_t& iBestSadCost);
-typedef void (*PLineFullSearchFunc) (	void *pFunc, void *vpMe,
-													uint16_t* pMvdTable, const int32_t kiFixedMvd,
-													const int32_t kiEncStride, const int32_t kiRefStride,
-													const int32_t kiMinPos, const int32_t kiMaxPos,
+typedef void (*PLineFullSearchFunc) (  void *pFunc, void *vpMe,
+                          uint16_t* pMvdTable, const int32_t kiFixedMvd,
+                          const int32_t kiEncStride, const int32_t kiRefStride,
+                          const int32_t kiMinPos, const int32_t kiMaxPos,
                           const bool bVerticalSearch );
+typedef void (*PCalculateBlockFeatureOfFrame)(uint8_t *pRef, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+                                              uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
+typedef int32_t (*PCalculateSingleBlockFeature)(uint8_t *pRef, const int32_t kiRefStride);
 
 #define     MAX_BLOCK_TYPE 5 // prev 7
 typedef struct TagSampleDealingFunc {
@@ -175,15 +178,15 @@
 typedef bool (*PUpdateRefListFunc) (void* pCtx);
 
 struct TagWelsFuncPointerList {
-  PExpandPictureFunc			pfExpandLumaPicture;
+  PExpandPictureFunc      pfExpandLumaPicture;
   PExpandPictureFunc
   pfExpandChromaPicture[2];// 0: for chroma unalignment && width_uv >= 16; 1: for chroma alignment && width_uv >= 16;
 
   PFillInterNeighborCacheFunc       pfFillInterNeighborCache;
 
-  PGetVarianceFromIntraVaaFunc	pfGetVarianceFromIntraVaa;
-  PGetMbSignFromInterVaaFunc	pfGetMbSignFromInterVaa;
-  PUpdateMbMvFunc					    pfUpdateMbMv;
+  PGetVarianceFromIntraVaaFunc  pfGetVarianceFromIntraVaa;
+  PGetMbSignFromInterVaaFunc  pfGetMbSignFromInterVaa;
+  PUpdateMbMvFunc              pfUpdateMbMv;
   PInterMdFirstIntraModeFunc      pfFirstIntraMode; //svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c
   PIntraFineMdFunc
   pfIntraFineMd;          //svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c
@@ -193,11 +196,11 @@
   PInterMdBackgroundDecisionFunc          pfInterMdBackgroundDecision;
   PInterMdBackgroundInfoUpdateFunc      pfInterMdBackgroundInfoUpdate;
 
-  SMcFunc				        sMcFuncs;
+  SMcFunc                sMcFuncs;
   SSampleDealingFunc     sSampleDealingFuncs;
-  PGetIntraPredFunc 		pfGetLumaI16x16Pred[I16_PRED_DC_A];
-  PGetIntraPredFunc 		pfGetLumaI4x4Pred[I4_PRED_A];
-  PGetIntraPredFunc 		pfGetChromaPred[C_PRED_A];
+  PGetIntraPredFunc     pfGetLumaI16x16Pred[I16_PRED_DC_A];
+  PGetIntraPredFunc     pfGetLumaI4x4Pred[I4_PRED_A];
+  PGetIntraPredFunc     pfGetChromaPred[C_PRED_A];
 
   PMotionSearchFunc
   pfMotionSearch[BLOCK_STATIC_IDC_ALL]; //svc_encode_slice.c svc_mode_decision.c svc_enhance_layer_md.c svc_base_layer_md.c
@@ -205,55 +208,57 @@
   PCalculateSatdFunc pfCalculateSatd;
   PCheckDirectionalMv pfCheckDirectionalMv;
   PLineFullSearchFunc pfLineFullSearch;
+  PCalculateBlockFeatureOfFrame pfCalculateBlockFeatureOfFrame[2];//0 - for 8x8, 1 for 16x16
+  PCalculateSingleBlockFeature pfCalculateSingleBlockFeature[2];//0 - for 8x8, 1 for 16x16
 
-  PCopyFunc      pfCopy16x16Aligned;		//svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c
-  PCopyFunc      pfCopy16x16NotAligned;	//md.c
-  PCopyFunc      pfCopy8x8Aligned;		//svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c md.c
-  PCopyFunc	  pfCopy16x8NotAligned;	//for MeRefineFracPixel 16x8 based
-  PCopyFunc	  pfCopy8x16Aligned;		//for MeRefineFracPixel 8x16 based
+  PCopyFunc      pfCopy16x16Aligned;    //svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c
+  PCopyFunc      pfCopy16x16NotAligned;  //md.c
+  PCopyFunc      pfCopy8x8Aligned;    //svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c md.c
+  PCopyFunc    pfCopy16x8NotAligned;  //for MeRefineFracPixel 16x8 based
+  PCopyFunc    pfCopy8x16Aligned;    //for MeRefineFracPixel 8x16 based
 
   //svc_encode_mb.c encode_mb_aux.c
-  PDctFunc					pfDctT4;
-  PDctFunc    		        pfDctFourT4;
+  PDctFunc          pfDctT4;
+  PDctFunc                pfDctFourT4;
 
-  PCalculateSingleCtrFunc				pfCalculateSingleCtr4x4;
-  PScanFunc				pfScan4x4;		//DC/AC
-  PScanFunc				pfScan4x4Ac;
+  PCalculateSingleCtrFunc        pfCalculateSingleCtr4x4;
+  PScanFunc        pfScan4x4;    //DC/AC
+  PScanFunc        pfScan4x4Ac;
 
-  PQuantizationFunc				        pfQuantization4x4;
-  PQuantizationFunc				        pfQuantizationFour4x4;
-  PQuantizationDcFunc			        pfQuantizationDc4x4;
-  PQuantizationMaxFunc		        pfQuantizationFour4x4Max;
-  PQuantizationHadamardFunc		pfQuantizationHadamard2x2;
-  PQuantizationSkipFunc		        pfQuantizationHadamard2x2Skip;
+  PQuantizationFunc                pfQuantization4x4;
+  PQuantizationFunc                pfQuantizationFour4x4;
+  PQuantizationDcFunc              pfQuantizationDc4x4;
+  PQuantizationMaxFunc            pfQuantizationFour4x4Max;
+  PQuantizationHadamardFunc    pfQuantizationHadamard2x2;
+  PQuantizationSkipFunc            pfQuantizationHadamard2x2Skip;
 
-  PTransformHadamard4x4Func	 pfTransformHadamard4x4Dc;
+  PTransformHadamard4x4Func   pfTransformHadamard4x4Dc;
 
-  PGetNoneZeroCountFunc		      pfGetNoneZeroCount;
+  PGetNoneZeroCountFunc          pfGetNoneZeroCount;
 
-  PDeQuantizationFunc				      pfDequantization4x4;
-  PDeQuantizationFunc			          pfDequantizationFour4x4;
-  PDeQuantizationHadamardFunc	  pfDequantizationIHadamard4x4;
-  PIDctFunc				                      pfIDctFourT4;
-  PIDctFunc				                      pfIDctT4;
-  PIDctFunc				                      pfIDctI16x16Dc;
+  PDeQuantizationFunc              pfDequantization4x4;
+  PDeQuantizationFunc                pfDequantizationFour4x4;
+  PDeQuantizationHadamardFunc    pfDequantizationIHadamard4x4;
+  PIDctFunc                              pfIDctFourT4;
+  PIDctFunc                              pfIDctT4;
+  PIDctFunc                              pfIDctI16x16Dc;
 
 
 
   // OPTI: if MT under diff uiSliceMode, need change here
-  //PDynamicSlicingStepBackFunc	dynslc_funcpointer_stepback;//svc_encode_slice.c
-  //DYNSLC_LNGTH_CRTL		dynslc_funcpointer_slcsize_ctrl;
+  //PDynamicSlicingStepBackFunc  dynslc_funcpointer_stepback;//svc_encode_slice.c
+  //DYNSLC_LNGTH_CRTL    dynslc_funcpointer_slcsize_ctrl;
 
   /* For Deblocking */
   DeblockingFunc                         pfDeblocking;
   PSetNoneZeroCountZeroFunc     pfSetNZCZero;
 
-  SWelsRcFunc					    pfRc;
+  SWelsRcFunc              pfRc;
   PAccumulateSadFunc         pfAccumulateSadForRc;
 
-  PSetMemoryZero				pfSetMemZeroSize8;			// for size is times to 8
-  PSetMemoryZero				pfSetMemZeroSize64Aligned16;			// for size is times of 64, and address is align to 16
-  PSetMemoryZero				pfSetMemZeroSize64;			// for size is times of 64, and don't know address is align to 16 or not
+  PSetMemoryZero        pfSetMemZeroSize8;      // for size is times to 8
+  PSetMemoryZero        pfSetMemZeroSize64Aligned16;      // for size is times of 64, and address is align to 16
+  PSetMemoryZero        pfSetMemZeroSize64;      // for size is times of 64, and don't know address is align to 16 or not
 
   PBuildRefListFunc     pBuildRefList;
   PMarkPicFunc          pMarkPic;
@@ -260,6 +265,6 @@
   PUpdateRefListFunc    pUpdateRefList;
 };
 
-}	//end of namespace WelsSVCEnc {
+}  //end of namespace WelsSVCEnc {
 
 #endif//WELS_ENCODER_FUNCTION_POINTERS_DEFINITION_H_
--- a/codec/encoder/core/src/svc_motion_estimate.cpp
+++ b/codec/encoder/core/src/svc_motion_estimate.cpp
@@ -29,16 +29,17 @@
  *     POSSIBILITY OF SUCH DAMAGE.
  *
  *
- * \file	svc motion estimate.c
+ * \file  svc motion estimate.c
  *
- * \brief	Interfaces introduced in svc mb motion estimation
+ * \brief  Interfaces introduced in svc mb motion estimation
  *
- * \date	08/11/2009 Created
+ * \date  08/11/2009 Created
  *
  *************************************************************************************
  */
 
 #include "cpu_core.h"
+#include "ls_defines.h"
 #include "svc_motion_estimate.h"
 
 namespace WelsSVCEnc {
@@ -67,16 +68,23 @@
     pFuncList->pfLineFullSearch = LineFullSearch_c;
     if ( uiCpuFlag & WELS_CPU_SSE41 ) {
     }
+
+    //for feature search
+    pFuncList->pfCalculateBlockFeatureOfFrame[0] = SumOf8x8BlockOfFrame_c;
+    pFuncList->pfCalculateBlockFeatureOfFrame[1] = SumOf16x16BlockOfFrame_c;
+    //TODO: is it possible to special-case widths that are multiples of 8, so as to speed up that case?
+    pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_c;
+    pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_c;
   }
 }
 
 /*!
- * \brief	BL mb motion estimate search
+ * \brief  BL mb motion estimate search
  *
- * \param	enc			Wels encoder context
- * \param	pMe	        Wels me information
+ * \param  enc      Wels encoder context
+ * \param  pMe          Wels me information
  *
- * \return	NONE
+ * \return  NONE
  */
 
 void WelsMotionEstimateSearch (SWelsFuncPtrList* pFuncList, void* pLplayer, void* pLpme, void* pLpslice) {
@@ -96,20 +104,20 @@
 }
 
 /*!
- * \brief	EL mb motion estimate initial point testing
+ * \brief  EL mb motion estimate initial point testing
  *
- * \param	pix_pFuncList	SSampleDealingFunc
- * \param	pMe	        Wels me information
- * \param	mv_range	search range in motion estimate
- * \param	point	    the best match point in motion estimation
+ * \param  pix_pFuncList  SSampleDealingFunc
+ * \param  pMe          Wels me information
+ * \param  mv_range  search range in motion estimate
+ * \param  point      the best match point in motion estimation
  *
- * \return	NONE
+ * \return  NONE
  */
 bool WelsMotionEstimateInitialPoint (SWelsFuncPtrList* pFuncList, SWelsME* pMe, SSlice* pSlice, int32_t iStrideEnc,
                                      int32_t iStrideRef) {
-  PSampleSadSatdCostFunc pSad		= pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
-  const uint16_t* kpMvdCost	= pMe->pMvdCost;
-  uint8_t* const kpEncMb		= pMe->pEncMb;
+  PSampleSadSatdCostFunc pSad    = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
+  const uint16_t* kpMvdCost  = pMe->pMvdCost;
+  uint8_t* const kpEncMb    = pMe->pEncMb;
   int16_t iMvc0, iMvc1;
   int32_t iSadCost;
   int32_t iBestSadCost;
@@ -116,17 +124,17 @@
   uint8_t* pRefMb;
   uint8_t* pFref2;
   uint32_t i;
-  const uint32_t kuiMvcNum		= pSlice->uiMvcNum;
-  const SMVUnitXY* kpMvcList	= &pSlice->sMvc[0];
-  const SMVUnitXY ksMvStartMin		= pSlice->sMvStartMin;
-  const SMVUnitXY ksMvStartMax		= pSlice->sMvStartMax;
-  const SMVUnitXY ksMvp		= pMe->sMvp;
+  const uint32_t kuiMvcNum    = pSlice->uiMvcNum;
+  const SMVUnitXY* kpMvcList  = &pSlice->sMvc[0];
+  const SMVUnitXY ksMvStartMin    = pSlice->sMvStartMin;
+  const SMVUnitXY ksMvStartMax    = pSlice->sMvStartMax;
+  const SMVUnitXY ksMvp    = pMe->sMvp;
   SMVUnitXY sMv;
 
   //  Step 1: Initial point prediction
   // init with sMvp
-  sMv.iMvX	= WELS_CLIP3 ((2 + ksMvp.iMvX) >> 2, ksMvStartMin.iMvX, ksMvStartMax.iMvX);
-  sMv.iMvY	= WELS_CLIP3 ((2 + ksMvp.iMvY) >> 2, ksMvStartMin.iMvY, ksMvStartMax.iMvY);
+  sMv.iMvX  = WELS_CLIP3 ((2 + ksMvp.iMvX) >> 2, ksMvStartMin.iMvX, ksMvStartMax.iMvX);
+  sMv.iMvY  = WELS_CLIP3 ((2 + ksMvp.iMvY) >> 2, ksMvStartMin.iMvY, ksMvStartMax.iMvY);
 
   pRefMb = &pMe->pRefMb[sMv.iMvY * iStrideRef + sMv.iMvX];
 
@@ -171,7 +179,7 @@
 
 void CalculateSatdCost( PSampleSadSatdCostFunc pSatd, void * vpMe,
                        const int32_t kiEncStride, const int32_t kiRefStride ) {
-  SWelsME* pMe						 = static_cast<SWelsME *>(vpMe);
+  SWelsME* pMe             = static_cast<SWelsME *>(vpMe);
   pMe->uSadPredISatd.uiSatd = pSatd(pMe->pEncMb, kiEncStride, pMe->pRefMb, kiRefStride);
   pMe->uiSatdCost = pMe->uSadPredISatd.uiSatd + COST_MVD (pMe->pMvdCost, pMe->sMv.iMvX - pMe->sMvp.iMvX,
                                                             pMe->sMv.iMvY - pMe->sMvp.iMvY);
@@ -266,7 +274,7 @@
 bool CheckDirectionalMv(PSampleSadSatdCostFunc pSad, void * vpMe,
                       const SMVUnitXY ksMinMv, const SMVUnitXY ksMaxMv, const int32_t kiEncStride, const int32_t kiRefStride,
                       int32_t& iBestSadCost) {
-  SWelsME* pMe						 = static_cast<SWelsME *>(vpMe);
+  SWelsME* pMe             = static_cast<SWelsME *>(vpMe);
   const int16_t kiMvX = pMe->sDirectionalMv.iMvX;
   const int16_t kiMvY = pMe->sDirectionalMv.iMvY;
 
@@ -295,34 +303,34 @@
 // Cross Search Basics
 /////////////////////////
 void VerticalFullSearchUsingSSE41( void *pFunc, void *vpMe,
-														uint16_t* pMvdTable, const int32_t kiFixedMvd,
-														const int32_t kiEncStride, const int32_t kiRefStride,
-													const int32_t kiMinPos, const int32_t kiMaxPos,
+                            uint16_t* pMvdTable, const int32_t kiFixedMvd,
+                            const int32_t kiEncStride, const int32_t kiRefStride,
+                          const int32_t kiMinPos, const int32_t kiMaxPos,
                           const bool bVerticalSearch ) {
   SWelsFuncPtrList *pFuncList      = static_cast<SWelsFuncPtrList *>(pFunc);
-  SWelsME *pMe				                    = static_cast<SWelsME *>(vpMe);
+  SWelsME *pMe                            = static_cast<SWelsME *>(vpMe);
 }
-void LineFullSearch_c(	void *pFunc, void *vpMe,
-													uint16_t* pMvdTable, const int32_t kiFixedMvd,
-													const int32_t kiEncStride, const int32_t kiRefStride,
-													const int32_t kiMinPos, const int32_t kiMaxPos,
+void LineFullSearch_c(  void *pFunc, void *vpMe,
+                          uint16_t* pMvdTable, const int32_t kiFixedMvd,
+                          const int32_t kiEncStride, const int32_t kiRefStride,
+                          const int32_t kiMinPos, const int32_t kiMaxPos,
                           const bool bVerticalSearch ) {
   SWelsFuncPtrList *pFuncList      = static_cast<SWelsFuncPtrList *>(pFunc);
-  SWelsME *pMe				                    = static_cast<SWelsME *>(vpMe);
+  SWelsME *pMe                            = static_cast<SWelsME *>(vpMe);
   PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
-  const int32_t kiCurMeBlockPix	= bVerticalSearch?pMe->iCurMeBlockPixY:pMe->iCurMeBlockPixX;
+  const int32_t kiCurMeBlockPix  = bVerticalSearch?pMe->iCurMeBlockPixY:pMe->iCurMeBlockPixX;
   const int32_t kiStride = bVerticalSearch?kiRefStride:1;
-  uint8_t* pRef			      = &pMe->pColoRefMb[(kiMinPos - kiCurMeBlockPix)*kiStride];
+  uint8_t* pRef            = &pMe->pColoRefMb[(kiMinPos - kiCurMeBlockPix)*kiStride];
   uint16_t* pMvdCost  = &(pMvdTable[kiMinPos<<2]);
-  uint32_t uiBestCost	  = 0xFFFFFFFF;
-  int32_t iBestPos		   = 0;
+  uint32_t uiBestCost    = 0xFFFFFFFF;
+  int32_t iBestPos       = 0;
 
   for ( int32_t iTargetPos = kiMinPos; iTargetPos < kiMaxPos; ++ iTargetPos ) {
-    uint8_t* const kpEncMb	= pMe->pEncMb;
+    uint8_t* const kpEncMb  = pMe->pEncMb;
     uint32_t uiSadCost = pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + *pMvdCost);
     if (uiSadCost < uiBestCost) {
-      uiBestCost	= uiSadCost;
-      iBestPos	= iTargetPos;
+      uiBestCost  = uiSadCost;
+      iBestPos  = iTargetPos;
     }
     pRef += kiStride;
     pMvdCost+=4;
@@ -400,8 +408,8 @@
 int32_t RequestScreenBlockFeatureStorage( CMemoryAlign *pMa, const int32_t kiFeatureStrategyIndex,
                                          const int32_t kiFrameWidth,  const int32_t kiFrameHeight, const int32_t kiMe16x16,  const int32_t kiMe8x8,
                                          SScreenBlockFeatureStorage* pScreenBlockFeatureStorage) {
-#define LIST_SIZE_SUM_16x16	0x0FF01		//(256*255+1)
-#define LIST_SIZE_SUM_8x8	    0x03FC1		//(64*255+1)
+#define LIST_SIZE_SUM_16x16  0x0FF01    //(256*255+1)
+#define LIST_SIZE_SUM_8x8      0x03FC1    //(64*255+1)
 
   if (((kiMe8x8&ME_FME)==ME_FME) && ((kiMe16x16&ME_FME)==ME_FME)) {
     return ENC_RETURN_UNSUPPORTED_PARA;
@@ -411,7 +419,7 @@
   const bool bIsBlock8x8 = ((kiMe8x8 & ME_FME)==ME_FME);
   const int32_t kiMarginSize = bIsBlock8x8?8:16;
   const int32_t kiFrameSize = (kiFrameWidth-kiMarginSize) * (kiFrameHeight-kiMarginSize);
-  const int32_t kiListSize	= (0==kiFeatureStrategyIndex)?(bIsBlock8x8 ? LIST_SIZE_SUM_8x8 : LIST_SIZE_SUM_16x16):256;
+  const int32_t kiListSize  = (0==kiFeatureStrategyIndex)?(bIsBlock8x8 ? LIST_SIZE_SUM_8x8 : LIST_SIZE_SUM_16x16):256;
 
   pScreenBlockFeatureStorage->pTimesOfFeatureValue = (uint32_t*)pMa->WelsMalloc(kiListSize*sizeof(uint32_t),"pScreenBlockFeatureStorage->pTimesOfFeatureValue");
   WELS_VERIFY_RETURN_IF(ENC_RETURN_MEMALLOCERR, NULL == pScreenBlockFeatureStorage->pTimesOfFeatureValue)
@@ -422,7 +430,7 @@
   pScreenBlockFeatureStorage->pLocationPointer = (uint16_t*)pMa->WelsMalloc(2*kiFrameSize*sizeof(uint16_t), "pScreenBlockFeatureStorage->pLocationPointer");
   WELS_VERIFY_RETURN_IF(ENC_RETURN_MEMALLOCERR, NULL == pScreenBlockFeatureStorage->pLocationPointer)
 
-  pScreenBlockFeatureStorage->iActualListSize	= kiListSize;
+  pScreenBlockFeatureStorage->iActualListSize  = kiListSize;
   return ENC_RETURN_SUCCESS;
 }
 int32_t ReleaseScreenBlockFeatureStorage( CMemoryAlign *pMa, SScreenBlockFeatureStorage* pScreenBlockFeatureStorage ) {
@@ -440,6 +448,137 @@
   }
   return ENC_RETURN_UNEXPECTED;
 }
+
+//preprocess related
+// Returns the sum of the 64 bytes of the 8x8 block whose top-left byte is
+// pRef[0]; consecutive rows of the block are kiRefStride bytes apart.
+int32_t SumOf8x8SingleBlock_c(uint8_t* pRef, const int32_t kiRefStride) {
+  int32_t iTotal = 0;
+  const uint8_t* pRow = pRef;
+  for (int32_t iRow = 0; iRow < 8; ++iRow, pRow += kiRefStride) {
+    for (int32_t iCol = 0; iCol < 8; ++iCol)
+      iTotal += pRow[iCol];
+  }
+  return iTotal;
+}
+// Returns the sum of the 256 bytes of the 16x16 block whose top-left byte is
+// pRef[0]; consecutive rows of the block are kiRefStride bytes apart.
+int32_t SumOf16x16SingleBlock_c(uint8_t* pRef, const int32_t kiRefStride) {
+  int32_t iTotal = 0;
+  const uint8_t* pRow = pRef;
+  for (int32_t iRow = 0; iRow < 16; ++iRow) {
+    for (int32_t iCol = 0; iCol < 16; ++iCol) {
+      iTotal += pRow[iCol];
+    }
+    pRow += kiRefStride;
+  }
+  return iTotal;
+}
+
+// Computes, for every (x, y) in [0, kiWidth) x [0, kiHeight), the byte sum of the
+// 8x8 block anchored at pRefPicture[y*kiRefStride + x]. The sum is stored in
+// pFeatureOfBlock[y*kiWidth + x] and pTimesOfFeatureValue[sum] is incremented,
+// so pTimesOfFeatureValue ends up as a histogram of the block sums.
+// The histogram is only incremented here, never reset.
+void SumOf8x8BlockOfFrame_c(uint8_t *pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+                                              uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
+{
+  for (int32_t iRow = 0; iRow < kiHeight; ++iRow) {
+    uint8_t* pRowStart = pRefPicture + kiRefStride * iRow;
+    uint16_t* pRowOut = pFeatureOfBlock + kiWidth * iRow;
+    for (int32_t iCol = 0; iCol < kiWidth; ++iCol) {
+      const int32_t iBlockSum = SumOf8x8SingleBlock_c(pRowStart + iCol, kiRefStride);
+      pRowOut[iCol] = iBlockSum;
+      pTimesOfFeatureValue[iBlockSum]++;
+    }
+  }
+}
+
+// Computes, for every (x, y) in [0, kiWidth) x [0, kiHeight), the byte sum of the
+// 16x16 block anchored at pRefPicture[y*kiRefStride + x]. The sum is stored in
+// pFeatureOfBlock[y*kiWidth + x] and pTimesOfFeatureValue[sum] is incremented.
+// The histogram is only incremented here, never reset.
+// TODO: same as SumOf8x8BlockOfFrame_c except for the single-block function called; refactor-able?
+void SumOf16x16BlockOfFrame_c(uint8_t *pRefPicture, const int32_t kiWidth, const int32_t kiHeight, const int32_t kiRefStride,
+                                              uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[])
+{
+  for (int32_t iRow = 0; iRow < kiHeight; ++iRow) {
+    uint8_t* pRowStart = pRefPicture + kiRefStride * iRow;
+    uint16_t* pRowOut = pFeatureOfBlock + kiWidth * iRow;
+    for (int32_t iCol = 0; iCol < kiWidth; ++iCol) {
+      const int32_t iBlockSum = SumOf16x16SingleBlock_c(pRowStart + iCol, kiRefStride);
+      pRowOut[iCol] = iBlockSum;
+      pTimesOfFeatureValue[iBlockSum]++;
+    }
+  }
+}
+
+// Splits pBuf into kiListSize back-to-back slices, slice i being 2*pTimesOfFeatureValue[i] uint16_t long,
+// and records the start of slice i in both pLocationOfFeature[i] and pFeatureValuePointerList[i].
+void InitializeHashforFeature_c( uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
+                                uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList )
+{
+  uint16_t* pSliceStart = pBuf;
+  for (int32_t iFeature = 0; iFeature < kiListSize; ++iFeature) {
+    pFeatureValuePointerList[iFeature] = pSliceStart;
+    pLocationOfFeature[iFeature] = pSliceStart;
+    pSliceStart += 2 * pTimesOfFeatureValue[iFeature];
+  }
+}
+// Walks the kiWidth x kiHeight feature map row by row. For each position (x, y) it
+// writes the 32-bit value ((4*y)<<16 | (4*x)) through ST32 at the cursor stored in
+// pFeatureValuePointerList[feature], then advances that cursor by two uint16_t.
+// The factor 4 presumably converts to quarter-pel units (per the function name).
+void FillQpelLocationByFeatureValue_c( uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight,
+                                       uint16_t** pFeatureValuePointerList )
+{
+  const uint16_t* pFeatureRow = pFeatureOfBlock;
+  for (int32_t iRow = 0; iRow < kiHeight; ++iRow) {
+    const int32_t iQpelRow = iRow * 4;
+    for (int32_t iCol = 0; iCol < kiWidth; ++iCol) {
+      uint16_t*& pCursor = pFeatureValuePointerList[pFeatureRow[iCol]];
+      ST32( &pCursor[0], ((iQpelRow<<16)|(iCol<<2)) );
+      pCursor += 2;
+    }
+    pFeatureRow += kiWidth;
+  }
+}
+// Rebuilds the feature hash stored in pScreenBlockFeatureStorage from plane 0 of pRef:
+//  1) clears the histogram, then pfCalculateBlockFeatureOfFrame[iIs16x16] fills
+//     pFeatureOfBlockPointer and pTimesOfFeatureValue,
+//  2) InitializeHashforFeature_c points pLocationOfFeature[i] at feature i's slice of pLocationPointer,
+//  3) FillQpelLocationByFeatureValue_c writes each block position into the slice of its feature.
+// Block origins are limited to (width - N) x (height - N) positions, N = 16 or 8.
+void CalculateFeatureOfBlock( SWelsFuncPtrList *pFunc, SPicture* pRef,
+                         SScreenBlockFeatureStorage* pScreenBlockFeatureStorage)
+{
+  uint16_t* pFeatureOfBlock = pScreenBlockFeatureStorage->pFeatureOfBlockPointer;
+  uint32_t* pTimesOfFeatureValue = pScreenBlockFeatureStorage->pTimesOfFeatureValue;
+  const int32_t iIs16x16 = pScreenBlockFeatureStorage->iIs16x16;
+  const int32_t kiEdgeDiscard = (iIs16x16?16:8);//this is to save complexity of padding on pRef
+  const int32_t kiWidth = pRef->iWidthInPixel - kiEdgeDiscard;
+  const int32_t kiHeight = pRef->iHeightInPixel - kiEdgeDiscard;
+  const int32_t kiActualListSize = pScreenBlockFeatureStorage->iActualListSize;
+  // NOTE(review): scratch array of at least 65281 pointers lives on the stack; consider heap storage.
+  uint16_t* pFeatureValuePointerList[WELS_MAX(LIST_SIZE_SUM_16x16,LIST_SIZE_MSE_16x16)] = {0};
+
+  memset(pTimesOfFeatureValue, 0, sizeof(*pTimesOfFeatureValue)*kiActualListSize);
+  (pFunc->pfCalculateBlockFeatureOfFrame[iIs16x16])(pRef->pData[0], kiWidth, kiHeight, pRef->iLineSize[0],
+                                                    pFeatureOfBlock, pTimesOfFeatureValue);
+
+  InitializeHashforFeature_c( pTimesOfFeatureValue, pScreenBlockFeatureStorage->pLocationPointer, kiActualListSize,
+                              pScreenBlockFeatureStorage->pLocationOfFeature, pFeatureValuePointerList );
+
+  FillQpelLocationByFeatureValue_c(pFeatureOfBlock, kiWidth, kiHeight, pFeatureValuePointerList);
+}
+
+// Runs CalculateFeatureOfBlock on pRef, then sets bRefBlockFeatureCalculated on the storage.
+void PerformFMEPreprocess( SWelsFuncPtrList *pFunc, SPicture* pRef,
+                          SScreenBlockFeatureStorage* pScreenBlockFeatureStorage) {
+  CalculateFeatureOfBlock( pFunc, pRef, pScreenBlockFeatureStorage );
+  pScreenBlockFeatureStorage->bRefBlockFeatureCalculated = true;
+}
+
 //search related
 void SetFeatureSearchIn( SWelsFuncPtrList *pFunc,  const SWelsME& sMe,
                         const SSlice *pSlice, SScreenBlockFeatureStorage* pRefFeatureStorage,
@@ -446,7 +585,7 @@
                         const int32_t kiEncStride, const int32_t kiRefStride,
                         SFeatureSearchIn* pFeatureSearchIn ) {
   pFeatureSearchIn->pSad = pFunc->sSampleDealingFuncs.pfSampleSad[sMe.uiBlockSize];
-  //pFeatureSearchIn->iFeatureOfCurrent=
+  pFeatureSearchIn->iFeatureOfCurrent=pFunc->pfCalculateSingleBlockFeature[BLOCK_16x16==sMe.uiBlockSize](sMe.pEncMb, kiEncStride);
 
   pFeatureSearchIn->pEnc       = sMe.pEncMb;
   pFeatureSearchIn->pColoRef = sMe.pColoRefMb;