shithub: openh264

Download patch

ref: c3cfce52234cff956e541896886c25d8e810ccba
parent: 4da932426cd55708d3fc0227c4b7cc59f125fcad
author: huili2 <huili2@cisco.com>
date: Mon Jun 1 11:11:20 EDT 2015

modify some functions extending to sub8x8 usage, especially in ME part

--- a/codec/common/inc/copy_mb.h
+++ b/codec/common/inc/copy_mb.h
@@ -38,7 +38,9 @@
 /****************************************************************************
  * Copy functions
  ****************************************************************************/
-void WelsCopy4x4 (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy4x4_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy8x4_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy4x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
 void WelsCopy8x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
 void WelsCopy8x16_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);	//
 void WelsCopy16x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);	//
--- a/codec/common/inc/sad_common.h
+++ b/codec/common/inc/sad_common.h
@@ -41,8 +41,8 @@
 int32_t WelsSampleSad16x8_c (uint8_t*, int32_t, uint8_t*, int32_t);
 int32_t WelsSampleSad8x16_c (uint8_t*, int32_t, uint8_t*, int32_t);
 int32_t WelsSampleSad8x8_c (uint8_t*, int32_t, uint8_t*, int32_t);
-//int32_t WelsSampleSad8x4( uint8_t *, int32_t, uint8_t *, int32_t );
-//int32_t WelsSampleSad4x8( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSad8x4_c( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSad4x8_c( uint8_t *, int32_t, uint8_t *, int32_t );
 int32_t WelsSampleSad4x4_c (uint8_t*, int32_t, uint8_t*, int32_t);
 
 
@@ -52,6 +52,8 @@
 void WelsSampleSadFour8x16_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
 void WelsSampleSadFour8x8_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
 void WelsSampleSadFour4x4_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
+void WelsSampleSadFour8x4_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
+void WelsSampleSadFour4x8_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
 
 #if defined(__cplusplus)
 extern "C" {
--- a/codec/common/src/copy_mb.cpp
+++ b/codec/common/src/copy_mb.cpp
@@ -45,7 +45,7 @@
 /****************************************************************************
  * Copy functions
  ****************************************************************************/
-void WelsCopy4x4 (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
+void WelsCopy4x4_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
   const int32_t kiSrcStride2 = iStrideS << 1;
   const int32_t kiSrcStride3 = iStrideS + kiSrcStride2;
   const int32_t kiDstStride2 = iStrideD << 1;
@@ -55,6 +55,14 @@
   ST32 (pDst + iStrideD,	LD32 (pSrc + iStrideS));
   ST32 (pDst + kiDstStride2, LD32 (pSrc + kiSrcStride2));
   ST32 (pDst + kiDstStride3, LD32 (pSrc + kiSrcStride3));
+}
+void WelsCopy8x4_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) { // 8x4 copy built from two side-by-side WelsCopy4x4_c calls
+  WelsCopy4x4_c (pDst, iStrideD, pSrc, iStrideS); // first half: starts at the block origin
+  WelsCopy4x4_c (pDst + 4, iStrideD, pSrc + 4, iStrideS); // second half: both pointers moved 4 bytes along the row, strides unchanged
+}
+void WelsCopy4x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) { // 4x8 copy built from two vertically stacked WelsCopy4x4_c calls
+  WelsCopy4x4_c (pDst, iStrideD, pSrc, iStrideS); // upper half: starts at the block origin
+  WelsCopy4x4_c (pDst + (iStrideD << 2), iStrideD, pSrc + (iStrideS << 2), iStrideS); // lower half: each pointer advanced by 4 * its own stride
 }
 void WelsCopy8x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
   int32_t i;
--- a/codec/common/src/sad_common.cpp
+++ b/codec/common/src/sad_common.cpp
@@ -59,6 +59,20 @@
   return iSadSum;
 }
 
+int32_t WelsSampleSad8x4_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) { // returns the sum of two WelsSampleSad4x4_c results placed side by side
+  int32_t iSadSum = 0;
+  iSadSum += WelsSampleSad4x4_c (pSample1,     iStride1, pSample2,     iStride2); // first half: block origin of both samples
+  iSadSum += WelsSampleSad4x4_c (pSample1 + 4, iStride1, pSample2 + 4, iStride2); // second half: both samples moved 4 bytes along the row
+  return iSadSum;
+}
+
+int32_t WelsSampleSad4x8_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) { // returns the sum of two WelsSampleSad4x4_c results stacked vertically
+  int32_t iSadSum = 0;
+  iSadSum += WelsSampleSad4x4_c (pSample1,                   iStride1, pSample2,                   iStride2); // upper half: block origin of both samples
+  iSadSum += WelsSampleSad4x4_c (pSample1 + (iStride1 << 2), iStride1, pSample2 + (iStride2 << 2), iStride2); // lower half: each sample advanced by 4 * its own stride
+  return iSadSum;
+}
+
 int32_t WelsSampleSad8x8_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
   int32_t iSadSum = 0;
   int32_t i = 0;
@@ -136,4 +150,16 @@
   * (pSad + 1) = WelsSampleSad4x4_c (iSample1, iStride1, (iSample2 + iStride2), iStride2);
   * (pSad + 2) = WelsSampleSad4x4_c (iSample1, iStride1, (iSample2 - 1), iStride2);
   * (pSad + 3) = WelsSampleSad4x4_c (iSample1, iStride1, (iSample2 + 1), iStride2);
+}
+void WelsSampleSadFour8x4_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad) { // writes four 8x4 SADs of iSample1 against iSample2 shifted by one sample in each direction
+  * (pSad)     = WelsSampleSad8x4_c (iSample1, iStride1, (iSample2 - iStride2), iStride2); // pSad[0]: iSample2 moved back by one stride (row above)
+  * (pSad + 1) = WelsSampleSad8x4_c (iSample1, iStride1, (iSample2 + iStride2), iStride2); // pSad[1]: iSample2 moved forward by one stride (row below)
+  * (pSad + 2) = WelsSampleSad8x4_c (iSample1, iStride1, (iSample2 - 1), iStride2); // pSad[2]: iSample2 moved one byte back (left)
+  * (pSad + 3) = WelsSampleSad8x4_c (iSample1, iStride1, (iSample2 + 1), iStride2); // pSad[3]: iSample2 moved one byte forward (right)
+}
+void WelsSampleSadFour4x8_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad) { // writes four 4x8 SADs of iSample1 against iSample2 shifted by one sample in each direction
+  * (pSad)     = WelsSampleSad4x8_c (iSample1, iStride1, (iSample2 - iStride2), iStride2); // pSad[0]: iSample2 moved back by one stride (row above)
+  * (pSad + 1) = WelsSampleSad4x8_c (iSample1, iStride1, (iSample2 + iStride2), iStride2); // pSad[1]: iSample2 moved forward by one stride (row below)
+  * (pSad + 2) = WelsSampleSad4x8_c (iSample1, iStride1, (iSample2 - 1), iStride2); // pSad[2]: iSample2 moved one byte back (left)
+  * (pSad + 3) = WelsSampleSad4x8_c (iSample1, iStride1, (iSample2 + 1), iStride2); // pSad[3]: iSample2 moved one byte forward (right)
 }
--- a/codec/encoder/core/inc/mb_cache.h
+++ b/codec/encoder/core/inc/mb_cache.h
@@ -79,7 +79,7 @@
 //	must follow with iNonZeroCoeffCount!
 
 int32_t     iSadCost[4];			//avail 1; unavail 0
-SMVUnitXY  sMbMvp[MB_BLOCK8x8_NUM];// for write bs
+SMVUnitXY  sMbMvp[MB_BLOCK4x4_NUM];// for write bs
 
 //for residual decoding (recovery) at the side of Encoder
 int16_t* pCoeffLevel;		// tmep
--- a/codec/encoder/core/inc/md.h
+++ b/codec/encoder/core/inc/md.h
@@ -109,6 +109,9 @@
   SWelsME			sMe8x8[4];
   SWelsME			sMe16x8[2];
   SWelsME			sMe8x16[2];
+  SWelsME			sMe4x4[4][4];
+  SWelsME			sMe8x4[4][2];
+  SWelsME			sMe4x8[4][2];
 //		SMVUnitXY		i_mvbs[MB_BLOCK8x8_NUM];	//scaled MVB
 } sMe;
 
--- a/codec/encoder/core/inc/wels_const.h
+++ b/codec/encoder/core/inc/wels_const.h
@@ -142,9 +142,9 @@
 BLOCK_8x16     = 2,
 BLOCK_8x8      = 3,
 BLOCK_4x4      = 4,
-// BLOCK_8x4      = 5,
-// BLOCK_4x8      = 6,
-BLOCK_SIZE_ALL = 5
+BLOCK_8x4      = 5,
+BLOCK_4x8      = 6,
+BLOCK_SIZE_ALL = 7
 };
 
 typedef enum {
--- a/codec/encoder/core/inc/wels_func_ptr_def.h
+++ b/codec/encoder/core/inc/wels_func_ptr_def.h
@@ -155,7 +155,7 @@
 typedef int32_t (*PCalculateSingleBlockFeature) (uint8_t* pRef, const int32_t kiRefStride);
 typedef void (*PUpdateFMESwitch) (SDqLayer* pCurLayer);
 
-#define     MAX_BLOCK_TYPE 5 // prev 7
+#define     MAX_BLOCK_TYPE BLOCK_SIZE_ALL
 typedef struct TagSampleDealingFunc {
   PSampleSadSatdCostFunc            pfSampleSad[MAX_BLOCK_TYPE];
   PSampleSadSatdCostFunc            pfSampleSatd[MAX_BLOCK_TYPE];
@@ -235,8 +235,10 @@
   PCopyFunc      pfCopy8x8Aligned;    //svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c md.c
   PCopyFunc    pfCopy16x8NotAligned;  //for MeRefineFracPixel 16x8 based
   PCopyFunc    pfCopy8x16Aligned;    //for MeRefineFracPixel 8x16 based
+  PCopyFunc      pfCopy4x4;    //not sure if aligned or not, need further tune
+  PCopyFunc      pfCopy8x4;    //not sure if aligned or not, need further tune
+  PCopyFunc      pfCopy4x8;    //not sure if aligned or not, need further tune
 
-  //svc_encode_mb.c encode_mb_aux.c
   PDctFunc          pfDctT4;
   PDctFunc                pfDctFourT4;
 
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -467,7 +467,9 @@
     pFuncList->pfCopy16x16NotAligned    = WelsCopy16x16_c;
   pFuncList->pfCopy16x8NotAligned       = WelsCopy16x8_c;
   pFuncList->pfCopy8x16Aligned          = WelsCopy8x16_c;
-
+  pFuncList->pfCopy4x4           = WelsCopy4x4_c;
+  pFuncList->pfCopy8x4           = WelsCopy8x4_c;
+  pFuncList->pfCopy4x8           = WelsCopy4x8_c;
   pFuncList->pfQuantizationHadamard2x2          = WelsHadamardQuant2x2_c;
   pFuncList->pfQuantizationHadamard2x2Skip      = WelsHadamardQuant2x2Skip_c;
   pFuncList->pfTransformHadamard4x4Dc           = WelsHadamardT4Dc_c;
--- a/codec/encoder/core/src/encoder_ext.cpp
+++ b/codec/encoder/core/src/encoder_ext.cpp
@@ -2983,7 +2983,9 @@
       pFuncList->pfSearchMethod[BLOCK_16x8] =
         pFuncList->pfSearchMethod[BLOCK_8x16] =
           pFuncList->pfSearchMethod[BLOCK_8x8] =
-            pFuncList->pfSearchMethod[BLOCK_4x4] = WelsDiamondSearch;
+            pFuncList->pfSearchMethod[BLOCK_4x4] =
+              pFuncList->pfSearchMethod[BLOCK_8x4] =
+                pFuncList->pfSearchMethod[BLOCK_4x8] = WelsDiamondSearch;
     pFuncList->pfFirstIntraMode = WelsMdFirstIntraMode;
     pFuncList->sSampleDealingFuncs.pfMeCost = pCtx->pFuncList->sSampleDealingFuncs.pfSampleSatd;
     pFuncList->pfSetScrollingMv = SetScrollingMvToMdNull;
--- a/codec/encoder/core/src/sample.cpp
+++ b/codec/encoder/core/src/sample.cpp
@@ -95,6 +95,21 @@
 
   return ((iSatdSum + 1) >> 1);
 }
+
+int32_t WelsSampleSatd8x4_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) { // returns the sum of two WelsSampleSatd4x4_c results placed side by side
+  int32_t iSatdSum = 0;
+  iSatdSum += WelsSampleSatd4x4_c (pSample1,   iStride1, pSample2,   iStride2); // first half: block origin of both samples
+  iSatdSum += WelsSampleSatd4x4_c (pSample1 + 4, iStride1, pSample2 + 4, iStride2); // second half: both samples moved 4 bytes along the row
+  return iSatdSum;
+}
+
+int32_t WelsSampleSatd4x8_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) { // returns the sum of two WelsSampleSatd4x4_c results stacked vertically
+  int32_t iSatdSum = 0;
+  iSatdSum += WelsSampleSatd4x4_c (pSample1,                   iStride1, pSample2,                   iStride2); // upper half: block origin of both samples
+  iSatdSum += WelsSampleSatd4x4_c (pSample1 + (iStride1 << 2), iStride1, pSample2 + (iStride2 << 2), iStride2); // lower half: each sample advanced by 4 * its own stride
+  return iSatdSum;
+}
+
 int32_t WelsSampleSatd8x8_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
   int32_t iSatdSum = 0;
 
@@ -325,6 +340,8 @@
   pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16 ] = WelsSampleSad8x16_c;
   pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8  ] = WelsSampleSad8x8_c;
   pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4  ] = WelsSampleSad4x4_c;
+  pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x4  ] = WelsSampleSad8x4_c;
+  pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x8  ] = WelsSampleSad4x8_c;
 
   //pfSampleSatd init
   pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_c;
@@ -332,6 +349,8 @@
   pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_c;
   pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8  ] = WelsSampleSatd8x8_c;
   pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4  ] = WelsSampleSatd4x4_c;
+  pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x4  ] = WelsSampleSatd8x4_c;
+  pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x8  ] = WelsSampleSatd4x8_c;
 
   pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_c;
   pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_c;
@@ -338,6 +357,8 @@
   pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_c;
   pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_c;
   pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_c;
+  pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x4] = WelsSampleSadFour8x4_c;
+  pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x8] = WelsSampleSadFour4x8_c;
 
   pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd   = NULL;
   pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd   = NULL;
--- a/codec/encoder/core/src/svc_encode_mb.cpp
+++ b/codec/encoder/core/src/svc_encode_mb.cpp
@@ -174,7 +174,7 @@
     pFuncList->pfDequantization4x4 (pResI4x4, g_kuiDequantCoeff[uiQp]);
     pFuncList->pfIDctT4 (pPredI4x4, iRecStride, pBestPred, 4, pResI4x4);
   } else
-    WelsCopy4x4 (pPredI4x4, iRecStride, pBestPred, 4);
+    pFuncList->pfCopy4x4 (pPredI4x4, iRecStride, pBestPred, 4);
 }
 
 void WelsEncInterY (SWelsFuncPtrList* pFuncList, SMB* pCurMb, SMbCache* pMbCache) {
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -241,7 +241,9 @@
       EXPECT_EQ(ref_dst[i*iDStride+j], dst[i*iDStride+j]); \
 }
 
-GENERATE_UT_FOR_COPY (4, 4, WelsCopy4x4);
+GENERATE_UT_FOR_COPY (4, 4, WelsCopy4x4_c);
+GENERATE_UT_FOR_COPY (8, 4, WelsCopy8x4_c);
+GENERATE_UT_FOR_COPY (4, 8, WelsCopy4x8_c);
 GENERATE_UT_FOR_COPY (8, 8, WelsCopy8x8_c);
 GENERATE_UT_FOR_COPY (8, 16, WelsCopy8x16_c);
 GENERATE_UT_FOR_COPY (16, 8, WelsCopy16x8_c);
--- a/test/encoder/EncUT_Sample.cpp
+++ b/test/encoder/EncUT_Sample.cpp
@@ -188,6 +188,42 @@
   EXPECT_EQ (WelsSampleSad4x4_c (m_pPixSrcA, m_iStrideA, m_pPixSrcB, m_iStrideB), iSumSad);
 }
 
+TEST_F (SadSatdCFuncTest, WelsSampleSad8x4_c) { // compares WelsSampleSad8x4_c with a per-pixel reference sum
+  for (int i = 0; i < (m_iStrideA << 2); i++) // randomise the 4 rows of A that the block covers
+    m_pPixSrcA[i] = rand() % 256;
+  for (int i = 0; i < (m_iStrideB << 2); i++) // randomise the 4 rows of B that the block covers
+    m_pPixSrcB[i] = rand() % 256;
+  uint8_t* pPixA = m_pPixSrcA;
+  uint8_t* pPixB = m_pPixSrcB;
+
+  int32_t iSumSad = 0;
+  for (int i = 0; i < 4; i++) { // 4 rows
+    for (int j = 0; j < 8; j++) // 8 columns
+      iSumSad += abs (pPixA[j] - pPixB[j]);
+    pPixA += m_iStrideA; // step each buffer by its own stride
+    pPixB += m_iStrideB;
+  }
+  EXPECT_EQ (WelsSampleSad8x4_c (m_pPixSrcA, m_iStrideA, m_pPixSrcB, m_iStrideB), iSumSad);
+}
+
+TEST_F (SadSatdCFuncTest, WelsSampleSad4x8_c) { // compares WelsSampleSad4x8_c with a per-pixel reference sum
+  for (int i = 0; i < (m_iStrideA << 3); i++) // randomise all 8 rows of A that the block covers (was 4 rows, leaving rows 4..7 unset)
+    m_pPixSrcA[i] = rand() % 256;
+  for (int i = 0; i < (m_iStrideB << 3); i++) // randomise all 8 rows of B that the block covers (was 4 rows, leaving rows 4..7 unset)
+    m_pPixSrcB[i] = rand() % 256;
+  uint8_t* pPixA = m_pPixSrcA;
+  uint8_t* pPixB = m_pPixSrcB;
+
+  int32_t iSumSad = 0;
+  for (int i = 0; i < 8; i++) { // 8 rows
+    for (int j = 0; j < 4; j++) // 4 columns
+      iSumSad += abs (pPixA[j] - pPixB[j]);
+    pPixA += m_iStrideA; // step each buffer by its own stride
+    pPixB += m_iStrideB;
+  }
+  EXPECT_EQ (WelsSampleSad4x8_c (m_pPixSrcA, m_iStrideA, m_pPixSrcB, m_iStrideB), iSumSad);
+}
+
 TEST_F (SadSatdCFuncTest, WelsSampleSad8x8_c) {
   for (int i = 0; i < (m_iStrideA << 3); i++)
     m_pPixSrcA[i] = rand() % 256;
@@ -444,6 +480,51 @@
   EXPECT_EQ (m_pSad[0] + m_pSad[1] + m_pSad[2] + m_pSad[3], iSumSad);
 }
 
+TEST_F (SadSatdCFuncTest, WelsSampleSadFour8x4_c) { // compares the sum of the four SADs written by WelsSampleSadFour8x4_c with a per-pixel reference
+  for (int i = 0; i < (m_iStrideA << 3); i++) // randomise 8 rows of A (4 are read)
+    m_pPixSrcA[i] = rand() % 256;
+  for (int i = 0; i < (m_iStrideB << 3); i++) // randomise 8 rows of B (rows 0..5 are read)
+    m_pPixSrcB[i] = rand() % 256;
+  uint8_t* pPixA = m_pPixSrcA;
+  uint8_t* pPixB = m_pPixSrcB + m_iStrideB; // start one row in so the "- m_iStrideB" and "- 1" accesses stay at non-negative offsets
+
+  int32_t iSumSad = 0;
+  for (int i = 0; i < 4; i++) { // 4 rows
+    for (int j = 0; j < 8; j++) { // 8 columns
+      iSumSad += abs (pPixA[j] - pPixB[j - 1]); // left neighbour
+      iSumSad += abs (pPixA[j] - pPixB[j + 1]); // right neighbour
+      iSumSad += abs (pPixA[j] - pPixB[j - m_iStrideB]); // row above
+      iSumSad += abs (pPixA[j] - pPixB[j + m_iStrideB]); // row below
+    }
+    pPixA += m_iStrideA;
+    pPixB += m_iStrideB;
+  }
+  WelsSampleSadFour8x4_c (m_pPixSrcA, m_iStrideA, m_pPixSrcB + m_iStrideB, m_iStrideB, m_pSad);
+  EXPECT_EQ (m_pSad[0] + m_pSad[1] + m_pSad[2] + m_pSad[3], iSumSad); // only the total is checked, not each direction separately
+}
+
+TEST_F (SadSatdCFuncTest, WelsSampleSadFour4x8_c) { // compares the sum of the four SADs written by WelsSampleSadFour4x8_c with a per-pixel reference
+  for (int i = 0; i < (m_iStrideA << 3); i++) // randomise the 8 rows of A that are read
+    m_pPixSrcA[i] = rand() % 256;
+  for (int i = 0; i < (m_iStrideB * 10); i++) // rows 0..9 of B are read below (8-row window from row 1, shifted +/- one row); was 8 rows. NOTE(review): assumes the fixture buffer holds >= 10 rows - confirm
+    m_pPixSrcB[i] = rand() % 256;
+  uint8_t* pPixA = m_pPixSrcA;
+  uint8_t* pPixB = m_pPixSrcB + m_iStrideB; // start one row in so the "- m_iStrideB" and "- 1" accesses stay at non-negative offsets
+
+  int32_t iSumSad = 0;
+  for (int i = 0; i < 8; i++) { // 8 rows
+    for (int j = 0; j < 4; j++) { // 4 columns
+      iSumSad += abs (pPixA[j] - pPixB[j - 1]); // left neighbour
+      iSumSad += abs (pPixA[j] - pPixB[j + 1]); // right neighbour
+      iSumSad += abs (pPixA[j] - pPixB[j - m_iStrideB]); // row above
+      iSumSad += abs (pPixA[j] - pPixB[j + m_iStrideB]); // row below
+    }
+    pPixA += m_iStrideA;
+    pPixB += m_iStrideB;
+  }
+  WelsSampleSadFour4x8_c (m_pPixSrcA, m_iStrideA, m_pPixSrcB + m_iStrideB, m_iStrideB, m_pSad);
+  EXPECT_EQ (m_pSad[0] + m_pSad[1] + m_pSad[2] + m_pSad[3], iSumSad); // only the total is checked, not each direction separately
+}
 
 class SadSatdAssemblyFuncTest : public testing::Test {
  public: