ref: c3cfce52234cff956e541896886c25d8e810ccba
parent: 4da932426cd55708d3fc0227c4b7cc59f125fcad
author: huili2 <huili2@cisco.com>
date: Mon Jun 1 11:11:20 EDT 2015
modify some functions extending to sub8x8 usage, especially in ME part
--- a/codec/common/inc/copy_mb.h
+++ b/codec/common/inc/copy_mb.h
@@ -38,7 +38,9 @@
/****************************************************************************
* Copy functions
****************************************************************************/
-void WelsCopy4x4 (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy4x4_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy8x4_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy4x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
void WelsCopy8x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
void WelsCopy8x16_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS); //
void WelsCopy16x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS); //
--- a/codec/common/inc/sad_common.h
+++ b/codec/common/inc/sad_common.h
@@ -41,8 +41,8 @@
int32_t WelsSampleSad16x8_c (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSad8x16_c (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSad8x8_c (uint8_t*, int32_t, uint8_t*, int32_t);
-//int32_t WelsSampleSad8x4( uint8_t *, int32_t, uint8_t *, int32_t );
-//int32_t WelsSampleSad4x8( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSad8x4_c( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSad4x8_c( uint8_t *, int32_t, uint8_t *, int32_t );
int32_t WelsSampleSad4x4_c (uint8_t*, int32_t, uint8_t*, int32_t);
@@ -52,6 +52,8 @@
void WelsSampleSadFour8x16_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
void WelsSampleSadFour8x8_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
void WelsSampleSadFour4x4_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
+void WelsSampleSadFour8x4_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
+void WelsSampleSadFour4x8_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
#if defined(__cplusplus)
extern "C" {
--- a/codec/common/src/copy_mb.cpp
+++ b/codec/common/src/copy_mb.cpp
@@ -45,7 +45,7 @@
/****************************************************************************
* Copy functions
****************************************************************************/
-void WelsCopy4x4 (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
+void WelsCopy4x4_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
const int32_t kiSrcStride2 = iStrideS << 1;
const int32_t kiSrcStride3 = iStrideS + kiSrcStride2;
const int32_t kiDstStride2 = iStrideD << 1;
@@ -55,6 +55,14 @@
ST32 (pDst + iStrideD, LD32 (pSrc + iStrideS));
ST32 (pDst + kiDstStride2, LD32 (pSrc + kiSrcStride2));
ST32 (pDst + kiDstStride3, LD32 (pSrc + kiSrcStride3));
+}
+void WelsCopy8x4_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) { // 8x4 copy built from two 4x4 copies placed side by side
+ WelsCopy4x4_c (pDst, iStrideD, pSrc, iStrideS); // left half: starts at byte 0 of each row
+ WelsCopy4x4_c (pDst + 4, iStrideD, pSrc + 4, iStrideS); // right half: same rows, 4 bytes further along in both buffers
+}
+void WelsCopy4x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) { // 4x8 copy built from two 4x4 copies stacked vertically
+ WelsCopy4x4_c (pDst, iStrideD, pSrc, iStrideS); // upper half: starts at the given pointers
+ WelsCopy4x4_c (pDst + (iStrideD << 2), iStrideD, pSrc + (iStrideS << 2), iStrideS); // lower half: each pointer advanced by 4 of its own strides
}
void WelsCopy8x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
int32_t i;
--- a/codec/common/src/sad_common.cpp
+++ b/codec/common/src/sad_common.cpp
@@ -59,6 +59,20 @@
return iSadSum;
}
+int32_t WelsSampleSad8x4_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) { // returns the sum of two WelsSampleSad4x4_c results taken side by side
+ int32_t iSadSum = 0;
+ iSadSum += WelsSampleSad4x4_c (pSample1, iStride1, pSample2, iStride2); // left 4x4
+ iSadSum += WelsSampleSad4x4_c (pSample1 + 4, iStride1, pSample2 + 4, iStride2); // right 4x4: both samples shifted 4 bytes along the row
+ return iSadSum;
+}
+
+int32_t WelsSampleSad4x8_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) { // returns the sum of two WelsSampleSad4x4_c results stacked vertically
+ int32_t iSadSum = 0;
+ iSadSum += WelsSampleSad4x4_c (pSample1, iStride1, pSample2, iStride2); // upper 4x4
+ iSadSum += WelsSampleSad4x4_c (pSample1 + (iStride1 << 2), iStride1, pSample2 + (iStride2 << 2), iStride2); // lower 4x4: each sample advanced by 4 of its own strides
+ return iSadSum;
+}
+
int32_t WelsSampleSad8x8_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
int32_t iSadSum = 0;
int32_t i = 0;
@@ -136,4 +150,16 @@
* (pSad + 1) = WelsSampleSad4x4_c (iSample1, iStride1, (iSample2 + iStride2), iStride2);
* (pSad + 2) = WelsSampleSad4x4_c (iSample1, iStride1, (iSample2 - 1), iStride2);
* (pSad + 3) = WelsSampleSad4x4_c (iSample1, iStride1, (iSample2 + 1), iStride2);
+}
+void WelsSampleSadFour8x4_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad) { // writes four 8x4 SADs of iSample1 against iSample2 shifted by one stride / one byte into pSad[0..3]
+ * (pSad) = WelsSampleSad8x4_c (iSample1, iStride1, (iSample2 - iStride2), iStride2); // pSad[0]: iSample2 moved back by one stride
+ * (pSad + 1) = WelsSampleSad8x4_c (iSample1, iStride1, (iSample2 + iStride2), iStride2); // pSad[1]: iSample2 moved forward by one stride
+ * (pSad + 2) = WelsSampleSad8x4_c (iSample1, iStride1, (iSample2 - 1), iStride2); // pSad[2]: iSample2 moved back by one byte
+ * (pSad + 3) = WelsSampleSad8x4_c (iSample1, iStride1, (iSample2 + 1), iStride2); // pSad[3]: iSample2 moved forward by one byte
+}
+void WelsSampleSadFour4x8_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad) { // writes four 4x8 SADs of iSample1 against iSample2 shifted by one stride / one byte into pSad[0..3]
+ * (pSad) = WelsSampleSad4x8_c (iSample1, iStride1, (iSample2 - iStride2), iStride2); // pSad[0]: iSample2 moved back by one stride
+ * (pSad + 1) = WelsSampleSad4x8_c (iSample1, iStride1, (iSample2 + iStride2), iStride2); // pSad[1]: iSample2 moved forward by one stride
+ * (pSad + 2) = WelsSampleSad4x8_c (iSample1, iStride1, (iSample2 - 1), iStride2); // pSad[2]: iSample2 moved back by one byte
+ * (pSad + 3) = WelsSampleSad4x8_c (iSample1, iStride1, (iSample2 + 1), iStride2); // pSad[3]: iSample2 moved forward by one byte
}
--- a/codec/encoder/core/inc/mb_cache.h
+++ b/codec/encoder/core/inc/mb_cache.h
@@ -79,7 +79,7 @@
// must follow with iNonZeroCoeffCount!
int32_t iSadCost[4]; //avail 1; unavail 0
-SMVUnitXY sMbMvp[MB_BLOCK8x8_NUM];// for write bs
+SMVUnitXY sMbMvp[MB_BLOCK4x4_NUM];// for write bs
//for residual decoding (recovery) at the side of Encoder
int16_t* pCoeffLevel; // tmep
--- a/codec/encoder/core/inc/md.h
+++ b/codec/encoder/core/inc/md.h
@@ -109,6 +109,9 @@
SWelsME sMe8x8[4];
SWelsME sMe16x8[2];
SWelsME sMe8x16[2];
+ SWelsME sMe4x4[4][4];
+ SWelsME sMe8x4[4][2];
+ SWelsME sMe4x8[4][2];
// SMVUnitXY i_mvbs[MB_BLOCK8x8_NUM]; //scaled MVB
} sMe;
--- a/codec/encoder/core/inc/wels_const.h
+++ b/codec/encoder/core/inc/wels_const.h
@@ -142,9 +142,9 @@
BLOCK_8x16 = 2,
BLOCK_8x8 = 3,
BLOCK_4x4 = 4,
-// BLOCK_8x4 = 5,
-// BLOCK_4x8 = 6,
-BLOCK_SIZE_ALL = 5
+BLOCK_8x4 = 5,
+BLOCK_4x8 = 6,
+BLOCK_SIZE_ALL = 7
};
typedef enum {
--- a/codec/encoder/core/inc/wels_func_ptr_def.h
+++ b/codec/encoder/core/inc/wels_func_ptr_def.h
@@ -155,7 +155,7 @@
typedef int32_t (*PCalculateSingleBlockFeature) (uint8_t* pRef, const int32_t kiRefStride);
typedef void (*PUpdateFMESwitch) (SDqLayer* pCurLayer);
-#define MAX_BLOCK_TYPE 5 // prev 7
+#define MAX_BLOCK_TYPE BLOCK_SIZE_ALL
typedef struct TagSampleDealingFunc {
PSampleSadSatdCostFunc pfSampleSad[MAX_BLOCK_TYPE];
PSampleSadSatdCostFunc pfSampleSatd[MAX_BLOCK_TYPE];
@@ -235,8 +235,10 @@
PCopyFunc pfCopy8x8Aligned; //svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c md.c
PCopyFunc pfCopy16x8NotAligned; //for MeRefineFracPixel 16x8 based
PCopyFunc pfCopy8x16Aligned; //for MeRefineFracPixel 8x16 based
+ PCopyFunc pfCopy4x4; //not sure if aligned or not, need further tune
+ PCopyFunc pfCopy8x4; //not sure if aligned or not, need further tune
+ PCopyFunc pfCopy4x8; //not sure if aligned or not, need further tune
- //svc_encode_mb.c encode_mb_aux.c
PDctFunc pfDctT4;
PDctFunc pfDctFourT4;
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -467,7 +467,9 @@
pFuncList->pfCopy16x16NotAligned = WelsCopy16x16_c;
pFuncList->pfCopy16x8NotAligned = WelsCopy16x8_c;
pFuncList->pfCopy8x16Aligned = WelsCopy8x16_c;
-
+ pFuncList->pfCopy4x4 = WelsCopy4x4_c;
+ pFuncList->pfCopy8x4 = WelsCopy8x4_c;
+ pFuncList->pfCopy4x8 = WelsCopy4x8_c;
pFuncList->pfQuantizationHadamard2x2 = WelsHadamardQuant2x2_c;
pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_c;
pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_c;
--- a/codec/encoder/core/src/encoder_ext.cpp
+++ b/codec/encoder/core/src/encoder_ext.cpp
@@ -2983,7 +2983,9 @@
pFuncList->pfSearchMethod[BLOCK_16x8] =
pFuncList->pfSearchMethod[BLOCK_8x16] =
pFuncList->pfSearchMethod[BLOCK_8x8] =
- pFuncList->pfSearchMethod[BLOCK_4x4] = WelsDiamondSearch;
+ pFuncList->pfSearchMethod[BLOCK_4x4] =
+ pFuncList->pfSearchMethod[BLOCK_8x4] =
+ pFuncList->pfSearchMethod[BLOCK_4x8] = WelsDiamondSearch;
pFuncList->pfFirstIntraMode = WelsMdFirstIntraMode;
pFuncList->sSampleDealingFuncs.pfMeCost = pCtx->pFuncList->sSampleDealingFuncs.pfSampleSatd;
pFuncList->pfSetScrollingMv = SetScrollingMvToMdNull;
--- a/codec/encoder/core/src/sample.cpp
+++ b/codec/encoder/core/src/sample.cpp
@@ -95,6 +95,21 @@
return ((iSatdSum + 1) >> 1);
}
+
+int32_t WelsSampleSatd8x4_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) { // returns the sum of two WelsSampleSatd4x4_c results taken side by side
+ int32_t iSatdSum = 0;
+ iSatdSum += WelsSampleSatd4x4_c (pSample1, iStride1, pSample2, iStride2); // left 4x4
+ iSatdSum += WelsSampleSatd4x4_c (pSample1 + 4, iStride1, pSample2 + 4, iStride2); // right 4x4: both samples shifted 4 bytes along the row
+ return iSatdSum;
+}
+
+int32_t WelsSampleSatd4x8_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) { // returns the sum of two WelsSampleSatd4x4_c results stacked vertically
+ int32_t iSatdSum = 0;
+ iSatdSum += WelsSampleSatd4x4_c (pSample1, iStride1, pSample2, iStride2); // upper 4x4
+ iSatdSum += WelsSampleSatd4x4_c (pSample1 + (iStride1 << 2), iStride1, pSample2 + (iStride2 << 2), iStride2); // lower 4x4: each sample advanced by 4 of its own strides
+ return iSatdSum;
+}
+
int32_t WelsSampleSatd8x8_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
int32_t iSatdSum = 0;
@@ -325,6 +340,8 @@
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16 ] = WelsSampleSad8x16_c;
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8 ] = WelsSampleSad8x8_c;
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4 ] = WelsSampleSad4x4_c;
+ pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x4 ] = WelsSampleSad8x4_c;
+ pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x8 ] = WelsSampleSad4x8_c;
//pfSampleSatd init
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_c;
@@ -332,6 +349,8 @@
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_c;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8 ] = WelsSampleSatd8x8_c;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4 ] = WelsSampleSatd4x4_c;
+ pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x4 ] = WelsSampleSatd8x4_c;
+ pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x8 ] = WelsSampleSatd4x8_c;
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_c;
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_c;
@@ -338,6 +357,8 @@
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_c;
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_c;
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_c;
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x4] = WelsSampleSadFour8x4_c;
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x8] = WelsSampleSadFour4x8_c;
pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = NULL;
pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = NULL;
--- a/codec/encoder/core/src/svc_encode_mb.cpp
+++ b/codec/encoder/core/src/svc_encode_mb.cpp
@@ -174,7 +174,7 @@
pFuncList->pfDequantization4x4 (pResI4x4, g_kuiDequantCoeff[uiQp]);
pFuncList->pfIDctT4 (pPredI4x4, iRecStride, pBestPred, 4, pResI4x4);
} else
- WelsCopy4x4 (pPredI4x4, iRecStride, pBestPred, 4);
+ pFuncList->pfCopy4x4 (pPredI4x4, iRecStride, pBestPred, 4);
}
void WelsEncInterY (SWelsFuncPtrList* pFuncList, SMB* pCurMb, SMbCache* pMbCache) {
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -241,7 +241,9 @@
EXPECT_EQ(ref_dst[i*iDStride+j], dst[i*iDStride+j]); \
}
-GENERATE_UT_FOR_COPY (4, 4, WelsCopy4x4);
+GENERATE_UT_FOR_COPY (4, 4, WelsCopy4x4_c);
+GENERATE_UT_FOR_COPY (8, 4, WelsCopy8x4_c);
+GENERATE_UT_FOR_COPY (4, 8, WelsCopy4x8_c);
GENERATE_UT_FOR_COPY (8, 8, WelsCopy8x8_c);
GENERATE_UT_FOR_COPY (8, 16, WelsCopy8x16_c);
GENERATE_UT_FOR_COPY (16, 8, WelsCopy16x8_c);
--- a/test/encoder/EncUT_Sample.cpp
+++ b/test/encoder/EncUT_Sample.cpp
@@ -188,6 +188,42 @@
EXPECT_EQ (WelsSampleSad4x4_c (m_pPixSrcA, m_iStrideA, m_pPixSrcB, m_iStrideB), iSumSad);
}
+TEST_F (SadSatdCFuncTest, WelsSampleSad8x4_c) { // compares WelsSampleSad8x4_c with a plain nested-loop SAD over 8 columns x 4 rows
+ for (int i = 0; i < (m_iStrideA << 2); i++) // randomize 4 full rows of A, matching the 4 rows read below
+ m_pPixSrcA[i] = rand() % 256;
+ for (int i = 0; i < (m_iStrideB << 2); i++) // randomize 4 full rows of B
+ m_pPixSrcB[i] = rand() % 256;
+ uint8_t* pPixA = m_pPixSrcA;
+ uint8_t* pPixB = m_pPixSrcB;
+
+ int32_t iSumSad = 0; // reference value accumulated by the loops below
+ for (int i = 0; i < 4; i++) { // 4 rows
+ for (int j = 0; j < 8; j++) // 8 columns per row
+ iSumSad += abs (pPixA[j] - pPixB[j]);
+ pPixA += m_iStrideA; // step each buffer by its own stride
+ pPixB += m_iStrideB;
+ }
+ EXPECT_EQ (WelsSampleSad8x4_c (m_pPixSrcA, m_iStrideA, m_pPixSrcB, m_iStrideB), iSumSad);
+}
+
+TEST_F (SadSatdCFuncTest, WelsSampleSad4x8_c) { // compares WelsSampleSad4x8_c with a plain nested-loop SAD over 4 columns x 8 rows
+ for (int i = 0; i < (m_iStrideA << 3); i++) // randomize 8 full rows of A: the block under test is 8 rows tall, so filling only 4 would leave rows 4..7 with stale data
+ m_pPixSrcA[i] = rand() % 256;
+ for (int i = 0; i < (m_iStrideB << 3); i++) // randomize 8 full rows of B for the same reason
+ m_pPixSrcB[i] = rand() % 256;
+ uint8_t* pPixA = m_pPixSrcA;
+ uint8_t* pPixB = m_pPixSrcB;
+
+ int32_t iSumSad = 0; // reference value accumulated by the loops below
+ for (int i = 0; i < 8; i++) { // 8 rows
+ for (int j = 0; j < 4; j++) // 4 columns per row
+ iSumSad += abs (pPixA[j] - pPixB[j]);
+ pPixA += m_iStrideA; // step each buffer by its own stride
+ pPixB += m_iStrideB;
+ }
+ EXPECT_EQ (WelsSampleSad4x8_c (m_pPixSrcA, m_iStrideA, m_pPixSrcB, m_iStrideB), iSumSad);
+}
+
TEST_F (SadSatdCFuncTest, WelsSampleSad8x8_c) {
for (int i = 0; i < (m_iStrideA << 3); i++)
m_pPixSrcA[i] = rand() % 256;
@@ -444,6 +480,51 @@
EXPECT_EQ (m_pSad[0] + m_pSad[1] + m_pSad[2] + m_pSad[3], iSumSad);
}
+TEST_F (SadSatdCFuncTest, WelsSampleSadFour8x4_c) { // checks that the four SADs written by WelsSampleSadFour8x4_c add up to a nested-loop reference total
+ for (int i = 0; i < (m_iStrideA << 3); i++) // randomize 8 full rows of A
+ m_pPixSrcA[i] = rand() % 256;
+ for (int i = 0; i < (m_iStrideB << 3); i++) // randomize 8 full rows of B
+ m_pPixSrcB[i] = rand() % 256;
+ uint8_t* pPixA = m_pPixSrcA;
+ uint8_t* pPixB = m_pPixSrcB + m_iStrideB; // start one row in, so the (j - m_iStrideB) neighbour below still indexes inside m_pPixSrcB
+
+ int32_t iSumSad = 0; // reference: total over all four shifted candidates
+ for (int i = 0; i < 4; i++) { // 4 rows
+ for (int j = 0; j < 8; j++) { // 8 columns per row
+ iSumSad += abs (pPixA[j] - pPixB[j - 1]); // candidate shifted back one byte
+ iSumSad += abs (pPixA[j] - pPixB[j + 1]); // candidate shifted forward one byte
+ iSumSad += abs (pPixA[j] - pPixB[j - m_iStrideB]); // candidate shifted back one stride
+ iSumSad += abs (pPixA[j] - pPixB[j + m_iStrideB]); // candidate shifted forward one stride
+ }
+ pPixA += m_iStrideA;
+ pPixB += m_iStrideB;
+ }
+ WelsSampleSadFour8x4_c (m_pPixSrcA, m_iStrideA, m_pPixSrcB + m_iStrideB, m_iStrideB, m_pSad);
+ EXPECT_EQ (m_pSad[0] + m_pSad[1] + m_pSad[2] + m_pSad[3], iSumSad); // only the total is compared, not each of the four entries individually
+}
+
+TEST_F (SadSatdCFuncTest, WelsSampleSadFour4x8_c) { // checks that the four SADs written by WelsSampleSadFour4x8_c add up to a nested-loop reference total
+ for (int i = 0; i < (m_iStrideA << 3); i++) // randomize 8 full rows of A (the 8 rows read below)
+ m_pPixSrcA[i] = rand() % 256;
+ for (int i = 0; i < (m_iStrideB * 10); i++) // randomize 10 full rows of B: the base starts at row 1 and the +/- one-stride candidates reach rows 0..9 (this test already reads those rows, so the buffer is assumed to hold them)
+ m_pPixSrcB[i] = rand() % 256;
+ uint8_t* pPixA = m_pPixSrcA;
+ uint8_t* pPixB = m_pPixSrcB + m_iStrideB; // start one row in, so the (j - m_iStrideB) neighbour below still indexes inside m_pPixSrcB
+
+ int32_t iSumSad = 0; // reference: total over all four shifted candidates
+ for (int i = 0; i < 8; i++) { // 8 rows
+ for (int j = 0; j < 4; j++) { // 4 columns per row
+ iSumSad += abs (pPixA[j] - pPixB[j - 1]); // candidate shifted back one byte
+ iSumSad += abs (pPixA[j] - pPixB[j + 1]); // candidate shifted forward one byte
+ iSumSad += abs (pPixA[j] - pPixB[j - m_iStrideB]); // candidate shifted back one stride
+ iSumSad += abs (pPixA[j] - pPixB[j + m_iStrideB]); // candidate shifted forward one stride
+ }
+ pPixA += m_iStrideA;
+ pPixB += m_iStrideB;
+ }
+ WelsSampleSadFour4x8_c (m_pPixSrcA, m_iStrideA, m_pPixSrcB + m_iStrideB, m_iStrideB, m_pSad);
+ EXPECT_EQ (m_pSad[0] + m_pSad[1] + m_pSad[2] + m_pSad[3], iSumSad); // only the total is compared, not each of the four entries individually
+}
class SadSatdAssemblyFuncTest : public testing::Test {
public: