ref: 51d8e00564922a0fb5d791aed6b39b3083606994
parent: e0cee02d775196bd753a67d98b2f7d710cd35b52
parent: ed133d4c3deaee9367f4fdef2a6f55e464866f34
author: HaiboZhu <haibozhu@cisco.com>
date: Mon Oct 26 05:02:51 EDT 2015
Merge pull request #2180 from saamas/cabac_encode_opt [Encoder] CABAC optimizations
--- a/codec/encoder/core/inc/set_mb_syn_cabac.h
+++ b/codec/encoder/core/inc/set_mb_syn_cabac.h
@@ -50,29 +50,33 @@
#define WELS_QP_MAX 51
+typedef uint64_t cabac_low_t; // Integer type used for SCabacCtx::m_uiLow.
+enum { CABAC_LOW_WIDTH = sizeof (cabac_low_t) / sizeof (uint8_t) * 8 }; // Size of cabac_low_t in uint8_t units times 8, i.e. its width in bits.
+
typedef struct TagStateCtx {
- uint8_t m_uiState;
- uint8_t m_uiValMps;
+ // Packed representation of state and MPS as state << 1 | MPS.
+ uint8_t m_uiStateMps;
+
+ uint8_t Mps() const { return m_uiStateMps & 1; } // MPS value: bit 0 of the packed byte.
+ uint8_t State() const { return m_uiStateMps >> 1; } // State index: every bit above bit 0.
+ void Set (uint8_t uiState, uint8_t uiMps) { m_uiStateMps = uiState * 2 + uiMps; } // Repacks both fields; uiMps must be 0 or 1, otherwise it would spill into the state bits.
} SStateCtx;
typedef struct TagCabacCtx {
- uint32_t m_uiLow;
+ cabac_low_t m_uiLow; // Low value of the coder; bits collect here and WelsCabacEncodeUpdateLowNontrivial_ writes them out several bytes at a time.
+ int32_t m_iLowBitCnt; // Bit count tracked for m_uiLow; grows by every shift applied to m_uiLow and is reset after bytes are written out.
+ int32_t m_iRenormCnt; // Left shift owed to m_uiLow by renormalization but not yet applied; cleared by WelsCabacEncodeUpdateLow_.
uint32_t m_uiRange;
SStateCtx m_sStateCtx[WELS_CONTEXT_COUNT];
uint8_t* m_pBufStart;
uint8_t* m_pBufEnd;
uint8_t* m_pBufCur;
- uint8_t m_iBitsOutstanding;
- uint32_t m_uData;
- uint32_t m_uiBitsUsed;
- uint32_t m_iFirstFlag;
- uint32_t m_uiBinCountsInNalUnits;
} SCabacCtx;
void WelsCabacContextInit (void* pCtx, SCabacCtx* pCbCtx, int32_t iModel);
void WelsCabacEncodeInit (SCabacCtx* pCbCtx, uint8_t* pBuf, uint8_t* pEnd);
-void WelsCabacEncodeDecision (SCabacCtx* pCbCtx, int32_t iCtx, uint32_t uiBin);
-void WelsCabacEncodeBypassOne (SCabacCtx* pCbCtx, uint32_t uiBin);
+inline void WelsCabacEncodeDecision (SCabacCtx* pCbCtx, int32_t iCtx, uint32_t uiBin);
+inline void WelsCabacEncodeBypassOne (SCabacCtx* pCbCtx, uint32_t uiBin);
void WelsCabacEncodeTerminate (SCabacCtx* pCbCtx, uint32_t uiBin);
void WelsCabacEncodeUeBypass (SCabacCtx* pCbCtx, int32_t iExpBits, uint32_t uiVal);
void WelsCabacEncodeFlush (SCabacCtx* pCbCtx);
@@ -80,6 +84,44 @@
int32_t WriteBlockResidualCabac (void* pEncCtx, int16_t* pCoffLevel, int32_t iEndIdx,
int32_t iCalRunLevelFlag,
int32_t iResidualProperty, int8_t iNC, SBitStringAux* pBs);
+
+
+// private functions used by public inline functions.
+void WelsCabacEncodeDecisionLps_ (SCabacCtx* pCbCtx, int32_t iCtx);
+void WelsCabacEncodeUpdateLowNontrivial_ (SCabacCtx* pCbCtx);
+inline void WelsCabacEncodeUpdateLow_ (SCabacCtx* pCbCtx) { // Applies the pending m_iRenormCnt left shift to m_uiLow.
+ if (pCbCtx->m_iLowBitCnt + pCbCtx->m_iRenormCnt < CABAC_LOW_WIDTH) { // Fast path: the shifted value still fits in cabac_low_t.
+ pCbCtx->m_iLowBitCnt += pCbCtx->m_iRenormCnt;
+ pCbCtx->m_uiLow <<= pCbCtx->m_iRenormCnt;
+ } else { // Otherwise the out-of-line helper applies the shift.
+ WelsCabacEncodeUpdateLowNontrivial_ (pCbCtx);
+ }
+ pCbCtx->m_iRenormCnt = 0; // The pending shift has been consumed on either path.
+}
+
+// inline function definitions.
+void WelsCabacEncodeDecision (SCabacCtx* pCbCtx, int32_t iCtx, uint32_t uiBin) { // Encodes uiBin using context model m_sStateCtx[iCtx].
+ if (uiBin == pCbCtx->m_sStateCtx[iCtx].Mps()) { // Bin equals the MPS: handled inline here.
+ const int32_t kiState = pCbCtx->m_sStateCtx[iCtx].State();
+ uint32_t uiRange = pCbCtx->m_uiRange;
+ uint32_t uiRangeLps = g_kuiCabacRangeLps[kiState][(uiRange & 0xff) >> 6]; // Column selected by bits 7..6 of the range.
+ uiRange -= uiRangeLps;
+
+ const int32_t kiRenormAmount = uiRange >> 8 ^ 1; // (uiRange >> 8) ^ 1: 1 when bit 8 is clear, else 0 (assumes uiRange < 512 -- TODO confirm).
+ pCbCtx->m_uiRange = uiRange << kiRenormAmount;
+ pCbCtx->m_iRenormCnt += kiRenormAmount; // m_uiLow is not shifted here; the shift is only recorded.
+ pCbCtx->m_sStateCtx[iCtx].Set (g_kuiStateTransTable[kiState][1], uiBin); // uiBin equals the MPS on this path, so the stored MPS is unchanged.
+ } else { // Bin differs from the MPS: out-of-line LPS path.
+ WelsCabacEncodeDecisionLps_ (pCbCtx, iCtx);
+ }
+}
+
+void WelsCabacEncodeBypassOne (SCabacCtx* pCbCtx, uint32_t uiBin) { // Encodes one bin without a context model; m_uiRange is left unchanged.
+ const uint32_t kuiBinBitmask = -uiBin; // Unsigned negation: all ones for uiBin == 1, zero for uiBin == 0.
+ pCbCtx->m_iRenormCnt++; // This bin owes m_uiLow one more left shift...
+ WelsCabacEncodeUpdateLow_ (pCbCtx); // ...which is applied now, together with any shift still pending.
+ pCbCtx->m_uiLow += kuiBinBitmask & pCbCtx->m_uiRange; // Adds the range only when the mask is non-zero.
+}
}
#endif
--- a/codec/encoder/core/src/set_mb_syn_cabac.cpp
+++ b/codec/encoder/core/src/set_mb_syn_cabac.cpp
@@ -42,10 +42,25 @@
#include "macros.h"
#include "set_mb_syn_cabac.h"
#include "encoder.h"
+#include "golomb_common.h"
-namespace WelsEnc {
+namespace {
+const int8_t g_kiClz5Table[32] = { // Entry v is the number of leading zero bits of v viewed as a 5-bit value, plus 1.
+ 6, 5, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+};
+void PropagateCarry (uint8_t* pBufCur, uint8_t* pBufStart) { // Adds 1 to the byte just before pBufCur, rippling towards pBufStart.
+ for (; pBufCur > pBufStart; --pBufCur) // Walks backwards and never touches bytes before pBufStart.
+ if (++*(pBufCur - 1)) // Non-zero after the increment means the byte did not wrap, so the carry stops here.
+ break;
+}
+
+} // anon ns.
+
+namespace WelsEnc {
+
void WelsCabacInit (void* pCtx) {
sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
for (int32_t iModel = 0; iModel < 4; iModel++) {
@@ -63,8 +78,7 @@
uiStateIdx = iPreCtxState - 64;
uiValMps = 1;
}
- pEncCtx->sWelsCabacContexts[iModel][iQp][iIdx].m_uiState = uiStateIdx;
- pEncCtx->sWelsCabacContexts[iModel][iQp][iIdx].m_uiValMps = uiValMps;
+ pEncCtx->sWelsCabacContexts[iModel][iQp][iIdx].Set (uiStateIdx, uiValMps);
}
}
}
@@ -79,121 +93,76 @@
void WelsCabacEncodeInit (SCabacCtx* pCbCtx, uint8_t* pBuf, uint8_t* pEnd) {
pCbCtx->m_uiLow = 0;
+ pCbCtx->m_iLowBitCnt = 9; // Starting bit count for m_uiLow. NOTE(review): presumably tied to the 9-bit range value below -- confirm.
+ pCbCtx->m_iRenormCnt = 0; // No shift of m_uiLow is pending yet.
pCbCtx->m_uiRange = 510;
- pCbCtx->m_iBitsOutstanding = 0;
- pCbCtx->m_uData = 0;
- pCbCtx->m_uiBitsUsed = 0;
- pCbCtx->m_iFirstFlag = 1;
pCbCtx->m_pBufStart = pBuf;
pCbCtx->m_pBufEnd = pEnd;
pCbCtx->m_pBufCur = pBuf;
- pCbCtx->m_uiBinCountsInNalUnits = 0;
}
-void WelsCabacPutBit (SCabacCtx* pCbCtx, uint32_t iValue) {
- if (pCbCtx->m_iFirstFlag != 0) {
- pCbCtx->m_iFirstFlag = 0;
- } else {
- pCbCtx->m_uData = (pCbCtx->m_uData << 1) | iValue;
- pCbCtx->m_uiBitsUsed++;
- }
- if (pCbCtx->m_iBitsOutstanding == 0) {
- while (pCbCtx->m_uiBitsUsed >= 8) {
- pCbCtx->m_uiBitsUsed -= 8;
- uint32_t uiByte = pCbCtx->m_uData >> (pCbCtx->m_uiBitsUsed);
- if (pCbCtx->m_uiBitsUsed == 0)
- pCbCtx->m_uData = 0;
- else
- pCbCtx->m_uData &= (uint32_t) ((0xFFFFFFFF) >> (32 - pCbCtx->m_uiBitsUsed));
- *pCbCtx->m_pBufCur ++ = uiByte;
- }
- } else {
+void WelsCabacEncodeUpdateLowNontrivial_ (SCabacCtx* pCbCtx) { // Slow path of WelsCabacEncodeUpdateLow_: writes bytes of m_uiLow to the buffer, then applies the pending shift.
+ int32_t iLowBitCnt = pCbCtx->m_iLowBitCnt;
+ int32_t iRenormCnt = pCbCtx->m_iRenormCnt;
+ cabac_low_t uiLow = pCbCtx->m_uiLow;
- while (pCbCtx->m_iBitsOutstanding > 0) {
- pCbCtx->m_uData = (pCbCtx->m_uData << 1) | (1 - iValue);
- pCbCtx->m_iBitsOutstanding--;
- pCbCtx->m_uiBitsUsed++;
- while (pCbCtx->m_uiBitsUsed >= 8) {
- pCbCtx->m_uiBitsUsed -= 8;
- uint32_t uiByte = pCbCtx->m_uData >> (pCbCtx->m_uiBitsUsed);
- if (pCbCtx->m_uiBitsUsed == 0)
- pCbCtx->m_uData = 0;
- else
- pCbCtx->m_uData &= (uint32_t) ((0xFFFFFFFF) >> (32 - pCbCtx->m_uiBitsUsed));
- *pCbCtx->m_pBufCur ++ = uiByte;
- }
+ do { // Each pass writes out all but the low 15 bits. NOTE(review): nothing here checks pBufCur against m_pBufEnd.
+ uint8_t* pBufCur = pCbCtx->m_pBufCur;
+ const int32_t kiInc = CABAC_LOW_WIDTH - 1 - iLowBitCnt; // Shift that moves bit iLowBitCnt of uiLow up to the top bit.
+
+ uiLow <<= kiInc;
+ if (uiLow & cabac_low_t (1) << (CABAC_LOW_WIDTH - 1)) // Top bit set: carry into the bytes already written.
+ PropagateCarry (pBufCur, pCbCtx->m_pBufStart);
+
+ if (CABAC_LOW_WIDTH > 32) { // Wide cabac_low_t only: four extra bytes are emitted first.
+ WRITE_BE_32 (pBufCur, uiLow >> 31); // Passes bits 62..31; the macro is defined elsewhere, presumably a big-endian 32-bit store.
+ pBufCur += 4;
}
- }
-}
-void WelsCabacEncodeRenorm (SCabacCtx* pCbCtx) {
- while (pCbCtx->m_uiRange < 256) {
- if (pCbCtx->m_uiLow < 256) {
- WelsCabacPutBit (pCbCtx, 0);
- } else {
- if (pCbCtx->m_uiLow >= 512) {
- pCbCtx->m_uiLow -= 512;
- WelsCabacPutBit (pCbCtx, 1);
- } else {
- pCbCtx->m_uiLow -= 256;
- pCbCtx->m_iBitsOutstanding++;
- }
- }
- pCbCtx->m_uiRange <<= 1;
- pCbCtx->m_uiLow <<= 1;
- }
+ *pBufCur++ = uiLow >> 23; // Stores bits 30..23 (truncated to one byte).
+ *pBufCur++ = uiLow >> 15; // Stores bits 22..15.
+ iRenormCnt -= kiInc; // kiInc of the pending shift has now been applied.
+ iLowBitCnt = 15; // Only the low 15 bits remain in uiLow.
+ uiLow &= (1u << iLowBitCnt) - 1;
+ pCbCtx->m_pBufCur = pBufCur;
+ } while (iLowBitCnt + iRenormCnt > CABAC_LOW_WIDTH - 1); // Repeat while the remaining shift still would not fit.
+
+ pCbCtx->m_iLowBitCnt = iLowBitCnt + iRenormCnt; // Apply what is left of the shift and store the state back.
+ pCbCtx->m_uiLow = uiLow << iRenormCnt;
}
-void WelsCabacEncodeDecision (SCabacCtx* pCbCtx, int32_t iCtx, uint32_t uiBin) {
- uint8_t uiState = pCbCtx->m_sStateCtx[iCtx].m_uiState;
- uint8_t uiValMps = pCbCtx->m_sStateCtx[iCtx].m_uiValMps;
- uint32_t uiRangeLps = g_kuiCabacRangeLps[uiState][ (pCbCtx->m_uiRange >> 6) & 3];
- pCbCtx->m_uiRange -= uiRangeLps;
- if (uiBin != uiValMps) { //LPS
- pCbCtx->m_uiLow += pCbCtx->m_uiRange;
- pCbCtx->m_uiRange = uiRangeLps;
- if (uiState == 0)
- uiValMps = 1 - uiValMps;
- pCbCtx->m_sStateCtx[iCtx].m_uiState = g_kuiStateTransTable[uiState][0];
- pCbCtx->m_sStateCtx[iCtx].m_uiValMps = uiValMps;
- } else {
- pCbCtx->m_sStateCtx[iCtx].m_uiState = g_kuiStateTransTable[uiState][1];
- }
- WelsCabacEncodeRenorm (pCbCtx);
- pCbCtx->m_uiBinCountsInNalUnits++;
-}
+void WelsCabacEncodeDecisionLps_ (SCabacCtx* pCbCtx, int32_t iCtx) { // LPS path of WelsCabacEncodeDecision: the bin differs from the context's MPS.
+ const int32_t kiState = pCbCtx->m_sStateCtx[iCtx].State();
+ uint32_t uiRange = pCbCtx->m_uiRange;
+ uint32_t uiRangeLps = g_kuiCabacRangeLps[kiState][(uiRange & 0xff) >> 6]; // Column selected by bits 7..6 of the range.
+ uiRange -= uiRangeLps; // uiRange is now the part of the range left below the LPS sub-range.
+ pCbCtx->m_sStateCtx[iCtx].Set (g_kuiStateTransTable[kiState][0],
+ pCbCtx->m_sStateCtx[iCtx].Mps() ^ (kiState == 0)); // The MPS flips only when the state index is 0.
-void WelsCabacEncodeBypassOne (SCabacCtx* pCbCtx, uint32_t uiBin) {
- pCbCtx->m_uiLow <<= 1;
- if (uiBin) {
- pCbCtx->m_uiLow += pCbCtx->m_uiRange;
- }
- if (pCbCtx->m_uiLow >= 1024) {
- WelsCabacPutBit (pCbCtx, 1);
- pCbCtx->m_uiLow -= 1024;
- } else {
- if (pCbCtx->m_uiLow < 512)
- WelsCabacPutBit (pCbCtx, 0);
- else {
- pCbCtx->m_uiLow -= 512;
- pCbCtx->m_iBitsOutstanding++;
- }
- }
- pCbCtx->m_uiBinCountsInNalUnits++;
+ WelsCabacEncodeUpdateLow_ (pCbCtx); // Apply the shift pending from earlier bins before adding to m_uiLow.
+ pCbCtx->m_uiLow += uiRange;
+
+ const int32_t kiRenormAmount = g_kiClz5Table[uiRangeLps >> 3]; // Table lookup on bits 7..3; the index stays in range only if uiRangeLps < 256.
+ pCbCtx->m_uiRange = uiRangeLps << kiRenormAmount;
+ pCbCtx->m_iRenormCnt = kiRenormAmount; // Assigned, not added: the update above already reset the pending count to 0.
}
+
void WelsCabacEncodeTerminate (SCabacCtx* pCbCtx, uint32_t uiBin) {
pCbCtx->m_uiRange -= 2;
if (uiBin) {
+ WelsCabacEncodeUpdateLow_ (pCbCtx); // Apply the shift pending from earlier bins before adding to m_uiLow.
pCbCtx->m_uiLow += pCbCtx->m_uiRange;
- pCbCtx->m_uiRange = 2;
- WelsCabacEncodeRenorm (pCbCtx);
- WelsCabacPutBit (pCbCtx, ((pCbCtx->m_uiLow >> 9) & 1));
- int32_t iLastTwoBits = (((pCbCtx->m_uiLow >> 7) & 3) | 1);
- pCbCtx->m_uData = (pCbCtx->m_uData << 2) | iLastTwoBits;
- pCbCtx->m_uiBitsUsed += 2;
+
+ const int32_t kiRenormAmount = 7; // The range becomes 2 << 7 == 256.
+ pCbCtx->m_uiRange = 2 << kiRenormAmount;
+ pCbCtx->m_iRenormCnt = kiRenormAmount;
+
+ WelsCabacEncodeUpdateLow_ (pCbCtx); // Applies the 7-bit shift to m_uiLow immediately.
+ pCbCtx->m_uiLow |= 0x80; // Sets bit 7 of the shifted low value. NOTE(review): presumably the trailing stop bit -- confirm.
} else {
- WelsCabacEncodeRenorm (pCbCtx);
+ const int32_t kiRenormAmount = pCbCtx->m_uiRange >> 8 ^ 1; // (range >> 8) ^ 1: 1 when bit 8 is clear, else 0 (assumes the range is below 512).
+ pCbCtx->m_uiRange = pCbCtx->m_uiRange << kiRenormAmount;
+ pCbCtx->m_iRenormCnt += kiRenormAmount; // Recorded only; m_uiLow is shifted on a later update.
}
- pCbCtx->m_uiBinCountsInNalUnits++;
}
void WelsCabacEncodeUeBypass (SCabacCtx* pCbCtx, int32_t iExpBits, uint32_t uiVal) {
int32_t iSufS = uiVal;
@@ -215,22 +184,18 @@
void WelsCabacEncodeFlush (SCabacCtx* pCbCtx) {
WelsCabacEncodeTerminate (pCbCtx, 1);
- while (pCbCtx->m_uiBitsUsed > 0) {
- if (pCbCtx->m_uiBitsUsed > 8) {
- pCbCtx->m_uiBitsUsed -= 8;
- uint32_t uiByte = pCbCtx->m_uData >> (pCbCtx->m_uiBitsUsed);
- pCbCtx->m_uData &= (uint32_t) ((0xFFFFFFFF) >> (32 - pCbCtx->m_uiBitsUsed));
- *pCbCtx->m_pBufCur ++ = uiByte;
- } else {
- if (pCbCtx->m_uiBitsUsed == 8) {
- *pCbCtx->m_pBufCur ++ = pCbCtx->m_uData & 0xff;
- } else {
- *pCbCtx->m_pBufCur ++ = (pCbCtx->m_uData << (8 - pCbCtx->m_uiBitsUsed));
- }
- pCbCtx->m_uiBitsUsed = 0;
- }
- }
+ cabac_low_t uiLow = pCbCtx->m_uiLow; // Work on local copies; only m_pBufCur is stored back at the end.
+ int32_t iLowBitCnt = pCbCtx->m_iLowBitCnt;
+ uint8_t* pBufCur = pCbCtx->m_pBufCur;
+
+ uiLow <<= CABAC_LOW_WIDTH - 1 - iLowBitCnt; // Left-align: bit iLowBitCnt moves up to the top bit.
+ if (uiLow & cabac_low_t (1) << (CABAC_LOW_WIDTH - 1)) // Top bit set: carry into the bytes already written.
+ PropagateCarry (pBufCur, pCbCtx->m_pBufStart);
+ for (; (iLowBitCnt -= 8) >= 0; uiLow <<= 8) // One byte per 8 whole bits held; a remainder of fewer than 8 bits is not written.
+ *pBufCur++ = uiLow >> (CABAC_LOW_WIDTH - 9); // Stores the 8 bits just below the top bit.
+
+ pCbCtx->m_pBufCur = pBufCur;
}
uint8_t* WelsCabacEncodeGetPtr (SCabacCtx* pCbCtx) {
--- a/codec/encoder/core/src/svc_set_mb_syn_cabac.cpp
+++ b/codec/encoder/core/src/svc_set_mb_syn_cabac.cpp
@@ -41,8 +41,10 @@
#include "set_mb_syn_cabac.h"
#include "svc_enc_golomb.h"
-namespace WelsEnc {
+using namespace WelsEnc;
+namespace {
+
static const uint16_t uiSignificantCoeffFlagOffset[5] = {0, 15, 29, 44, 47};
static const uint16_t uiLastCoeffFlagOffset[5] = {0, 15, 29, 44, 47};
static const uint16_t uiCoeffAbsLevelMinus1Offset[5] = {0, 10, 20, 30, 39};
@@ -455,21 +457,17 @@
ECtxBlockCat eCtxBlockCat, int16_t iIdx, int16_t iNonZeroCount, int16_t* pBlock, int16_t iEndIdx) {
int32_t iCtx = WelsGetMbCtxCabac (pMbCache, pCurMb, iMbWidth, eCtxBlockCat, iIdx);
if (iNonZeroCount) {
- ENFORCE_STACK_ALIGN_1D (int16_t, iAbsLevel, 16, 16);
- ENFORCE_STACK_ALIGN_1D (int16_t, iSignLevel, 16, 16);
+ int16_t iLevel[16];
const int32_t iCtxSig = 105 + uiSignificantCoeffFlagOffset[eCtxBlockCat];
const int32_t iCtxLast = 166 + uiLastCoeffFlagOffset[eCtxBlockCat];
const int32_t iCtxLevel = 227 + uiCoeffAbsLevelMinus1Offset[eCtxBlockCat];
int32_t iNonZeroIdx = 0;
int32_t i = 0;
- int32_t iNumAbsLevelEq1 = 0;
- int32_t iNumAbsLevelGt1 = 0;
WelsCabacEncodeDecision (pCabacCtx, iCtx, 1);
while (1) {
if (pBlock[i]) {
- iSignLevel[iNonZeroIdx] = pBlock[i] < 0;
- iAbsLevel[iNonZeroIdx] = WELS_ABS (pBlock[i]) - 1;
+ iLevel[iNonZeroIdx] = pBlock[i];
iNonZeroIdx++;
WelsCabacEncodeDecision (pCabacCtx, iCtxSig + i, 1);
@@ -483,33 +481,38 @@
WelsCabacEncodeDecision (pCabacCtx, iCtxSig + i, 0);
i++;
if (i == iEndIdx) {
- iSignLevel[iNonZeroIdx] = pBlock[i] < 0;
- iAbsLevel[iNonZeroIdx] = WELS_ABS (pBlock[i]) - 1;
+ iLevel[iNonZeroIdx] = pBlock[i];
iNonZeroIdx++;
break;
}
}
+
+ int32_t iNumAbsLevelGt1 = 0;
+ int32_t iCtx1 = iCtxLevel + 1;
+
do {
int32_t iPrefix = 0;
iNonZeroIdx--;
- iPrefix = WELS_MIN (iAbsLevel[iNonZeroIdx], 14);
+ iPrefix = WELS_ABS (iLevel[iNonZeroIdx]) - 1;
if (iPrefix) {
- iCtx = iCtxLevel + ((iNumAbsLevelGt1 != 0) ? 0 : WELS_MIN (4, 1 + iNumAbsLevelEq1));
+ iPrefix = WELS_MIN (iPrefix, 14);
+ iCtx = WELS_MIN (iCtxLevel + 4, iCtx1);
WelsCabacEncodeDecision (pCabacCtx, iCtx, 1);
- iCtx = iCtxLevel + 5 + WELS_MIN (4 - (eCtxBlockCat == CHROMA_DC), iNumAbsLevelGt1);
- for (i = 0; i < iPrefix - 1; i++)
+ iNumAbsLevelGt1++;
+ iCtx = iCtxLevel + 4 + WELS_MIN (5 - (eCtxBlockCat == CHROMA_DC), iNumAbsLevelGt1);
+ for (i = 1; i < iPrefix; i++)
WelsCabacEncodeDecision (pCabacCtx, iCtx, 1);
- if (iPrefix < 14)
+ if (WELS_ABS (iLevel[iNonZeroIdx]) < 15)
WelsCabacEncodeDecision (pCabacCtx, iCtx, 0);
else
- WelsCabacEncodeUeBypass (pCabacCtx, 0, iAbsLevel[iNonZeroIdx] - 14);
- iNumAbsLevelGt1++;
+ WelsCabacEncodeUeBypass (pCabacCtx, 0, WELS_ABS (iLevel[iNonZeroIdx]) - 15);
+ iCtx1 = iCtxLevel;
} else {
- iCtx = iCtxLevel + ((iNumAbsLevelGt1 != 0) ? 0 : WELS_MIN (4, 1 + iNumAbsLevelEq1));
+ iCtx = WELS_MIN (iCtxLevel + 4, iCtx1);
WelsCabacEncodeDecision (pCabacCtx, iCtx, 0);
- iNumAbsLevelEq1++;
+ iCtx1 += iNumAbsLevelGt1 == 0;
}
- WelsCabacEncodeBypassOne (pCabacCtx, iSignLevel[iNonZeroIdx]);
+ WelsCabacEncodeBypassOne (pCabacCtx, iLevel[iNonZeroIdx] < 0);
} while (iNonZeroIdx > 0);
} else {
@@ -519,12 +522,10 @@
}
int32_t WelsCalNonZeroCount2x2Block (int16_t* pBlock) {
- int32_t iCount = 0;
- for (int16_t i = 0; i < 4; i++) {
- if (pBlock[i])
- iCount++;
- }
- return iCount;
+ return (pBlock[0] != 0) // Each comparison yields 0 or 1, so the sum counts the non-zero entries among pBlock[0..3].
+ + (pBlock[1] != 0)
+ + (pBlock[2] != 0)
+ + (pBlock[3] != 0);
}
int32_t WelsWriteMbResidualCabac (SWelsFuncPtrList* pFuncList, SSlice* pSlice, SMbCache* sMbCacheInfo, SMB* pCurMb,
SCabacCtx* pCabacCtx,
@@ -617,6 +618,10 @@
}
return 0;
}
+
+} // anon ns.
+
+namespace WelsEnc {
void WelsInitSliceCabac (sWelsEncCtx* pEncCtx, SSlice* pSlice) {
/* alignment needed */