shithub: openh264

Download patch

ref: ed133d4c3deaee9367f4fdef2a6f55e464866f34
parent: b700b67bbad51bf326ace75954e056e0c8a6c5f4
author: Sindre Aamås <saamas@cisco.com>
date: Mon Oct 12 13:59:08 EDT 2015

[Encoder] CABAC optimizations

~2.4x speedup (time attributed to all CABAC-related functions) on x86
(Ivy Bridge) with GCC version 4.9.2 (Debian 4.9.2-10).

~1.3x faster encode overall on a quick 720p30 6Mbps test.

Reviewed at https://rbcommons.com/s/OpenH264/r/1347/

--- a/codec/encoder/core/inc/set_mb_syn_cabac.h
+++ b/codec/encoder/core/inc/set_mb_syn_cabac.h
@@ -50,29 +50,33 @@
 
 #define  WELS_QP_MAX    51
 
+typedef uint64_t cabac_low_t;
+enum { CABAC_LOW_WIDTH = sizeof (cabac_low_t) / sizeof (uint8_t) * 8 };
+
 typedef struct TagStateCtx {
-  uint8_t   m_uiState;
-  uint8_t   m_uiValMps;
+  // Packed representation of state and MPS as state << 1 | MPS.
+  uint8_t   m_uiStateMps;
+
+  uint8_t Mps()   const { return m_uiStateMps  & 1; }
+  uint8_t State() const { return m_uiStateMps >> 1; }
+  void Set (uint8_t uiState, uint8_t uiMps) { m_uiStateMps = uiState * 2 + uiMps; }
 } SStateCtx;
 typedef struct TagCabacCtx {
-  uint32_t  m_uiLow;
+  cabac_low_t m_uiLow;
+  int32_t   m_iLowBitCnt;
+  int32_t   m_iRenormCnt;
   uint32_t  m_uiRange;
   SStateCtx   m_sStateCtx[WELS_CONTEXT_COUNT];
   uint8_t*   m_pBufStart;
   uint8_t*   m_pBufEnd;
   uint8_t*   m_pBufCur;
-  uint8_t  m_iBitsOutstanding;
-  uint32_t  m_uData;
-  uint32_t  m_uiBitsUsed;
-  uint32_t  m_iFirstFlag;
-  uint32_t  m_uiBinCountsInNalUnits;
 } SCabacCtx;
 
 
 void WelsCabacContextInit (void* pCtx, SCabacCtx* pCbCtx, int32_t iModel);
 void WelsCabacEncodeInit (SCabacCtx* pCbCtx, uint8_t* pBuf,  uint8_t* pEnd);
-void WelsCabacEncodeDecision (SCabacCtx* pCbCtx, int32_t iCtx, uint32_t uiBin);
-void WelsCabacEncodeBypassOne (SCabacCtx* pCbCtx, uint32_t uiBin);
+inline void WelsCabacEncodeDecision (SCabacCtx* pCbCtx, int32_t iCtx, uint32_t uiBin);
+inline void WelsCabacEncodeBypassOne (SCabacCtx* pCbCtx, uint32_t uiBin);
 void WelsCabacEncodeTerminate (SCabacCtx* pCbCtx, uint32_t uiBin);
 void WelsCabacEncodeUeBypass (SCabacCtx* pCbCtx, int32_t iExpBits, uint32_t uiVal);
 void WelsCabacEncodeFlush (SCabacCtx* pCbCtx);
@@ -80,6 +84,44 @@
 int32_t  WriteBlockResidualCabac (void* pEncCtx,  int16_t* pCoffLevel, int32_t iEndIdx,
                                   int32_t iCalRunLevelFlag,
                                   int32_t iResidualProperty, int8_t iNC, SBitStringAux* pBs);
+
+
+// private functions used by public inline functions.
+void WelsCabacEncodeDecisionLps_ (SCabacCtx* pCbCtx, int32_t iCtx);
+void WelsCabacEncodeUpdateLowNontrivial_ (SCabacCtx* pCbCtx);
+inline void WelsCabacEncodeUpdateLow_ (SCabacCtx* pCbCtx) {
+  if (pCbCtx->m_iLowBitCnt + pCbCtx->m_iRenormCnt < CABAC_LOW_WIDTH) {
+    pCbCtx->m_iLowBitCnt  += pCbCtx->m_iRenormCnt;
+    pCbCtx->m_uiLow      <<= pCbCtx->m_iRenormCnt;
+  } else {
+    WelsCabacEncodeUpdateLowNontrivial_ (pCbCtx);
+  }
+  pCbCtx->m_iRenormCnt = 0;
+}
+
+// inline function definitions.
+void WelsCabacEncodeDecision (SCabacCtx* pCbCtx, int32_t iCtx, uint32_t uiBin) {
+  if (uiBin == pCbCtx->m_sStateCtx[iCtx].Mps()) {
+    const int32_t kiState = pCbCtx->m_sStateCtx[iCtx].State();
+    uint32_t uiRange = pCbCtx->m_uiRange;
+    uint32_t uiRangeLps = g_kuiCabacRangeLps[kiState][(uiRange & 0xff) >> 6];
+    uiRange -= uiRangeLps;
+
+    const int32_t kiRenormAmount = uiRange >> 8 ^ 1;
+    pCbCtx->m_uiRange = uiRange << kiRenormAmount;
+    pCbCtx->m_iRenormCnt += kiRenormAmount;
+    pCbCtx->m_sStateCtx[iCtx].Set (g_kuiStateTransTable[kiState][1], uiBin);
+  } else {
+    WelsCabacEncodeDecisionLps_ (pCbCtx, iCtx);
+  }
+}
+
+void WelsCabacEncodeBypassOne (SCabacCtx* pCbCtx, uint32_t uiBin) {
+  const uint32_t kuiBinBitmask = -uiBin;
+  pCbCtx->m_iRenormCnt++;
+  WelsCabacEncodeUpdateLow_ (pCbCtx);
+  pCbCtx->m_uiLow += kuiBinBitmask & pCbCtx->m_uiRange;
+}
 
 }
 #endif
--- a/codec/encoder/core/src/set_mb_syn_cabac.cpp
+++ b/codec/encoder/core/src/set_mb_syn_cabac.cpp
@@ -42,10 +42,25 @@
 #include "macros.h"
 #include "set_mb_syn_cabac.h"
 #include "encoder.h"
+#include "golomb_common.h"
 
-namespace WelsEnc {
+namespace {
 
+const int8_t g_kiClz5Table[32] = {
+  6, 5, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+};
 
+void PropagateCarry (uint8_t* pBufCur, uint8_t* pBufStart) {
+  for (; pBufCur > pBufStart; --pBufCur)
+    if (++*(pBufCur - 1))
+      break;
+}
+
+} // anon ns.
+
+namespace WelsEnc {
+
 void WelsCabacInit (void* pCtx) {
   sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
   for (int32_t iModel = 0; iModel < 4; iModel++) {
@@ -63,8 +78,7 @@
           uiStateIdx = iPreCtxState - 64;
           uiValMps = 1;
         }
-        pEncCtx->sWelsCabacContexts[iModel][iQp][iIdx].m_uiState = uiStateIdx;
-        pEncCtx->sWelsCabacContexts[iModel][iQp][iIdx].m_uiValMps = uiValMps;
+        pEncCtx->sWelsCabacContexts[iModel][iQp][iIdx].Set (uiStateIdx, uiValMps);
       }
   }
 }
@@ -79,121 +93,76 @@
 
 void  WelsCabacEncodeInit (SCabacCtx* pCbCtx, uint8_t* pBuf,  uint8_t* pEnd) {
   pCbCtx->m_uiLow     = 0;
+  pCbCtx->m_iLowBitCnt = 9;
+  pCbCtx->m_iRenormCnt = 0;
   pCbCtx->m_uiRange   = 510;
-  pCbCtx->m_iBitsOutstanding = 0;
-  pCbCtx->m_uData = 0;
-  pCbCtx->m_uiBitsUsed = 0;
-  pCbCtx->m_iFirstFlag = 1;
   pCbCtx->m_pBufStart = pBuf;
   pCbCtx->m_pBufEnd = pEnd;
   pCbCtx->m_pBufCur = pBuf;
-  pCbCtx->m_uiBinCountsInNalUnits = 0;
 }
 
-void WelsCabacPutBit (SCabacCtx* pCbCtx, uint32_t iValue) {
-  if (pCbCtx->m_iFirstFlag != 0) {
-    pCbCtx->m_iFirstFlag = 0;
-  } else {
-    pCbCtx->m_uData = (pCbCtx->m_uData << 1) | iValue;
-    pCbCtx->m_uiBitsUsed++;
-  }
-  if (pCbCtx->m_iBitsOutstanding == 0) {
-    while (pCbCtx->m_uiBitsUsed >= 8) {
-      pCbCtx->m_uiBitsUsed -= 8;
-      uint32_t uiByte = pCbCtx->m_uData >> (pCbCtx->m_uiBitsUsed);
-      if (pCbCtx->m_uiBitsUsed == 0)
-        pCbCtx->m_uData = 0;
-      else
-        pCbCtx->m_uData &= (uint32_t) ((0xFFFFFFFF) >> (32 - pCbCtx->m_uiBitsUsed));
-      *pCbCtx->m_pBufCur ++ = uiByte;
-    }
-  } else {
+void WelsCabacEncodeUpdateLowNontrivial_ (SCabacCtx* pCbCtx) {
+  int32_t iLowBitCnt = pCbCtx->m_iLowBitCnt;
+  int32_t iRenormCnt = pCbCtx->m_iRenormCnt;
+  cabac_low_t uiLow = pCbCtx->m_uiLow;
 
-    while (pCbCtx->m_iBitsOutstanding > 0) {
-      pCbCtx->m_uData = (pCbCtx->m_uData << 1) | (1 - iValue);
-      pCbCtx->m_iBitsOutstanding--;
-      pCbCtx->m_uiBitsUsed++;
-      while (pCbCtx->m_uiBitsUsed >= 8) {
-        pCbCtx->m_uiBitsUsed -= 8;
-        uint32_t uiByte = pCbCtx->m_uData >> (pCbCtx->m_uiBitsUsed);
-        if (pCbCtx->m_uiBitsUsed == 0)
-          pCbCtx->m_uData = 0;
-        else
-          pCbCtx->m_uData &= (uint32_t) ((0xFFFFFFFF) >> (32 - pCbCtx->m_uiBitsUsed));
-        *pCbCtx->m_pBufCur ++ = uiByte;
-      }
+  do {
+    uint8_t* pBufCur = pCbCtx->m_pBufCur;
+    const int32_t kiInc = CABAC_LOW_WIDTH - 1 - iLowBitCnt;
+
+    uiLow <<= kiInc;
+    if (uiLow & cabac_low_t (1) << (CABAC_LOW_WIDTH - 1))
+      PropagateCarry (pBufCur, pCbCtx->m_pBufStart);
+
+    if (CABAC_LOW_WIDTH > 32) {
+      WRITE_BE_32 (pBufCur, uiLow >> 31);
+      pBufCur += 4;
     }
-  }
-}
-void WelsCabacEncodeRenorm (SCabacCtx* pCbCtx) {
-  while (pCbCtx->m_uiRange < 256) {
-    if (pCbCtx->m_uiLow < 256) {
-      WelsCabacPutBit (pCbCtx, 0);
-    } else {
-      if (pCbCtx->m_uiLow >= 512) {
-        pCbCtx->m_uiLow -= 512;
-        WelsCabacPutBit (pCbCtx, 1);
-      } else {
-        pCbCtx->m_uiLow -= 256;
-        pCbCtx->m_iBitsOutstanding++;
-      }
-    }
-    pCbCtx->m_uiRange <<= 1;
-    pCbCtx->m_uiLow <<= 1;
-  }
+    *pBufCur++ = uiLow >> 23;
+    *pBufCur++ = uiLow >> 15;
+    iRenormCnt -= kiInc;
+    iLowBitCnt = 15;
+    uiLow &= (1u << iLowBitCnt) - 1;
+    pCbCtx->m_pBufCur = pBufCur;
+  } while (iLowBitCnt + iRenormCnt > CABAC_LOW_WIDTH - 1);
+
+  pCbCtx->m_iLowBitCnt = iLowBitCnt + iRenormCnt;
+  pCbCtx->m_uiLow = uiLow << iRenormCnt;
 }
-void WelsCabacEncodeDecision (SCabacCtx* pCbCtx, int32_t iCtx, uint32_t uiBin) {
-  uint8_t uiState = pCbCtx->m_sStateCtx[iCtx].m_uiState;
-  uint8_t uiValMps = pCbCtx->m_sStateCtx[iCtx].m_uiValMps;
-  uint32_t uiRangeLps = g_kuiCabacRangeLps[uiState][ (pCbCtx->m_uiRange >> 6) & 3];
 
-  pCbCtx->m_uiRange -= uiRangeLps;
-  if (uiBin != uiValMps) { //LPS
-    pCbCtx->m_uiLow += pCbCtx->m_uiRange;
-    pCbCtx->m_uiRange = uiRangeLps;
-    if (uiState == 0)
-      uiValMps = 1 - uiValMps;
-    pCbCtx->m_sStateCtx[iCtx].m_uiState = g_kuiStateTransTable[uiState][0];
-    pCbCtx->m_sStateCtx[iCtx].m_uiValMps = uiValMps;
-  } else {
-    pCbCtx->m_sStateCtx[iCtx].m_uiState = g_kuiStateTransTable[uiState][1];
-  }
-  WelsCabacEncodeRenorm (pCbCtx);
-  pCbCtx->m_uiBinCountsInNalUnits++;
-}
+void WelsCabacEncodeDecisionLps_ (SCabacCtx* pCbCtx, int32_t iCtx) {
+  const int32_t kiState = pCbCtx->m_sStateCtx[iCtx].State();
+  uint32_t uiRange = pCbCtx->m_uiRange;
+  uint32_t uiRangeLps = g_kuiCabacRangeLps[kiState][(uiRange & 0xff) >> 6];
+  uiRange -= uiRangeLps;
+  pCbCtx->m_sStateCtx[iCtx].Set (g_kuiStateTransTable[kiState][0],
+                                 pCbCtx->m_sStateCtx[iCtx].Mps() ^ (kiState == 0));
 
-void WelsCabacEncodeBypassOne (SCabacCtx* pCbCtx, uint32_t uiBin) {
-  pCbCtx->m_uiLow <<= 1;
-  if (uiBin) {
-    pCbCtx->m_uiLow += pCbCtx->m_uiRange;
-  }
-  if (pCbCtx->m_uiLow >= 1024) {
-    WelsCabacPutBit (pCbCtx, 1);
-    pCbCtx->m_uiLow -= 1024;
-  } else {
-    if (pCbCtx->m_uiLow < 512)
-      WelsCabacPutBit (pCbCtx, 0);
-    else {
-      pCbCtx->m_uiLow -= 512;
-      pCbCtx->m_iBitsOutstanding++;
-    }
-  }
-  pCbCtx->m_uiBinCountsInNalUnits++;
+  WelsCabacEncodeUpdateLow_ (pCbCtx);
+  pCbCtx->m_uiLow += uiRange;
+
+  const int32_t kiRenormAmount = g_kiClz5Table[uiRangeLps >> 3];
+  pCbCtx->m_uiRange = uiRangeLps << kiRenormAmount;
+  pCbCtx->m_iRenormCnt = kiRenormAmount;
 }
+
 void WelsCabacEncodeTerminate (SCabacCtx* pCbCtx, uint32_t uiBin) {
   pCbCtx->m_uiRange -= 2;
   if (uiBin) {
+    WelsCabacEncodeUpdateLow_ (pCbCtx);
     pCbCtx->m_uiLow  += pCbCtx->m_uiRange;
-    pCbCtx->m_uiRange = 2;
-    WelsCabacEncodeRenorm (pCbCtx);
-    WelsCabacPutBit (pCbCtx, ((pCbCtx->m_uiLow >> 9) & 1));
-    int32_t iLastTwoBits = (((pCbCtx->m_uiLow >> 7) & 3) | 1);
-    pCbCtx->m_uData = (pCbCtx->m_uData << 2) | iLastTwoBits;
-    pCbCtx->m_uiBitsUsed += 2;
+
+    const int32_t kiRenormAmount = 7;
+    pCbCtx->m_uiRange = 2 << kiRenormAmount;
+    pCbCtx->m_iRenormCnt = kiRenormAmount;
+
+    WelsCabacEncodeUpdateLow_ (pCbCtx);
+    pCbCtx->m_uiLow |= 0x80;
   } else {
-    WelsCabacEncodeRenorm (pCbCtx);
+    const int32_t kiRenormAmount = pCbCtx->m_uiRange >> 8 ^ 1;
+    pCbCtx->m_uiRange = pCbCtx->m_uiRange << kiRenormAmount;
+    pCbCtx->m_iRenormCnt += kiRenormAmount;
   }
-  pCbCtx->m_uiBinCountsInNalUnits++;
 }
 void WelsCabacEncodeUeBypass (SCabacCtx* pCbCtx, int32_t iExpBits, uint32_t uiVal) {
   int32_t iSufS = uiVal;
@@ -215,22 +184,18 @@
 
 void WelsCabacEncodeFlush (SCabacCtx* pCbCtx) {
   WelsCabacEncodeTerminate (pCbCtx, 1);
-  while (pCbCtx->m_uiBitsUsed > 0) {
-    if (pCbCtx->m_uiBitsUsed > 8) {
-      pCbCtx->m_uiBitsUsed -= 8;
-      uint32_t uiByte = pCbCtx->m_uData >> (pCbCtx->m_uiBitsUsed);
-      pCbCtx->m_uData &= (uint32_t) ((0xFFFFFFFF) >> (32 - pCbCtx->m_uiBitsUsed));
-      *pCbCtx->m_pBufCur ++ = uiByte;
-    } else {
-      if (pCbCtx->m_uiBitsUsed == 8) {
-        *pCbCtx->m_pBufCur ++ = pCbCtx->m_uData & 0xff;
-      } else {
-        *pCbCtx->m_pBufCur ++ = (pCbCtx->m_uData << (8 - pCbCtx->m_uiBitsUsed));
-      }
-      pCbCtx->m_uiBitsUsed = 0;
-    }
-  }
 
+  cabac_low_t uiLow = pCbCtx->m_uiLow;
+  int32_t iLowBitCnt = pCbCtx->m_iLowBitCnt;
+  uint8_t* pBufCur = pCbCtx->m_pBufCur;
+
+  uiLow <<= CABAC_LOW_WIDTH - 1 - iLowBitCnt;
+  if (uiLow & cabac_low_t (1) << (CABAC_LOW_WIDTH - 1))
+    PropagateCarry (pBufCur, pCbCtx->m_pBufStart);
+  for (; (iLowBitCnt -= 8) >= 0; uiLow <<= 8)
+    *pBufCur++ = uiLow >> (CABAC_LOW_WIDTH - 9);
+
+  pCbCtx->m_pBufCur = pBufCur;
 }
 
 uint8_t* WelsCabacEncodeGetPtr (SCabacCtx* pCbCtx) {
--- a/codec/encoder/core/src/svc_set_mb_syn_cabac.cpp
+++ b/codec/encoder/core/src/svc_set_mb_syn_cabac.cpp
@@ -41,8 +41,10 @@
 #include "set_mb_syn_cabac.h"
 #include "svc_enc_golomb.h"
 
-namespace WelsEnc {
+using namespace WelsEnc;
 
+namespace {
+
 static const uint16_t uiSignificantCoeffFlagOffset[5] = {0, 15, 29, 44, 47};
 static const uint16_t uiLastCoeffFlagOffset[5] = {0, 15, 29, 44, 47};
 static const uint16_t uiCoeffAbsLevelMinus1Offset[5] = {0, 10, 20, 30, 39};
@@ -455,21 +457,17 @@
                                    ECtxBlockCat eCtxBlockCat, int16_t  iIdx, int16_t iNonZeroCount, int16_t* pBlock, int16_t iEndIdx) {
   int32_t iCtx = WelsGetMbCtxCabac (pMbCache, pCurMb, iMbWidth, eCtxBlockCat, iIdx);
   if (iNonZeroCount) {
-    ENFORCE_STACK_ALIGN_1D (int16_t, iAbsLevel, 16, 16);
-    ENFORCE_STACK_ALIGN_1D (int16_t, iSignLevel, 16, 16);
+    int16_t iLevel[16];
     const int32_t iCtxSig = 105 + uiSignificantCoeffFlagOffset[eCtxBlockCat];
     const int32_t iCtxLast = 166 + uiLastCoeffFlagOffset[eCtxBlockCat];
     const int32_t iCtxLevel = 227 + uiCoeffAbsLevelMinus1Offset[eCtxBlockCat];
     int32_t iNonZeroIdx = 0;
     int32_t i = 0;
-    int32_t iNumAbsLevelEq1 = 0;
-    int32_t iNumAbsLevelGt1 = 0;
 
     WelsCabacEncodeDecision (pCabacCtx, iCtx, 1);
     while (1) {
       if (pBlock[i]) {
-        iSignLevel[iNonZeroIdx] = pBlock[i] < 0;
-        iAbsLevel[iNonZeroIdx] = WELS_ABS (pBlock[i]) - 1;
+        iLevel[iNonZeroIdx] = pBlock[i];
 
         iNonZeroIdx++;
         WelsCabacEncodeDecision (pCabacCtx, iCtxSig + i, 1);
@@ -483,33 +481,38 @@
         WelsCabacEncodeDecision (pCabacCtx, iCtxSig + i, 0);
       i++;
       if (i == iEndIdx) {
-        iSignLevel[iNonZeroIdx]   = pBlock[i] < 0;
-        iAbsLevel[iNonZeroIdx] = WELS_ABS (pBlock[i]) - 1;
+        iLevel[iNonZeroIdx] = pBlock[i];
         iNonZeroIdx++;
         break;
       }
     }
+
+    int32_t iNumAbsLevelGt1 = 0;
+    int32_t iCtx1 = iCtxLevel + 1;
+
     do {
       int32_t iPrefix = 0;
       iNonZeroIdx--;
-      iPrefix = WELS_MIN (iAbsLevel[iNonZeroIdx], 14);
+      iPrefix = WELS_ABS (iLevel[iNonZeroIdx]) - 1;
       if (iPrefix) {
-        iCtx = iCtxLevel + ((iNumAbsLevelGt1 != 0) ? 0 : WELS_MIN (4, 1 + iNumAbsLevelEq1));
+        iPrefix = WELS_MIN (iPrefix, 14);
+        iCtx = WELS_MIN (iCtxLevel + 4, iCtx1);
         WelsCabacEncodeDecision (pCabacCtx, iCtx, 1);
-        iCtx = iCtxLevel + 5 + WELS_MIN (4 - (eCtxBlockCat == CHROMA_DC), iNumAbsLevelGt1);
-        for (i = 0; i < iPrefix - 1; i++)
+        iNumAbsLevelGt1++;
+        iCtx = iCtxLevel + 4 + WELS_MIN (5 - (eCtxBlockCat == CHROMA_DC), iNumAbsLevelGt1);
+        for (i = 1; i < iPrefix; i++)
           WelsCabacEncodeDecision (pCabacCtx, iCtx, 1);
-        if (iPrefix < 14)
+        if (WELS_ABS (iLevel[iNonZeroIdx]) < 15)
           WelsCabacEncodeDecision (pCabacCtx, iCtx, 0);
         else
-          WelsCabacEncodeUeBypass (pCabacCtx, 0, iAbsLevel[iNonZeroIdx] - 14);
-        iNumAbsLevelGt1++;
+          WelsCabacEncodeUeBypass (pCabacCtx, 0, WELS_ABS (iLevel[iNonZeroIdx]) - 15);
+        iCtx1 = iCtxLevel;
       } else {
-        iCtx = iCtxLevel + ((iNumAbsLevelGt1 != 0) ? 0 : WELS_MIN (4, 1 + iNumAbsLevelEq1));
+        iCtx = WELS_MIN (iCtxLevel + 4, iCtx1);
         WelsCabacEncodeDecision (pCabacCtx, iCtx, 0);
-        iNumAbsLevelEq1++;
+        iCtx1 += iNumAbsLevelGt1 == 0;
       }
-      WelsCabacEncodeBypassOne (pCabacCtx, iSignLevel[iNonZeroIdx]);
+      WelsCabacEncodeBypassOne (pCabacCtx, iLevel[iNonZeroIdx] < 0);
     } while (iNonZeroIdx > 0);
 
   } else {
@@ -519,12 +522,10 @@
 
 }
 int32_t WelsCalNonZeroCount2x2Block (int16_t* pBlock) {
-  int32_t iCount = 0;
-  for (int16_t i = 0; i < 4; i++) {
-    if (pBlock[i])
-      iCount++;
-  }
-  return iCount;
+  return (pBlock[0] != 0)
+       + (pBlock[1] != 0)
+       + (pBlock[2] != 0)
+       + (pBlock[3] != 0);
 }
 int32_t WelsWriteMbResidualCabac (SWelsFuncPtrList* pFuncList, SSlice* pSlice, SMbCache* sMbCacheInfo, SMB* pCurMb,
                                   SCabacCtx* pCabacCtx,
@@ -617,6 +618,10 @@
   }
   return 0;
 }
+
+} // anon ns.
+
+namespace WelsEnc {
 
 void WelsInitSliceCabac (sWelsEncCtx* pEncCtx, SSlice* pSlice) {
   /* alignment needed */