shithub: openh264

Download patch

ref: 93af25eaefb49363ca04ab9b45abb9552e6e9980
parent: 2ca6af54b97040c87778bfb8ebc43a19d5a63832
parent: 3cb417f14a381609252d5c7b9d01ac877b982383
author: sijchen <sijchen@cisco.com>
date: Fri Jun 19 09:37:41 EDT 2015

Merge pull request #1993 from huili2/dec_mem_stat

add dec memory check

--- a/codec/decoder/core/inc/decoder.h
+++ b/codec/decoder/core/inc/decoder.h
@@ -144,7 +144,7 @@
 //update decoder statistics information
 void UpdateDecStat (PWelsDecoderContext pCtx, const bool kbOutput);
 //Destroy picutre buffer
-void DestroyPicBuff (PPicBuff* ppPicBuf);
+void DestroyPicBuff (PPicBuff* ppPicBuf, CMemoryAlign* pMa);
 #ifdef __cplusplus
 }
 #endif//__cplusplus
--- a/codec/decoder/core/inc/decoder_context.h
+++ b/codec/decoder/core/inc/decoder_context.h
@@ -55,6 +55,7 @@
 #include "mb_cache.h"
 #include "expand_pic.h"
 #include "mc.h"
+#include "memory_align.h"
 
 namespace WelsDec {
 #define MAX_PRED_MODE_ID_I16x16  3
@@ -449,6 +450,7 @@
   bool bDequantCoeff4x4Init;
   bool bSpsLatePps;
   bool bUseScalingList;
+  CMemoryAlign*     pMemAlign;
 } SWelsDecoderContext, *PWelsDecoderContext;
 
 static inline void ResetActiveSPSForEachLayer (PWelsDecoderContext pCtx) {
--- a/codec/decoder/core/inc/fmo.h
+++ b/codec/decoder/core/inc/fmo.h
@@ -43,6 +43,7 @@
 #include "typedefs.h"
 #include "wels_const.h"
 #include "parameter_sets.h"
+#include "memory_align.h"
 
 namespace WelsDec {
 
@@ -73,7 +74,7 @@
  *
  * \return  0 - successful; none 0 - failed;
  */
-int32_t InitFmo (PFmo pFmo, PPps pPps, const int32_t kiMbWidth, const int32_t kiMbHeight);
+int32_t InitFmo (PFmo pFmo, PPps pPps, const int32_t kiMbWidth, const int32_t kiMbHeight, CMemoryAlign* pMa);
 
 /*!
  * \brief   Uninitialize Wels Flexible Macroblock Ordering (FMO) list
@@ -84,7 +85,7 @@
  *
  * \return  NONE
  */
-void UninitFmoList (PFmo pFmo, const int32_t kiCnt, const int32_t kiAvail);
+void UninitFmoList (PFmo pFmo, const int32_t kiCnt, const int32_t kiAvail, CMemoryAlign* pMa);
 
 /*!
  * \brief   update/insert FMO parameter unit
@@ -96,7 +97,7 @@
  *
  * \return  true - update/insert successfully; false - failed;
  */
-bool FmoParamUpdate (PFmo pFmo, PSps pSps, PPps pPps, int32_t* pActiveFmoNum);
+bool FmoParamUpdate (PFmo pFmo, PSps pSps, PPps pPps, int32_t* pActiveFmoNum, CMemoryAlign* pMa);
 
 /*!
  * \brief   Get successive mb to be processed with given current mb_xy
--- a/codec/decoder/core/inc/memmgr_nal_unit.h
+++ b/codec/decoder/core/inc/memmgr_nal_unit.h
@@ -44,12 +44,13 @@
 #include "typedefs.h"
 #include "wels_common_basis.h"
 #include "nalu.h"
+#include "memory_align.h"
 
 namespace WelsDec {
 
-int32_t MemInitNalList (PAccessUnit* ppAu, const uint32_t kuiSize);
+int32_t MemInitNalList (PAccessUnit* ppAu, const uint32_t kuiSize, CMemoryAlign* pMa);
 
-int32_t MemFreeNalList (PAccessUnit* ppAu);
+int32_t MemFreeNalList (PAccessUnit* ppAu, CMemoryAlign* pMa);
 
 /*
  *  MemGetNextNal
@@ -56,7 +57,7 @@
  *  Get next NAL Unit for using.
  *  Need expand NAL Unit list if exceeding count number of available NAL Units withing an Access Unit
  */
-PNalUnit MemGetNextNal (PAccessUnit* ppAu);
+PNalUnit MemGetNextNal (PAccessUnit* ppAu, CMemoryAlign* pMa);
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/src/au_parser.cpp
+++ b/codec/decoder/core/src/au_parser.cpp
@@ -264,7 +264,7 @@
   case NAL_UNIT_CODED_SLICE_IDR: {
     PAccessUnit pCurAu = NULL;
     uint32_t uiAvailNalNum;
-    pCurNal = MemGetNextNal (&pCtx->pAccessUnitList);
+    pCurNal = MemGetNextNal (&pCtx->pAccessUnitList, pCtx->pMemAlign);
     if (NULL == pCurNal) {
       WelsLog (pLogCtx, WELS_LOG_ERROR, "MemGetNextNal() fail due out of memory.");
       pCtx->iErrorCode |= dsOutOfMemory;
@@ -1138,7 +1138,9 @@
 
       //re-write subset SPS to SPS
       SBitStringAux sSubsetSpsBs;
-      uint8_t* pBsBuf = static_cast<uint8_t*> (WelsMallocz (SPS_PPS_BS_SIZE + 4,
+      CMemoryAlign* pMa = pCtx->pMemAlign;
+
+      uint8_t* pBsBuf = static_cast<uint8_t*> (pMa->WelsMallocz (SPS_PPS_BS_SIZE + 4,
                         "Temp buffer for parse only usage.")); //to reserve 4 bytes for UVLC writing buffer
       if (NULL == pBsBuf) {
         pCtx->iErrorCode |= dsOutOfMemory;
@@ -1188,7 +1190,7 @@
       RBSP2EBSP (pSpsBs->pSpsBsBuf + 5, sSubsetSpsBs.pStartBuf, iRbspSize);
       pSpsBs->uiSpsBsLen = (uint16_t) (sSubsetSpsBs.pCurBuf - sSubsetSpsBs.pStartBuf + 5);
       if (pBsBuf) {
-        WelsFree (pBsBuf, "pBsBuf for parse only usage");
+        pMa->WelsFree (pBsBuf, "pBsBuf for parse only usage");
       }
     }
   }
@@ -1572,7 +1574,7 @@
   int32_t iCountNum = 0;
   if (NULL != pCtx) {
     // Fixed memory leak due to PPS_ID might not be continuous sometimes, 1/5/2010
-    UninitFmoList (&pCtx->sFmoList[0], MAX_PPS_COUNT, pCtx->iActiveFmoNum);
+    UninitFmoList (&pCtx->sFmoList[0], MAX_PPS_COUNT, pCtx->iActiveFmoNum, pCtx->pMemAlign);
     iCountNum = pCtx->iActiveFmoNum;
     pCtx->iActiveFmoNum = 0;
   }
--- a/codec/decoder/core/src/decoder.cpp
+++ b/codec/decoder/core/src/decoder.cpp
@@ -57,7 +57,7 @@
 
 extern PPicture AllocPicture (PWelsDecoderContext pCtx, const int32_t kiPicWidth, const int32_t kiPicHeight);
 
-extern void FreePicture (PPicture pPic);
+extern void FreePicture (PPicture pPic, CMemoryAlign* pMa);
 
 static int32_t CreatePicBuff (PWelsDecoderContext pCtx, PPicBuff* ppPicBuf, const int32_t kiSize,
                               const int32_t kiPicWidth, const int32_t kiPicHeight) {
@@ -67,17 +67,19 @@
     return 1;
   }
 
-  pPicBuf = (PPicBuff)WelsMallocz (sizeof (SPicBuff), "PPicBuff");
+  CMemoryAlign* pMa = pCtx->pMemAlign;
 
+  pPicBuf = (PPicBuff)pMa->WelsMallocz (sizeof (SPicBuff), "PPicBuff");
+
   if (NULL == pPicBuf) {
     return 1;
   }
 
-  pPicBuf->ppPic = (PPicture*)WelsMallocz (kiSize * sizeof (PPicture), "PPicture*");
+  pPicBuf->ppPic = (PPicture*)pMa->WelsMallocz (kiSize * sizeof (PPicture), "PPicture*");
 
   if (NULL == pPicBuf->ppPic) {
     pPicBuf->iCapacity = 0;
-    DestroyPicBuff (&pPicBuf);
+    DestroyPicBuff (&pPicBuf, pMa);
     return 1;
   }
 
@@ -86,7 +88,7 @@
     if (NULL == pPic) {
       // init capacity first for free memory
       pPicBuf->iCapacity = iPicIdx;
-      DestroyPicBuff (&pPicBuf);
+      DestroyPicBuff (&pPicBuf, pMa);
       return 1;
     }
     pPicBuf->ppPic[iPicIdx] = pPic;
@@ -109,17 +111,18 @@
     return 1;
   }
 
-  pPicNewBuf = (PPicBuff)WelsMallocz (sizeof (SPicBuff), "PPicBuff");
+  CMemoryAlign* pMa = pCtx->pMemAlign;
+  pPicNewBuf = (PPicBuff)pMa->WelsMallocz (sizeof (SPicBuff), "PPicBuff");
 
   if (NULL == pPicNewBuf) {
     return 1;
   }
 
-  pPicNewBuf->ppPic = (PPicture*)WelsMallocz (kiNewSize * sizeof (PPicture), "PPicture*");
+  pPicNewBuf->ppPic = (PPicture*)pMa->WelsMallocz (kiNewSize * sizeof (PPicture), "PPicture*");
 
   if (NULL == pPicNewBuf->ppPic) {
     pPicNewBuf->iCapacity = 0;
-    DestroyPicBuff (&pPicNewBuf);
+    DestroyPicBuff (&pPicNewBuf, pMa);
     return 1;
   }
 
@@ -129,7 +132,7 @@
     if (NULL == pPic) {
       // Set maximum capacity as the new malloc memory at the tail
       pPicNewBuf->iCapacity = iPicIdx;
-      DestroyPicBuff (&pPicNewBuf);
+      DestroyPicBuff (&pPicNewBuf, pMa);
       return 1;
     }
     pPicNewBuf->ppPic[iPicIdx] = pPic;
@@ -152,12 +155,12 @@
   }
 // remove old PicBuf
   if (pPicOldBuf->ppPic != NULL) {
-    WelsFree (pPicOldBuf->ppPic, "pPicOldBuf->queue");
+    pMa->WelsFree (pPicOldBuf->ppPic, "pPicOldBuf->queue");
     pPicOldBuf->ppPic = NULL;
   }
   pPicOldBuf->iCapacity = 0;
   pPicOldBuf->iCurrentIdx = 0;
-  WelsFree (pPicOldBuf, "pPicOldBuf");
+  pMa->WelsFree (pPicOldBuf, "pPicOldBuf");
   pPicOldBuf = NULL;
   return 0;
 }
@@ -171,17 +174,19 @@
     return 1;
   }
 
-  pPicNewBuf = (PPicBuff)WelsMallocz (sizeof (SPicBuff), "PPicBuff");
+  CMemoryAlign* pMa = pCtx->pMemAlign;
 
+  pPicNewBuf = (PPicBuff)pMa->WelsMallocz (sizeof (SPicBuff), "PPicBuff");
+
   if (NULL == pPicNewBuf) {
     return 1;
   }
 
-  pPicNewBuf->ppPic = (PPicture*)WelsMallocz (kiNewSize * sizeof (PPicture), "PPicture*");
+  pPicNewBuf->ppPic = (PPicture*)pMa->WelsMallocz (kiNewSize * sizeof (PPicture), "PPicture*");
 
   if (NULL == pPicNewBuf->ppPic) {
     pPicNewBuf->iCapacity = 0;
-    DestroyPicBuff (&pPicNewBuf);
+    DestroyPicBuff (&pPicNewBuf, pMa);
     return 1;
   }
 
@@ -207,7 +212,7 @@
   for (iPicIdx = iDelIdx; iPicIdx < kiOldSize; iPicIdx++) {
     if (iPrevPicIdx != iPicIdx) {
       if (pPicOldBuf->ppPic[iPicIdx] != NULL) {
-        FreePicture (pPicOldBuf->ppPic[iPicIdx]);
+        FreePicture (pPicOldBuf->ppPic[iPicIdx], pMa);
         pPicOldBuf->ppPic[iPicIdx] = NULL;
       }
     }
@@ -226,18 +231,18 @@
   }
   // remove old PicBuf
   if (pPicOldBuf->ppPic != NULL) {
-    WelsFree (pPicOldBuf->ppPic, "pPicOldBuf->queue");
+    pMa->WelsFree (pPicOldBuf->ppPic, "pPicOldBuf->queue");
     pPicOldBuf->ppPic = NULL;
   }
   pPicOldBuf->iCapacity = 0;
   pPicOldBuf->iCurrentIdx = 0;
-  WelsFree (pPicOldBuf, "pPicOldBuf");
+  pMa->WelsFree (pPicOldBuf, "pPicOldBuf");
   pPicOldBuf = NULL;
 
   return 0;
 }
 
-void DestroyPicBuff (PPicBuff* ppPicBuf) {
+void DestroyPicBuff (PPicBuff* ppPicBuf, CMemoryAlign* pMa) {
   PPicBuff pPicBuf = NULL;
 
   if (NULL == ppPicBuf || NULL == *ppPicBuf)
@@ -249,13 +254,13 @@
     while (iPicIdx < pPicBuf->iCapacity) {
       PPicture pPic = pPicBuf->ppPic[iPicIdx];
       if (pPic != NULL) {
-        FreePicture (pPic);
+        FreePicture (pPic, pMa);
       }
       pPic = NULL;
       ++ iPicIdx;
     }
 
-    WelsFree (pPicBuf->ppPic, "pPicBuf->queue");
+    pMa->WelsFree (pPicBuf->ppPic, "pPicBuf->queue");
 
     pPicBuf->ppPic = NULL;
   }
@@ -262,7 +267,7 @@
   pPicBuf->iCapacity = 0;
   pPicBuf->iCurrentIdx = 0;
 
-  WelsFree (pPicBuf, "pPicBuf");
+  pMa->WelsFree (pPicBuf, "pPicBuf");
 
   pPicBuf = NULL;
   *ppPicBuf = NULL;
@@ -270,10 +275,11 @@
 /*
  * fill data fields in default for decoder context
  */
-void WelsDecoderDefaults (PWelsDecoderContext pCtx, SLogContext* pLogCtx) {
+void WelsDecoderDefaults (PWelsDecoderContext pCtx, SLogContext* pLogCtx, CMemoryAlign* pMa) {
   int32_t iCpuCores               = 1;
   memset (pCtx, 0, sizeof (SWelsDecoderContext));       // fill zero first
   pCtx->sLogCtx = *pLogCtx;
+  pCtx->pMemAlign = pMa;
 
   pCtx->pArgDec                   = NULL;
 
@@ -360,6 +366,7 @@
   int32_t iListIdx              = 0;    //, mb_blocks   = 0;
   int32_t iPicQueueSize         = 0;    // adaptive size of picture queue, = (pSps->iNumRefFrames x 2)
   bool  bNeedChangePicQueue     = true;
+  CMemoryAlign* pMa = pCtx->pMemAlign;
 
   WELS_VERIFY_RETURN_IF (ERR_INFO_INVALID_PARAM, (NULL == pCtx || kiPicWidth <= 0 || kiPicHeight <= 0))
 
@@ -404,7 +411,7 @@
     for (iListIdx = LIST_0; iListIdx < LIST_A; ++ iListIdx) {
       PPicBuff* ppPic = &pCtx->pPicBuff[iListIdx];
       if (NULL != ppPic && NULL != *ppPic) {
-        DestroyPicBuff (ppPic);
+        DestroyPicBuff (ppPic, pMa);
       }
     }
 
@@ -425,7 +432,7 @@
   pCtx->pDec                = NULL;         // need prefetch a new pic due to spatial size changed
 
   if (pCtx->pCabacDecEngine == NULL)
-    pCtx->pCabacDecEngine = (SWelsCabacDecEngine*) WelsMallocz (sizeof (SWelsCabacDecEngine), "pCtx->pCabacDecEngine");
+    pCtx->pCabacDecEngine = (SWelsCabacDecEngine*) pMa->WelsMallocz (sizeof (SWelsCabacDecEngine), "pCtx->pCabacDecEngine");
   WELS_VERIFY_RETURN_IF (ERR_INFO_OUT_OF_MEMORY, (NULL == pCtx->pCabacDecEngine))
   return ERR_NONE;
 }
@@ -435,6 +442,7 @@
  */
 void WelsFreeMem (PWelsDecoderContext pCtx) {
   int32_t iListIdx = 0;
+  CMemoryAlign* pMa = pCtx->pMemAlign;
 
   /* TODO: free memory blocks introduced in avc */
   ResetFmoList (pCtx);
@@ -445,7 +453,7 @@
   for (iListIdx = LIST_0; iListIdx < LIST_A; ++ iListIdx) {
     PPicBuff* pPicBuff = &pCtx->pPicBuff[iListIdx];
     if (NULL != pPicBuff && NULL != *pPicBuff) {
-      DestroyPicBuff (pPicBuff);
+      DestroyPicBuff (pPicBuff, pMa);
     }
   }
 
@@ -456,7 +464,7 @@
   pCtx->iLastImgHeightInPixel = 0;
   pCtx->bFreezeOutput = true;
   pCtx->bHaveGotMemory = false;
-  WelsFree (pCtx->pCabacDecEngine, "pCtx->pCabacDecEngine");
+  pMa->WelsFree (pCtx->pCabacDecEngine, "pCtx->pCabacDecEngine");
 }
 
 /*!
@@ -517,8 +525,10 @@
   if (NULL == pCtx || NULL == kpParam)
     return 1;
 
-  pCtx->pParam = (SDecodingParam*)WelsMallocz (sizeof (SDecodingParam), "SDecodingParam");
+  CMemoryAlign* pMa = pCtx->pMemAlign;
 
+  pCtx->pParam = (SDecodingParam*)pMa->WelsMallocz (sizeof (SDecodingParam), "SDecodingParam");
+
   if (NULL == pCtx->pParam)
     return 1;
 
@@ -565,7 +575,7 @@
   }
 
   // default
-  WelsDecoderDefaults (pCtx, pLogCtx);
+  WelsDecoderDefaults (pCtx, pLogCtx, pCtx->pMemAlign);
 
   pCtx->bParseOnly = bParseOnly;
   // open decoder
@@ -834,7 +844,10 @@
              "SyncPictureResolutionExt()::InitialDqLayersContext--buffer allocated failure.");
     pCtx->iErrorCode = dsOutOfMemory;
   }
-
+#if defined(MEMORY_MONITOR)
+  WelsLog (& (pCtx->sLogCtx), WELS_LOG_INFO, "SyncPictureResolutionExt(), overall memory usage: %llu bytes",
+           static_cast<unsigned long long> (sizeof (SWelsDecoderContext) + pCtx->pMemAlign->WelsGetMemoryUsage()));
+#endif//MEMORY_MONITOR
   return iErr;
 }
 
--- a/codec/decoder/core/src/decoder_core.cpp
+++ b/codec/decoder/core/src/decoder_core.cpp
@@ -456,8 +456,10 @@
   if (pCtx == NULL)
     return ERR_INFO_INVALID_PTR;
 
+  CMemoryAlign* pMa = pCtx->pMemAlign;
+
   pCtx->iMaxBsBufferSizeInByte = MIN_ACCESS_UNIT_CAPACITY * MAX_BUFFERED_NUM;
-  if ((pCtx->sRawData.pHead = static_cast<uint8_t*> (WelsMallocz (pCtx->iMaxBsBufferSizeInByte,
+  if ((pCtx->sRawData.pHead = static_cast<uint8_t*> (pMa->WelsMallocz (pCtx->iMaxBsBufferSizeInByte,
                               "pCtx->sRawData.pHead"))) == NULL) {
     return ERR_INFO_OUT_OF_MEMORY;
   }
@@ -464,12 +466,12 @@
   pCtx->sRawData.pStartPos = pCtx->sRawData.pCurPos = pCtx->sRawData.pHead;
   pCtx->sRawData.pEnd = pCtx->sRawData.pHead + pCtx->iMaxBsBufferSizeInByte;
   if (pCtx->bParseOnly) {
-    pCtx->pParserBsInfo = static_cast<SParserBsInfo*> (WelsMallocz (sizeof (SParserBsInfo), "pCtx->pParserBsInfo"));
+    pCtx->pParserBsInfo = static_cast<SParserBsInfo*> (pMa->WelsMallocz (sizeof (SParserBsInfo), "pCtx->pParserBsInfo"));
     if (pCtx->pParserBsInfo == NULL) {
       return ERR_INFO_OUT_OF_MEMORY;
     }
     memset (pCtx->pParserBsInfo, 0, sizeof (SParserBsInfo));
-    pCtx->pParserBsInfo->pDstBuff = static_cast<uint8_t*> (WelsMallocz (MAX_ACCESS_UNIT_CAPACITY * sizeof (uint8_t),
+    pCtx->pParserBsInfo->pDstBuff = static_cast<uint8_t*> (pMa->WelsMallocz (MAX_ACCESS_UNIT_CAPACITY * sizeof (uint8_t),
                                     "pCtx->pParserBsInfo->pDstBuff"));
     if (pCtx->pParserBsInfo->pDstBuff == NULL) {
       return ERR_INFO_OUT_OF_MEMORY;
@@ -476,7 +478,7 @@
     }
     memset (pCtx->pParserBsInfo->pDstBuff, 0, MAX_ACCESS_UNIT_CAPACITY * sizeof (uint8_t));
 
-    if ((pCtx->sSavedData.pHead = static_cast<uint8_t*> (WelsMallocz (pCtx->iMaxBsBufferSizeInByte,
+    if ((pCtx->sSavedData.pHead = static_cast<uint8_t*> (pMa->WelsMallocz (pCtx->iMaxBsBufferSizeInByte,
                                   "pCtx->sSavedData.pHead"))) == NULL) {
       return ERR_INFO_OUT_OF_MEMORY;
     }
@@ -492,7 +494,8 @@
   int32_t iExpandStepShift = 1;
   int32_t iNewBuffLen = WELS_MAX ((kiSrcLen * MAX_BUFFERED_NUM), (pCtx->iMaxBsBufferSizeInByte << iExpandStepShift));
   //allocate new bs buffer
-  uint8_t* pNewBsBuff = static_cast<uint8_t*> (WelsMallocz (iNewBuffLen, "pCtx->sRawData.pHead"));
+  CMemoryAlign* pMa = pCtx->pMemAlign;
+  uint8_t* pNewBsBuff = static_cast<uint8_t*> (pMa->WelsMallocz (iNewBuffLen, "pCtx->sRawData.pHead"));
   if (pNewBsBuff == NULL)
     return ERR_INFO_OUT_OF_MEMORY;
 
@@ -510,7 +513,7 @@
   pCtx->sRawData.pStartPos = pNewBsBuff + (pCtx->sRawData.pStartPos - pCtx->sRawData.pHead);
   pCtx->sRawData.pCurPos = pNewBsBuff + (pCtx->sRawData.pCurPos - pCtx->sRawData.pHead);
   pCtx->sRawData.pEnd = pNewBsBuff + iNewBuffLen;
-  WelsFree (pCtx->sRawData.pHead, "pCtx->sRawData.pHead");
+  pMa->WelsFree (pCtx->sRawData.pHead, "pCtx->sRawData.pHead");
   pCtx->sRawData.pHead = pNewBsBuff;
   return ERR_NONE;
 }
@@ -545,7 +548,7 @@
     return ERR_INFO_INVALID_PTR;
   }
 
-  if (MemInitNalList (&pCtx->pAccessUnitList, MAX_NAL_UNIT_NUM_IN_AU) != 0)
+  if (MemInitNalList (&pCtx->pAccessUnitList, MAX_NAL_UNIT_NUM_IN_AU, pCtx->pMemAlign) != 0)
     return ERR_INFO_OUT_OF_MEMORY;
 
   if (InitBsBuffer (pCtx) != 0)
@@ -566,16 +569,18 @@
   if (pCtx == NULL)
     return;
 
+  CMemoryAlign* pMa = pCtx->pMemAlign;
+
   if (NULL != pCtx->pParam) {
-    WelsFree (pCtx->pParam, "pCtx->pParam");
+    pMa->WelsFree (pCtx->pParam, "pCtx->pParam");
 
     pCtx->pParam = NULL;
   }
 
-  MemFreeNalList (&pCtx->pAccessUnitList);
+  MemFreeNalList (&pCtx->pAccessUnitList, pMa);
 
   if (pCtx->sRawData.pHead) {
-    WelsFree (pCtx->sRawData.pHead, "pCtx->sRawData->pHead");
+    pMa->WelsFree (pCtx->sRawData.pHead, "pCtx->sRawData->pHead");
   }
   pCtx->sRawData.pHead                = NULL;
   pCtx->sRawData.pEnd                 = NULL;
@@ -583,7 +588,7 @@
   pCtx->sRawData.pCurPos              = NULL;
   if (pCtx->bParseOnly) {
     if (pCtx->sSavedData.pHead) {
-      WelsFree (pCtx->sSavedData.pHead, "pCtx->sSavedData->pHead");
+      pMa->WelsFree (pCtx->sSavedData.pHead, "pCtx->sSavedData->pHead");
     }
     pCtx->sSavedData.pHead                = NULL;
     pCtx->sSavedData.pEnd                 = NULL;
@@ -591,10 +596,10 @@
     pCtx->sSavedData.pCurPos              = NULL;
     if (pCtx->pParserBsInfo) {
       if (pCtx->pParserBsInfo->pDstBuff) {
-        WelsFree (pCtx->pParserBsInfo->pDstBuff, "pCtx->pParserBsInfo->pDstBuff");
+        pMa->WelsFree (pCtx->pParserBsInfo->pDstBuff, "pCtx->pParserBsInfo->pDstBuff");
         pCtx->pParserBsInfo->pDstBuff = NULL;
       }
-      WelsFree (pCtx->pParserBsInfo, "pCtx->pParserBsInfo");
+      pMa->WelsFree (pCtx->pParserBsInfo, "pCtx->pParserBsInfo");
       pCtx->pParserBsInfo = NULL;
     }
   }
@@ -640,7 +645,8 @@
  *  Parse slice header of bitstream in avc for storing data structure
  */
 int32_t ParseSliceHeaderSyntaxs (PWelsDecoderContext pCtx, PBitStringAux pBs, const bool kbExtensionFlag) {
-  PNalUnit const kpCurNal               = pCtx->pAccessUnitList->pNalUnitsList[pCtx->pAccessUnitList->uiAvailUnitsNum - 1];
+  PNalUnit const kpCurNal               = pCtx->pAccessUnitList->pNalUnitsList[pCtx->pAccessUnitList->uiAvailUnitsNum -
+                                          1];
 
   PNalUnitHeaderExt pNalHeaderExt       = NULL;
   PSliceHeader pSliceHead               = NULL;
@@ -970,7 +976,7 @@
   }
 
   bSgChangeCycleInvolved = (pPps->uiNumSliceGroups > 1 && pPps->uiSliceGroupMapType >= 3
-                             && pPps->uiSliceGroupMapType <= 5);
+                            && pPps->uiSliceGroupMapType <= 5);
   if (kbExtensionFlag && bSgChangeCycleInvolved)
     bSgChangeCycleInvolved = (bSgChangeCycleInvolved && (uiQualityId == BASE_QUALITY_ID));
   if (bSgChangeCycleInvolved) {
@@ -1030,7 +1036,8 @@
         pos.iRightOffset  = pSubsetSps->sSpsSvcExt.sSeqScaledRefLayer.iRightOffset;
         pos.iBottomOffset = pSubsetSps->sSpsSvcExt.sSeqScaledRefLayer.iBottomOffset * (2 - pSps->bFrameMbsOnlyFlag);
         //memcpy(&pSliceHeadExt->sScaledRefLayer, &pos, sizeof(SPosOffset));//confirmed_safe_unsafe_usage
-        pSliceHeadExt->iScaledRefLayerPicWidthInSampleLuma  = (pSliceHead->iMbWidth << 4) - (pos.iLeftOffset + pos.iRightOffset);
+        pSliceHeadExt->iScaledRefLayerPicWidthInSampleLuma  = (pSliceHead->iMbWidth << 4) -
+            (pos.iLeftOffset + pos.iRightOffset);
         pSliceHeadExt->iScaledRefLayerPicHeightInSampleLuma = (pSliceHead->iMbHeight << 4) -
             (pos.iTopOffset + pos.iBottomOffset) / (1 + pSliceHead->bFieldPicFlag);
       }
@@ -1218,11 +1225,12 @@
       && kiMaxHeight <= pCtx->iPicHeightReq) // have same dimension memory, skipped
     return ERR_NONE;
 
+  CMemoryAlign* pMa = pCtx->pMemAlign;
 
   UninitialDqLayersContext (pCtx);
 
   do {
-    PDqLayer pDq = (PDqLayer)WelsMallocz (sizeof (SDqLayer), "PDqLayer");
+    PDqLayer pDq = (PDqLayer)pMa->WelsMallocz (sizeof (SDqLayer), "PDqLayer");
 
     if (pDq == NULL)
       return ERR_INFO_OUT_OF_MEMORY;
@@ -1229,57 +1237,63 @@
 
     memset (pDq, 0, sizeof (SDqLayer));
 
-    pCtx->sMb.pMbType[i] = (int16_t*)WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int16_t),
+    pCtx->sMb.pMbType[i] = (int16_t*)pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int16_t),
                            "pCtx->sMb.pMbType[]");
-    pCtx->sMb.pMv[i][0] = (int16_t (*)[16][2])WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+    pCtx->sMb.pMv[i][0] = (int16_t (*)[16][2])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
                             int16_t) * MV_A * MB_BLOCK4x4_NUM, "pCtx->sMb.pMv[][]");
-    pCtx->sMb.pRefIndex[i][0] = (int8_t (*)[MB_BLOCK4x4_NUM])WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight *
+    pCtx->sMb.pRefIndex[i][0] = (int8_t (*)[MB_BLOCK4x4_NUM])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight *
                                 sizeof (
                                   int8_t) * MB_BLOCK4x4_NUM, "pCtx->sMb.pRefIndex[][]");
-    pCtx->sMb.pLumaQp[i] = (int8_t*)WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
+    pCtx->sMb.pLumaQp[i] = (int8_t*)pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
                            "pCtx->sMb.pLumaQp[]");
-    pCtx->sMb.pNoSubMbPartSizeLessThan8x8Flag[i] = (bool*)WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+    pCtx->sMb.pNoSubMbPartSizeLessThan8x8Flag[i] = (bool*)pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight *
+        sizeof (
           bool),
         "pCtx->sMb.pNoSubMbPartSizeLessThan8x8Flag[]");
-    pCtx->sMb.pTransformSize8x8Flag[i] = (bool*)WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (bool),
+    pCtx->sMb.pTransformSize8x8Flag[i] = (bool*)pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (bool),
                                          "pCtx->sMb.pTransformSize8x8Flag[]");
-    pCtx->sMb.pChromaQp[i] = (int8_t (*)[2])WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t) * 2,
+    pCtx->sMb.pChromaQp[i] = (int8_t (*)[2])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+                               int8_t) * 2,
                              "pCtx->sMb.pChromaQp[]");
-    pCtx->sMb.pMvd[i][0] = (int16_t (*)[16][2])WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+    pCtx->sMb.pMvd[i][0] = (int16_t (*)[16][2])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
                              int16_t) * MV_A * MB_BLOCK4x4_NUM, "pCtx->sMb.pMvd[][]");
-    pCtx->sMb.pCbfDc[i] = (uint16_t*)WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (uint16_t),
+    pCtx->sMb.pCbfDc[i] = (uint16_t*)pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (uint16_t),
                           "pCtx->sMb.pCbfDc[]");
-    pCtx->sMb.pNzc[i] = (int8_t (*)[24])WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t) * 24,
+    pCtx->sMb.pNzc[i] = (int8_t (*)[24])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t) * 24,
                         "pCtx->sMb.pNzc[]");
-    pCtx->sMb.pNzcRs[i] = (int8_t (*)[24])WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t) * 24,
+    pCtx->sMb.pNzcRs[i] = (int8_t (*)[24])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t) * 24,
                           "pCtx->sMb.pNzcRs[]");
-    pCtx->sMb.pScaledTCoeff[i] = (int16_t (*)[MB_COEFF_LIST_SIZE])WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight *
+    pCtx->sMb.pScaledTCoeff[i] = (int16_t (*)[MB_COEFF_LIST_SIZE])pMa->WelsMallocz (pCtx->sMb.iMbWidth *
+                                 pCtx->sMb.iMbHeight *
                                  sizeof (int16_t) * MB_COEFF_LIST_SIZE, "pCtx->sMb.pScaledTCoeff[]");
-    pCtx->sMb.pIntraPredMode[i] = (int8_t (*)[8])WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+    pCtx->sMb.pIntraPredMode[i] = (int8_t (*)[8])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
                                     int8_t) * 8,
                                   "pCtx->sMb.pIntraPredMode[]");
-    pCtx->sMb.pIntra4x4FinalMode[i] = (int8_t (*)[MB_BLOCK4x4_NUM])WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight *
+    pCtx->sMb.pIntra4x4FinalMode[i] = (int8_t (*)[MB_BLOCK4x4_NUM])pMa->WelsMallocz (pCtx->sMb.iMbWidth *
+                                      pCtx->sMb.iMbHeight *
                                       sizeof (int8_t) * MB_BLOCK4x4_NUM, "pCtx->sMb.pIntra4x4FinalMode[]");
-    pCtx->sMb.pIntraNxNAvailFlag[i] = (uint8_t (*))WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
+    pCtx->sMb.pIntraNxNAvailFlag[i] = (uint8_t (*))pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+                                        int8_t),
                                       "pCtx->sMb.pIntraNxNAvailFlag");
-    pCtx->sMb.pChromaPredMode[i] = (int8_t*)WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
+    pCtx->sMb.pChromaPredMode[i] = (int8_t*)pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
                                    "pCtx->sMb.pChromaPredMode[]");
-    pCtx->sMb.pCbp[i] = (int8_t*)WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
+    pCtx->sMb.pCbp[i] = (int8_t*)pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
                         "pCtx->sMb.pCbp[]");
-    pCtx->sMb.pSubMbType[i] = (int8_t (*)[MB_PARTITION_SIZE])WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight *
+    pCtx->sMb.pSubMbType[i] = (int8_t (*)[MB_PARTITION_SIZE])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight *
                               sizeof (
                                 int8_t) * MB_PARTITION_SIZE, "pCtx->sMb.pSubMbType[]");
-    pCtx->sMb.pSliceIdc[i] = (int32_t*) WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int32_t),
+    pCtx->sMb.pSliceIdc[i] = (int32_t*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int32_t),
                              "pCtx->sMb.pSliceIdc[]"); // using int32_t for slice_idc, 4/21/2010
-    pCtx->sMb.pResidualPredFlag[i] = (int8_t*) WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
+    pCtx->sMb.pResidualPredFlag[i] = (int8_t*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
                                      "pCtx->sMb.pResidualPredFlag[]");
-    //pCtx->sMb.pMotionPredFlag[i] = (uint8_t *) WelsMallocz(pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(uint8_t), "pCtx->sMb.pMotionPredFlag[]");
-    pCtx->sMb.pInterPredictionDoneFlag[i] = (int8_t*) WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+    //pCtx->sMb.pMotionPredFlag[i] = (uint8_t *) pMa->WelsMallocz(pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(uint8_t), "pCtx->sMb.pMotionPredFlag[]");
+    pCtx->sMb.pInterPredictionDoneFlag[i] = (int8_t*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
         int8_t), "pCtx->sMb.pInterPredictionDoneFlag[]");
 
-    pCtx->sMb.pMbCorrectlyDecodedFlag[i] = (bool*) WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (bool),
+    pCtx->sMb.pMbCorrectlyDecodedFlag[i] = (bool*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+        bool),
                                            "pCtx->sMb.pMbCorrectlyDecodedFlag[]");
-    pCtx->sMb.pMbRefConcealedFlag[i] = (bool*) WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (bool),
+    pCtx->sMb.pMbRefConcealedFlag[i] = (bool*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (bool),
                                        "pCtx->pMbRefConcealedFlag[]");
 
     // check memory block valid due above allocated..
@@ -1322,6 +1336,7 @@
 
 void UninitialDqLayersContext (PWelsDecoderContext pCtx) {
   int32_t i = 0;
+  CMemoryAlign* pMa = pCtx->pMemAlign;
 
   do {
     PDqLayer pDq = pCtx->pDqLayersList[i];
@@ -1331,101 +1346,101 @@
     }
 
     if (pCtx->sMb.pMbType[i]) {
-      WelsFree (pCtx->sMb.pMbType[i], "pCtx->sMb.pMbType[]");
+      pMa->WelsFree (pCtx->sMb.pMbType[i], "pCtx->sMb.pMbType[]");
 
       pCtx->sMb.pMbType[i] = NULL;
     }
 
     if (pCtx->sMb.pMv[i][0]) {
-      WelsFree (pCtx->sMb.pMv[i][0], "pCtx->sMb.pMv[][]");
+      pMa->WelsFree (pCtx->sMb.pMv[i][0], "pCtx->sMb.pMv[][]");
 
       pCtx->sMb.pMv[i][0] = NULL;
     }
 
     if (pCtx->sMb.pRefIndex[i][0]) {
-      WelsFree (pCtx->sMb.pRefIndex[i][0], "pCtx->sMb.pRefIndex[][]");
+      pMa->WelsFree (pCtx->sMb.pRefIndex[i][0], "pCtx->sMb.pRefIndex[][]");
 
       pCtx->sMb.pRefIndex[i][0] = NULL;
     }
 
     if (pCtx->sMb.pNoSubMbPartSizeLessThan8x8Flag[i]) {
-      WelsFree (pCtx->sMb.pNoSubMbPartSizeLessThan8x8Flag[i], "pCtx->sMb.pNoSubMbPartSizeLessThan8x8Flag[]");
+      pMa->WelsFree (pCtx->sMb.pNoSubMbPartSizeLessThan8x8Flag[i], "pCtx->sMb.pNoSubMbPartSizeLessThan8x8Flag[]");
 
       pCtx->sMb.pNoSubMbPartSizeLessThan8x8Flag[i] = NULL;
     }
 
     if (pCtx->sMb.pTransformSize8x8Flag[i]) {
-      WelsFree (pCtx->sMb.pTransformSize8x8Flag[i], "pCtx->sMb.pTransformSize8x8Flag[]");
+      pMa->WelsFree (pCtx->sMb.pTransformSize8x8Flag[i], "pCtx->sMb.pTransformSize8x8Flag[]");
 
       pCtx->sMb.pTransformSize8x8Flag[i] = NULL;
     }
 
     if (pCtx->sMb.pLumaQp[i]) {
-      WelsFree (pCtx->sMb.pLumaQp[i], "pCtx->sMb.pLumaQp[]");
+      pMa->WelsFree (pCtx->sMb.pLumaQp[i], "pCtx->sMb.pLumaQp[]");
 
       pCtx->sMb.pLumaQp[i] = NULL;
     }
 
     if (pCtx->sMb.pChromaQp[i]) {
-      WelsFree (pCtx->sMb.pChromaQp[i], "pCtx->sMb.pChromaQp[]");
+      pMa->WelsFree (pCtx->sMb.pChromaQp[i], "pCtx->sMb.pChromaQp[]");
 
       pCtx->sMb.pChromaQp[i] = NULL;
     }
 
     if (pCtx->sMb.pMvd[i][0]) {
-      WelsFree (pCtx->sMb.pMvd[i][0], "pCtx->sMb.pMvd[][]");
+      pMa->WelsFree (pCtx->sMb.pMvd[i][0], "pCtx->sMb.pMvd[][]");
       pCtx->sMb.pMvd[i][0] = NULL;
     }
 
     if (pCtx->sMb.pCbfDc[i]) {
-      WelsFree (pCtx->sMb.pCbfDc[i], "pCtx->sMb.pCbfDc[]");
+      pMa->WelsFree (pCtx->sMb.pCbfDc[i], "pCtx->sMb.pCbfDc[]");
       pCtx->sMb.pCbfDc[i] = NULL;
     }
 
     if (pCtx->sMb.pNzc[i]) {
-      WelsFree (pCtx->sMb.pNzc[i], "pCtx->sMb.pNzc[]");
+      pMa->WelsFree (pCtx->sMb.pNzc[i], "pCtx->sMb.pNzc[]");
 
       pCtx->sMb.pNzc[i] = NULL;
     }
 
     if (pCtx->sMb.pNzcRs[i]) {
-      WelsFree (pCtx->sMb.pNzcRs[i], "pCtx->sMb.pNzcRs[]");
+      pMa->WelsFree (pCtx->sMb.pNzcRs[i], "pCtx->sMb.pNzcRs[]");
 
       pCtx->sMb.pNzcRs[i] = NULL;
     }
 
     if (pCtx->sMb.pScaledTCoeff[i]) {
-      WelsFree (pCtx->sMb.pScaledTCoeff[i], "pCtx->sMb.pScaledTCoeff[]");
+      pMa->WelsFree (pCtx->sMb.pScaledTCoeff[i], "pCtx->sMb.pScaledTCoeff[]");
 
       pCtx->sMb.pScaledTCoeff[i] = NULL;
     }
 
     if (pCtx->sMb.pIntraPredMode[i]) {
-      WelsFree (pCtx->sMb.pIntraPredMode[i], "pCtx->sMb.pIntraPredMode[]");
+      pMa->WelsFree (pCtx->sMb.pIntraPredMode[i], "pCtx->sMb.pIntraPredMode[]");
 
       pCtx->sMb.pIntraPredMode[i] = NULL;
     }
 
     if (pCtx->sMb.pIntra4x4FinalMode[i]) {
-      WelsFree (pCtx->sMb.pIntra4x4FinalMode[i], "pCtx->sMb.pIntra4x4FinalMode[]");
+      pMa->WelsFree (pCtx->sMb.pIntra4x4FinalMode[i], "pCtx->sMb.pIntra4x4FinalMode[]");
 
       pCtx->sMb.pIntra4x4FinalMode[i] = NULL;
     }
 
     if (pCtx->sMb.pIntraNxNAvailFlag[i]) {
-      WelsFree (pCtx->sMb.pIntraNxNAvailFlag[i], "pCtx->sMb.pIntraNxNAvailFlag");
+      pMa->WelsFree (pCtx->sMb.pIntraNxNAvailFlag[i], "pCtx->sMb.pIntraNxNAvailFlag");
 
       pCtx->sMb.pIntraNxNAvailFlag[i] = NULL;
     }
 
     if (pCtx->sMb.pChromaPredMode[i]) {
-      WelsFree (pCtx->sMb.pChromaPredMode[i], "pCtx->sMb.pChromaPredMode[]");
+      pMa->WelsFree (pCtx->sMb.pChromaPredMode[i], "pCtx->sMb.pChromaPredMode[]");
 
       pCtx->sMb.pChromaPredMode[i] = NULL;
     }
 
     if (pCtx->sMb.pCbp[i]) {
-      WelsFree (pCtx->sMb.pCbp[i], "pCtx->sMb.pCbp[]");
+      pMa->WelsFree (pCtx->sMb.pCbp[i], "pCtx->sMb.pCbp[]");
 
       pCtx->sMb.pCbp[i] = NULL;
     }
@@ -1432,45 +1447,45 @@
 
     //      if (pCtx->sMb.pMotionPredFlag[i])
     //{
-    //  WelsFree( pCtx->sMb.pMotionPredFlag[i], "pCtx->sMb.pMotionPredFlag[]" );
+    //  pMa->WelsFree( pCtx->sMb.pMotionPredFlag[i], "pCtx->sMb.pMotionPredFlag[]" );
 
     //  pCtx->sMb.pMotionPredFlag[i] = NULL;
     //}
 
     if (pCtx->sMb.pSubMbType[i]) {
-      WelsFree (pCtx->sMb.pSubMbType[i], "pCtx->sMb.pSubMbType[]");
+      pMa->WelsFree (pCtx->sMb.pSubMbType[i], "pCtx->sMb.pSubMbType[]");
 
       pCtx->sMb.pSubMbType[i] = NULL;
     }
 
     if (pCtx->sMb.pSliceIdc[i]) {
-      WelsFree (pCtx->sMb.pSliceIdc[i], "pCtx->sMb.pSliceIdc[]");
+      pMa->WelsFree (pCtx->sMb.pSliceIdc[i], "pCtx->sMb.pSliceIdc[]");
 
       pCtx->sMb.pSliceIdc[i] = NULL;
     }
 
     if (pCtx->sMb.pResidualPredFlag[i]) {
-      WelsFree (pCtx->sMb.pResidualPredFlag[i], "pCtx->sMb.pResidualPredFlag[]");
+      pMa->WelsFree (pCtx->sMb.pResidualPredFlag[i], "pCtx->sMb.pResidualPredFlag[]");
 
       pCtx->sMb.pResidualPredFlag[i] = NULL;
     }
 
     if (pCtx->sMb.pInterPredictionDoneFlag[i]) {
-      WelsFree (pCtx->sMb.pInterPredictionDoneFlag[i], "pCtx->sMb.pInterPredictionDoneFlag[]");
+      pMa->WelsFree (pCtx->sMb.pInterPredictionDoneFlag[i], "pCtx->sMb.pInterPredictionDoneFlag[]");
 
       pCtx->sMb.pInterPredictionDoneFlag[i] = NULL;
     }
 
     if (pCtx->sMb.pMbCorrectlyDecodedFlag[i]) {
-      WelsFree (pCtx->sMb.pMbCorrectlyDecodedFlag[i], "pCtx->sMb.pMbCorrectlyDecodedFlag[]");
+      pMa->WelsFree (pCtx->sMb.pMbCorrectlyDecodedFlag[i], "pCtx->sMb.pMbCorrectlyDecodedFlag[]");
       pCtx->sMb.pMbCorrectlyDecodedFlag[i] = NULL;
     }
 
     if (pCtx->sMb.pMbRefConcealedFlag[i]) {
-      WelsFree (pCtx->sMb.pMbRefConcealedFlag[i], "pCtx->sMb.pMbRefConcealedFlag[]");
+      pMa->WelsFree (pCtx->sMb.pMbRefConcealedFlag[i], "pCtx->sMb.pMbRefConcealedFlag[]");
       pCtx->sMb.pMbRefConcealedFlag[i] = NULL;
     }
-    WelsFree (pDq, "pDq");
+    pMa->WelsFree (pDq, "pDq");
 
     pDq = NULL;
     pCtx->pDqLayersList[i] = NULL;
@@ -2184,7 +2199,7 @@
       pLayerInfo.pSubsetSps = pShExt->pSubsetSps;
 
       pCtx->pFmo = &pCtx->sFmoList[iPpsId];
-      if (!FmoParamUpdate (pCtx->pFmo, pLayerInfo.pSps, pLayerInfo.pPps, &pCtx->iActiveFmoNum)) {
+      if (!FmoParamUpdate (pCtx->pFmo, pLayerInfo.pSps, pLayerInfo.pPps, &pCtx->iActiveFmoNum, pCtx->pMemAlign)) {
         pCtx->iErrorCode |= dsBitstreamError;
         WelsLog (& (pCtx->sLogCtx), WELS_LOG_WARNING, "DecodeCurrentAccessUnit(), FmoParamUpdate failed, eSliceType: %d.",
                  pSh->eSliceType);
@@ -2192,7 +2207,7 @@
       }
 
       bFreshSliceAvailable = (iCurrIdD != iLastIdD
-                               || iCurrIdQ != iLastIdQ);        // do not need condition of (first_mb == 0) due multiple slices might be disorder
+                              || iCurrIdQ != iLastIdQ);        // do not need condition of (first_mb == 0) due multiple slices might be disorder
 
       WelsDqLayerDecodeStart (pCtx, pNalCur, pLayerInfo.pSps, pLayerInfo.pPps);
 
--- a/codec/decoder/core/src/fmo.cpp
+++ b/codec/decoder/core/src/fmo.cpp
@@ -116,7 +116,7 @@
  * \return  0 - successful; none 0 - failed
  */
 static inline int32_t FmoGenerateSliceGroup (PFmo pFmo, const PPps kpPps, const int32_t kiMbWidth,
-    const int32_t kiMbHeight) {
+    const int32_t kiMbHeight, CMemoryAlign* pMa) {
   int32_t iNumMb = 0;
   int32_t iErr   = 0;
   bool bResolutionChanged = false;
@@ -131,9 +131,8 @@
   if (0 == iNumMb)
     return 1;
 
-
-  WelsFree (pFmo->pMbAllocMap, "_fmo->pMbAllocMap");
-  pFmo->pMbAllocMap = (uint8_t*)WelsMallocz (iNumMb * sizeof (uint8_t), "_fmo->pMbAllocMap");
+  pMa->WelsFree (pFmo->pMbAllocMap, "_fmo->pMbAllocMap");
+  pFmo->pMbAllocMap = (uint8_t*)pMa->WelsMallocz (iNumMb * sizeof (uint8_t), "_fmo->pMbAllocMap");
   WELS_VERIFY_RETURN_IF (1, (NULL == pFmo->pMbAllocMap)) // out of memory
 
   pFmo->iCountMbNum = iNumMb;
@@ -186,8 +185,8 @@
  *
  * \return  0 - successful; none 0 - failed;
  */
-int32_t InitFmo (PFmo pFmo, PPps pPps, const int32_t kiMbWidth, const int32_t kiMbHeight) {
-  return FmoGenerateSliceGroup (pFmo, pPps, kiMbWidth, kiMbHeight);
+int32_t InitFmo (PFmo pFmo, PPps pPps, const int32_t kiMbWidth, const int32_t kiMbHeight, CMemoryAlign* pMa) {
+  return FmoGenerateSliceGroup (pFmo, pPps, kiMbWidth, kiMbHeight, pMa);
 }
 
 
@@ -200,7 +199,7 @@
  *
  * \return  NONE
  */
-void UninitFmoList (PFmo pFmo, const int32_t kiCnt, const int32_t kiAvail) {
+void UninitFmoList (PFmo pFmo, const int32_t kiCnt, const int32_t kiAvail, CMemoryAlign* pMa) {
   PFmo pIter = pFmo;
   int32_t i = 0;
   int32_t iFreeNodes = 0;
@@ -211,7 +210,7 @@
   while (i < kiCnt) {
     if (pIter != NULL && pIter->bActiveFlag) {
       if (NULL != pIter->pMbAllocMap) {
-        WelsFree (pIter->pMbAllocMap, "pIter->pMbAllocMap");
+        pMa->WelsFree (pIter->pMbAllocMap, "pIter->pMbAllocMap");
 
         pIter->pMbAllocMap = NULL;
       }
@@ -258,7 +257,7 @@
  *
  * \return  true - update/insert successfully; false - failed;
  */
-bool FmoParamUpdate (PFmo pFmo, PSps pSps, PPps pPps, int32_t* pActiveFmoNum) {
+bool FmoParamUpdate (PFmo pFmo, PSps pSps, PPps pPps, int32_t* pActiveFmoNum, CMemoryAlign* pMa) {
   const uint32_t kuiMbWidth = pSps->iMbWidth;
   const uint32_t kuiMbHeight = pSps->iMbHeight;
 
@@ -267,7 +266,7 @@
                            pPps->uiSliceGroupMapType,
                            pPps->uiNumSliceGroups)) {
 
-    if (InitFmo (pFmo, pPps, kuiMbWidth, kuiMbHeight)) {
+    if (InitFmo (pFmo, pPps, kuiMbWidth, kuiMbHeight, pMa)) {
       return false;
     } else {
       if (!pFmo->bActiveFlag && *pActiveFmoNum < MAX_PPS_COUNT) {
--- a/codec/decoder/core/src/memmgr_nal_unit.cpp
+++ b/codec/decoder/core/src/memmgr_nal_unit.cpp
@@ -43,7 +43,7 @@
 
 namespace WelsDec {
 
-int32_t MemInitNalList (PAccessUnit* ppAu, const uint32_t kuiSize) {
+int32_t MemInitNalList (PAccessUnit* ppAu, const uint32_t kuiSize, CMemoryAlign* pMa) {
   uint32_t uiIdx = 0;
   uint8_t* pBase = NULL, *pPtr = NULL;
   const uint32_t kuiSizeAu = sizeof (SAccessUnit);
@@ -55,10 +55,10 @@
     return 1;
 
   if (*ppAu != NULL) {
-    MemFreeNalList (ppAu);
+    MemFreeNalList (ppAu, pMa);
   }
 
-  pBase = (uint8_t*)WelsMallocz (kuiCountSize, "Access Unit");
+  pBase = (uint8_t*)pMa->WelsMallocz (kuiCountSize, "Access Unit");
   if (pBase == NULL)
     return 1;
   pPtr = pBase;
@@ -82,11 +82,11 @@
   return 0;
 }
 
-int32_t MemFreeNalList (PAccessUnit* ppAu) {
+int32_t MemFreeNalList (PAccessUnit* ppAu, CMemoryAlign* pMa) {
   if (ppAu != NULL) {
     PAccessUnit pAu = *ppAu;
     if (pAu != NULL) {
-      WelsFree (pAu, "Access Unit");
+      pMa->WelsFree (pAu, "Access Unit");
       *ppAu = NULL;
     }
   }
@@ -94,7 +94,7 @@
 }
 
 
-int32_t ExpandNalUnitList (PAccessUnit* ppAu, const int32_t kiOrgSize, const int32_t kiExpSize) {
+int32_t ExpandNalUnitList (PAccessUnit* ppAu, const int32_t kiOrgSize, const int32_t kiExpSize, CMemoryAlign* pMa) {
   if (kiExpSize <= kiOrgSize)
     return 1;
   else {
@@ -101,7 +101,7 @@
     PAccessUnit pTmp = NULL;
     int32_t iIdx = 0;
 
-    if (MemInitNalList (&pTmp, kiExpSize)) // request new list with expanding
+    if (MemInitNalList (&pTmp, kiExpSize, pMa)) // request new list with expanding
       return 1;
 
     do {
@@ -115,7 +115,7 @@
     pTmp->uiEndPos              = (*ppAu)->uiEndPos;
     pTmp->bCompletedAuFlag      = (*ppAu)->bCompletedAuFlag;
 
-    MemFreeNalList (ppAu); // free old list
+    MemFreeNalList (ppAu, pMa); // free old list
     *ppAu = pTmp;
     return 0;
   }
@@ -126,13 +126,13 @@
  *  Get next NAL Unit for using.
  *  Need expand NAL Unit list if exceeding count number of available NAL Units withing an Access Unit
  */
-PNalUnit MemGetNextNal (PAccessUnit* ppAu) {
+PNalUnit MemGetNextNal (PAccessUnit* ppAu, CMemoryAlign* pMa) {
   PAccessUnit pAu = *ppAu;
   PNalUnit pNu = NULL;
 
   if (pAu->uiAvailUnitsNum >= pAu->uiCountUnitsNum) { // need expand list
     const uint32_t kuiExpandingSize = pAu->uiCountUnitsNum + (MAX_NAL_UNIT_NUM_IN_AU >> 1);
-    if (ExpandNalUnitList (ppAu, pAu->uiCountUnitsNum, kuiExpandingSize))
+    if (ExpandNalUnitList (ppAu, pAu->uiCountUnitsNum, kuiExpandingSize, pMa))
       return NULL; // out of memory
     pAu = *ppAu;
   }
--- a/codec/decoder/core/src/pic_queue.cpp
+++ b/codec/decoder/core/src/pic_queue.cpp
@@ -44,7 +44,7 @@
 
 namespace WelsDec {
 
-void FreePicture (PPicture pPic);
+void FreePicture (PPicture pPic, CMemoryAlign* pMa);
 
 
 ///////////////////////////////////Recycled queue management for pictures///////////////////////////////////
@@ -68,8 +68,9 @@
   int32_t iPicChromaHeight  = 0;
   int32_t iLumaSize         = 0;
   int32_t iChromaSize       = 0;
+  CMemoryAlign* pMa = pCtx->pMemAlign;
 
-  pPic = (PPicture) WelsMallocz (sizeof (SPicture), "PPicture");
+  pPic = (PPicture) pMa->WelsMallocz (sizeof (SPicture), "PPicture");
   WELS_VERIFY_RETURN_IF (NULL, NULL == pPic);
 
   memset (pPic, 0, sizeof (SPicture));
@@ -88,9 +89,9 @@
     pPic->iLinesize[0] = iPicWidth;
     pPic->iLinesize[1] = pPic->iLinesize[2] = iPicChromaWidth;
   } else {
-    pPic->pBuffer[0] = static_cast<uint8_t*> (WelsMallocz (iLumaSize /* luma */
-                        + (iChromaSize << 1) /* Cb,Cr */, "_pic->buffer[0]"));
-    WELS_VERIFY_RETURN_PROC_IF (NULL, NULL == pPic->pBuffer[0], FreePicture (pPic));
+    pPic->pBuffer[0] = static_cast<uint8_t*> (pMa->WelsMallocz (iLumaSize /* luma */
+                       + (iChromaSize << 1) /* Cb,Cr */, "_pic->buffer[0]"));
+    WELS_VERIFY_RETURN_PROC_IF (NULL, NULL == pPic->pBuffer[0], FreePicture (pPic, pMa));
 
     memset (pPic->pBuffer[0], 128, (iLumaSize + (iChromaSize << 1)));
     pPic->iLinesize[0] = iPicWidth;
@@ -110,14 +111,14 @@
   return pPic;
 }
 
-void FreePicture (PPicture pPic) {
+void FreePicture (PPicture pPic, CMemoryAlign* pMa) {
   if (NULL != pPic) {
 
     if (pPic->pBuffer[0]) {
-      WelsFree (pPic->pBuffer[0], "pPic->pBuffer[0]");
+      pMa->WelsFree (pPic->pBuffer[0], "pPic->pBuffer[0]");
     }
 
-    WelsFree (pPic, "pPic");
+    pMa->WelsFree (pPic, "pPic");
 
     pPic = NULL;
   }
--- a/codec/decoder/plus/src/welsDecoderExt.cpp
+++ b/codec/decoder/plus/src/welsDecoderExt.cpp
@@ -219,11 +219,19 @@
   if (NULL == m_pDecContext)
     return;
 
-  WelsLog (&m_pWelsTrace->m_sLogCtx, WELS_LOG_INFO, "CWelsDecoder::uninit_decoder(), openh264 codec version = %s.",
+  WelsLog (&m_pWelsTrace->m_sLogCtx, WELS_LOG_INFO, "CWelsDecoder::UninitDecoder(), openh264 codec version = %s.",
            VERSION_NUMBER);
 
   WelsEndDecoder (m_pDecContext);
 
+  if (m_pDecContext->pMemAlign != NULL) {
+    WelsLog (&m_pWelsTrace->m_sLogCtx, WELS_LOG_INFO,
+             "CWelsDecoder::UninitDecoder(), verify memory usage (%d bytes) after free..",
+             m_pDecContext->pMemAlign->WelsGetMemoryUsage());
+    delete m_pDecContext->pMemAlign;
+    m_pDecContext->pMemAlign = NULL;
+  }
+
   if (NULL != m_pDecContext) {
     WelsFree (m_pDecContext, "m_pDecContext");
 
@@ -243,6 +251,9 @@
   m_pDecContext = (PWelsDecoderContext)WelsMallocz (sizeof (SWelsDecoderContext), "m_pDecContext");
   if (NULL == m_pDecContext)
     return cmMallocMemeError;
+  int32_t iCacheLineSize = 16;   // on chip cache line size in byte
+  m_pDecContext->pMemAlign = new CMemoryAlign (iCacheLineSize);
+  WELS_VERIFY_RETURN_PROC_IF (1, (NULL == m_pDecContext->pMemAlign), UninitDecoder())
 
   return WelsInitDecoder (m_pDecContext, bParseOnly, &m_pWelsTrace->m_sLogCtx);
 }
@@ -409,10 +420,10 @@
   //ppTmpDst[2] = ppDst[2];
   iRet |= DecodeFrame2 (NULL, 0, ppDst, pDstInfo);
   //if ((pDstInfo->iBufferStatus == 0) && (sTmpBufferInfo.iBufferStatus == 1)) {
-    //memcpy (pDstInfo, &sTmpBufferInfo, sizeof (SBufferInfo));
-    //ppDst[0] = ppTmpDst[0];
-    //ppDst[1] = ppTmpDst[1];
-    //ppDst[2] = ppTmpDst[2];
+  //memcpy (pDstInfo, &sTmpBufferInfo, sizeof (SBufferInfo));
+  //ppDst[0] = ppTmpDst[0];
+  //ppDst[1] = ppTmpDst[1];
+  //ppDst[2] = ppTmpDst[2];
   //}
 
   return (DECODING_STATE) iRet;
--- a/test/decoder/DecUT_ParseSyntax.cpp
+++ b/test/decoder/DecUT_ParseSyntax.cpp
@@ -65,6 +65,11 @@
   if (NULL == pCtx)
     return cmMallocMemeError;
 
+  if (NULL == pCtx->pMemAlign) {
+    pCtx->pMemAlign = new CMemoryAlign (16);
+    if (NULL == pCtx->pMemAlign)
+      return cmMallocMemeError;
+  }
 
   return WelsInitDecoder (pCtx, bParseOnly, pLogCtx);
 }
@@ -92,6 +97,10 @@
     return;
 
   WelsEndDecoder (pCtx);
+  if (NULL != pCtx->pMemAlign) {
+    delete pCtx->pMemAlign;
+    pCtx->pMemAlign = NULL;
+  }
   if (NULL != pCtx) {
     free (pCtx);
     pCtx = NULL;
@@ -160,6 +169,7 @@
   //
   m_pCtx = (PWelsDecoderContext)malloc (sizeof (SWelsDecoderContext));
 
+  memset (m_pCtx, 0, sizeof (SWelsDecoderContext));
   m_pWelsTrace = new welsCodecTrace();
   if (m_pWelsTrace != NULL) {
     m_pWelsTrace->SetTraceLevel (WELS_LOG_ERROR);