shithub: openh264

Download patch

ref: fd42f04c10de963c754793e30971d407954d0d45
parent: 2f5ecec958a91df3eaa14e7be590a8eb800cb28f
author: xiaotiansf <xiaotianshimail@gmail.com>
date: Wed Oct 16 05:19:16 EDT 2019

Second commit for threaded decoding support:
Add function WelsDecodeAndConstructSlice (and its associated functions), which combines decoding->reconstructing->deblocking in a single MB loop so that multiple frames can decode MBs in parallel.

--- a/codec/common/inc/expand_pic.h
+++ b/codec/common/inc/expand_pic.h
@@ -47,6 +47,7 @@
 #endif//__cplusplus
 
 #define PADDING_LENGTH 32 // reference extension
+#define CHROMA_PADDING_LENGTH 16 // chroma reference extension
 
 #if defined(X86_ASM)
 void ExpandPictureLuma_sse2 (uint8_t* pDst,
@@ -89,6 +90,10 @@
   PExpandPictureFunc pfExpandChromaPicture[2];
 } SExpandPicFunc;
 
+void PadMBLuma_c (uint8_t*& pDst, const int32_t& kiStride, const int32_t& kiPicW, const int32_t& kiPicH,
+                  const int32_t& kiMbX, const int32_t& kiMbY, const int32_t& kiMBWidth, const int32_t& kiMBHeight);
+void PadMBChroma_c (uint8_t*& pDst, const int32_t& kiStride, const int32_t& kiPicW, const int32_t& kiPicH,
+                    const int32_t& kiMbX, const int32_t& kiMbY, const int32_t& kiMBWidth, const int32_t& kiMBHeight);
 
 void ExpandReferencingPicture (uint8_t* pData[3], int32_t iWidth, int32_t iHeight, int32_t iStride[3],
                                PExpandPictureFunc pExpLuma, PExpandPictureFunc pExpChrom[2]);
--- a/codec/common/src/expand_pic.cpp
+++ b/codec/common/src/expand_pic.cpp
@@ -33,6 +33,240 @@
 #include "expand_pic.h"
 #include "cpu_core.h"
 
+static inline void MBPadTopLeftLuma_c (uint8_t*& pDst, const int32_t& kiStride) {
+  // Top-left luma MB: copy the first 16 pixels of the top picture row into each of the
+  // PADDING_LENGTH rows above it and fill the corner to their left with the top-left pixel.
+  const uint8_t kuiCorner = pDst[0];
+  uint8_t* pRow = pDst;
+  for (int32_t iRow = 0; iRow < PADDING_LENGTH; ++iRow) {
+    pRow -= kiStride;
+    memcpy (pRow, pDst, 16);           // confirmed_safe_unsafe_usage
+    memset (pRow - PADDING_LENGTH, kuiCorner, PADDING_LENGTH);
+  }
+}
+
+static inline void MBPadTopLuma_c (uint8_t*& pDst, const int32_t& kiStride, const int32_t& kiMbX) {
+  // Copy the 16 top-row pixels of MB column kiMbX into each of the
+  // PADDING_LENGTH rows above the picture.
+  uint8_t* const kpSrcRow = pDst + (kiMbX << 4);
+  uint8_t* pRow = kpSrcRow;
+  for (int32_t iRow = 0; iRow < PADDING_LENGTH; ++iRow) {
+    pRow -= kiStride;
+    memcpy (pRow, kpSrcRow, 16);          // confirmed_safe_unsafe_usage
+  }
+}
+
+static inline void MBPadBottomLuma_c (uint8_t*& pDst, const int32_t& kiStride, const int32_t& kiMbX,
+                                      const int32_t& kiPicH) {
+  // Copy the 16 pixels of MB column kiMbX taken from the last picture row (kiPicH - 1)
+  // into each of the PADDING_LENGTH rows below the picture.
+  uint8_t* const kpSrcRow = pDst + (kiPicH - 1) * kiStride + (kiMbX << 4);
+  uint8_t* pRow = kpSrcRow;
+  for (int32_t iRow = 0; iRow < PADDING_LENGTH; ++iRow) {
+    pRow += kiStride;
+    memcpy (pRow, kpSrcRow, 16);       // confirmed_safe_unsafe_usage
+  }
+}
+
+static inline void MBPadTopRightLuma_c (uint8_t*& pDst, const int32_t& kiStride, const int32_t& kiPicW) {
+  // Top-right luma MB: copy the last 16 pixels of the top picture row into each of the
+  // PADDING_LENGTH rows above it and fill the corner to their right with the top-right pixel.
+  uint8_t* const kpRowEnd = pDst + kiPicW; // one past the last pixel of the top row
+  const uint8_t kuiCorner = kpRowEnd[-1];
+  uint8_t* pRow = kpRowEnd;
+  for (int32_t iRow = 0; iRow < PADDING_LENGTH; ++iRow) {
+    pRow -= kiStride;
+    memcpy (pRow - 16, kpRowEnd - 16, 16);          // confirmed_safe_unsafe_usage
+    memset (pRow, kuiCorner, PADDING_LENGTH);
+  }
+}
+
+static inline void MBPadBottomLeftLuma_c (uint8_t*& pDst, const int32_t& kiStride, const int32_t& kiPicH) {
+  // Bottom-left luma MB: copy the first 16 pixels of the last picture row into each of the
+  // PADDING_LENGTH rows below it and fill the corner to their left with the bottom-left pixel.
+  uint8_t* const kpLastRow = pDst + (kiPicH - 1) * kiStride;
+  const uint8_t kuiCorner = kpLastRow[0];
+  uint8_t* pRow = kpLastRow;
+  for (int32_t iRow = 0; iRow < PADDING_LENGTH; ++iRow) {
+    pRow += kiStride;
+    memcpy (pRow, kpLastRow, 16);          // confirmed_safe_unsafe_usage
+    memset (pRow - PADDING_LENGTH, kuiCorner, PADDING_LENGTH);
+  }
+}
+
+static inline void MBPadBottomRightLuma_c (uint8_t*& pDst, const int32_t& kiStride, const int32_t& kiPicW,
+    const int32_t& kiPicH) {
+  // Bottom-right luma MB: copy the last 16 pixels of the last picture row into each of the
+  // PADDING_LENGTH rows below it and fill the corner to their right with the bottom-right pixel.
+  uint8_t* const kpRowEnd = pDst + (kiPicH - 1) * kiStride + kiPicW; // one past the last pixel
+  const uint8_t kuiCorner = kpRowEnd[-1];
+  uint8_t* pRow = kpRowEnd;
+  for (int32_t iRow = 0; iRow < PADDING_LENGTH; ++iRow) {
+    pRow += kiStride;
+    memcpy (pRow - 16, kpRowEnd - 16, 16);         // confirmed_safe_unsafe_usage
+    memset (pRow, kuiCorner, PADDING_LENGTH);
+  }
+}
+
+static inline void MBPadLeftLuma_c (uint8_t*& pDst, const int32_t& kiStride, const int32_t& kiMbY) {
+  // For each of the 16 rows of MB row kiMbY, repeat the row's first pixel
+  // PADDING_LENGTH times to the left of the picture.
+  uint8_t* pRow = pDst + (kiMbY << 4) * kiStride;
+  for (int32_t iRow = 16; iRow > 0; --iRow, pRow += kiStride) {
+    memset (pRow - PADDING_LENGTH, pRow[0], PADDING_LENGTH);
+  }
+}
+
+static inline void MBPadRightLuma_c (uint8_t*& pDst, const int32_t& kiStride, const int32_t& kiMbY,
+                                     const int32_t& kiPicW) {
+  // For each of the 16 rows of MB row kiMbY, repeat the row's last pixel
+  // PADDING_LENGTH times to the right of the picture.
+  uint8_t* pRowEnd = pDst + (kiMbY << 4) * kiStride + kiPicW;
+  for (int32_t iRow = 16; iRow > 0; --iRow, pRowEnd += kiStride) {
+    memset (pRowEnd, pRowEnd[-1], PADDING_LENGTH);
+  }
+}
+
+static inline void MBPadTopChroma_c (uint8_t*& pDst, const int32_t& kiStride, const int32_t& kiMbX) {
+  // Copy the 8 top-row pixels of chroma MB column kiMbX into each of the
+  // CHROMA_PADDING_LENGTH rows above the picture.
+  uint8_t* const kpSrcRow = pDst + (kiMbX << 3);
+  uint8_t* pRow = kpSrcRow;
+  for (int32_t iRow = 0; iRow < CHROMA_PADDING_LENGTH; ++iRow) {
+    pRow -= kiStride;
+    memcpy (pRow, kpSrcRow, 8);         // confirmed_safe_unsafe_usage
+  }
+}
+
+static inline void MBPadBottomChroma_c (uint8_t*& pDst, const int32_t& kiStride, const int32_t& kiMbX,
+                                        const int32_t& kiPicH) {
+  // Copy the 8 pixels of chroma MB column kiMbX taken from the last picture row (kiPicH - 1)
+  // into each of the CHROMA_PADDING_LENGTH rows below the picture.
+  uint8_t* const kpSrcRow = pDst + (kiPicH - 1) * kiStride + (kiMbX << 3);
+  uint8_t* pRow = kpSrcRow;
+  for (int32_t iRow = 0; iRow < CHROMA_PADDING_LENGTH; ++iRow) {
+    pRow += kiStride;
+    memcpy (pRow, kpSrcRow, 8);        // confirmed_safe_unsafe_usage
+  }
+}
+
+static inline void MBPadTopLeftChroma_c (uint8_t*& pDst, const int32_t& kiStride) {
+  // Top-left chroma MB: copy the first 8 pixels of the top picture row into each of the
+  // CHROMA_PADDING_LENGTH rows above it and fill the corner to their left with the top-left pixel.
+  const uint8_t kuiCorner = pDst[0];
+  uint8_t* pRow = pDst;
+  for (int32_t iRow = 0; iRow < CHROMA_PADDING_LENGTH; ++iRow) {
+    pRow -= kiStride;
+    memcpy (pRow, pDst, 8);          // confirmed_safe_unsafe_usage
+    memset (pRow - CHROMA_PADDING_LENGTH, kuiCorner, CHROMA_PADDING_LENGTH);
+  }
+}
+
+static inline void MBPadTopRightChroma_c (uint8_t*& pDst, const int32_t& kiStride, const int32_t& kiPicW) {
+  // Top-right chroma MB: copy the last 8 pixels of the top picture row into each of the
+  // CHROMA_PADDING_LENGTH rows above it and fill the corner to their right with the top-right pixel.
+  uint8_t* const kpRowEnd = pDst + kiPicW; // one past the last pixel of the top row
+  const uint8_t kuiCorner = kpRowEnd[-1];
+  uint8_t* pRow = kpRowEnd;
+  for (int32_t iRow = 0; iRow < CHROMA_PADDING_LENGTH; ++iRow) {
+    pRow -= kiStride;
+    memcpy (pRow - 8, kpRowEnd - 8, 8);         // confirmed_safe_unsafe_usage
+    memset (pRow, kuiCorner, CHROMA_PADDING_LENGTH);
+  }
+}
+
+static inline void MBPadBottomLeftChroma_c (uint8_t*& pDst, const int32_t& kiStride, const int32_t& kiPicH) {
+  // Bottom-left chroma MB: copy the first 8 pixels of the last picture row into each of the
+  // CHROMA_PADDING_LENGTH rows below it and fill the corner to their left with the bottom-left pixel.
+  uint8_t* const kpLastRow = pDst + (kiPicH - 1) * kiStride;
+  const uint8_t kuiCorner = kpLastRow[0];
+  uint8_t* pRow = kpLastRow;
+  for (int32_t iRow = 0; iRow < CHROMA_PADDING_LENGTH; ++iRow) {
+    pRow += kiStride;
+    memcpy (pRow, kpLastRow, 8);         // confirmed_safe_unsafe_usage
+    memset (pRow - CHROMA_PADDING_LENGTH, kuiCorner, CHROMA_PADDING_LENGTH);
+  }
+}
+
+static inline void MBPadBottomRightChroma_c (uint8_t*& pDst, const int32_t& kiStride, const int32_t& kiPicW,
+    const int32_t kiPicH) {
+  // Bottom-right chroma MB: copy the last 8 pixels of the last picture row into each of the
+  // CHROMA_PADDING_LENGTH rows below it and fill the corner to their right with the bottom-right pixel.
+  uint8_t* const kpRowEnd = pDst + (kiPicH - 1) * kiStride + kiPicW; // one past the last pixel
+  const uint8_t kuiCorner = kpRowEnd[-1];
+  uint8_t* pRow = kpRowEnd;
+  for (int32_t iRow = 0; iRow < CHROMA_PADDING_LENGTH; ++iRow) {
+    pRow += kiStride;
+    memcpy (pRow - 8, kpRowEnd - 8, 8);       // confirmed_safe_unsafe_usage
+    memset (pRow, kuiCorner, CHROMA_PADDING_LENGTH);
+  }
+}
+
+static inline void MBPadLeftChroma_c (uint8_t*& pDst, const int32_t& kiStride, const int32_t& kiMbY) {
+  // For each of the 8 chroma rows of MB row kiMbY, repeat the row's first pixel
+  // CHROMA_PADDING_LENGTH times to the left of the picture.
+  uint8_t* pRow = pDst + (kiMbY << 3) * kiStride;
+  for (int32_t iRow = 8; iRow > 0; --iRow, pRow += kiStride) {
+    memset (pRow - CHROMA_PADDING_LENGTH, pRow[0], CHROMA_PADDING_LENGTH);
+  }
+}
+
+static inline void MBPadRightChroma_c (uint8_t*& pDst, const int32_t& kiStride, const int32_t& kiMbY,
+                                       const int32_t& kiPicW) {
+  // For each of the 8 chroma rows of MB row kiMbY, repeat the row's last pixel
+  // CHROMA_PADDING_LENGTH times to the right of the picture.
+  uint8_t* pRowEnd = pDst + (kiMbY << 3) * kiStride + kiPicW;
+  for (int32_t iRow = 8; iRow > 0; --iRow, pRowEnd += kiStride) {
+    memset (pRowEnd, pRowEnd[-1], CHROMA_PADDING_LENGTH);
+  }
+}
+
+void PadMBLuma_c (uint8_t*& pDst, const int32_t& kiStride, const int32_t& kiPicW, const int32_t& kiPicH,
+                  const int32_t& kiMbX, const int32_t& kiMbY, const int32_t& kiMBWidth, const int32_t& kiMBHeight) {
+  // Pads the luma plane outward around one MB that lies on the picture border.
+  // pDst is the plane origin; (kiMbX, kiMbY) is the MB position in a kiMBWidth x kiMBHeight grid.
+  // Every border is tested on its own: in a picture that is a single MB wide (or high) the same MB
+  // lies on two opposite borders, and an else-if chain would leave one side and its corners unpadded.
+  const bool kbLeft = (kiMbX == 0);
+  const bool kbRight = (kiMbX == kiMBWidth - 1);
+  const bool kbTop = (kiMbY == 0);
+  const bool kbBottom = (kiMbY == kiMBHeight - 1);
+  const bool kbInnerColumn = (kiMbX > 0) && (kiMbX < kiMBWidth - 1);
+  // corners: each helper also pads the 16-pixel strip directly above/below the MB
+  if (kbTop && kbLeft) { MBPadTopLeftLuma_c (pDst, kiStride); }
+  if (kbTop && kbRight) { MBPadTopRightLuma_c (pDst, kiStride, kiPicW); }
+  if (kbBottom && kbLeft) { MBPadBottomLeftLuma_c (pDst, kiStride, kiPicH); }
+  if (kbBottom && kbRight) { MBPadBottomRightLuma_c (pDst, kiStride, kiPicW, kiPicH); }
+  // left/right extension of the MB's own 16 rows
+  if (kbLeft) { MBPadLeftLuma_c (pDst, kiStride, kiMbY); }
+  if (kbRight) { MBPadRightLuma_c (pDst, kiStride, kiMbY, kiPicW); }
+  if (kbTop && kbInnerColumn) { MBPadTopLuma_c (pDst, kiStride, kiMbX); }
+  if (kbBottom && kbInnerColumn) { MBPadBottomLuma_c (pDst, kiStride, kiMbX, kiPicH); }
+}
+
+void PadMBChroma_c (uint8_t*& pDst, const int32_t& kiStride, const int32_t& kiPicW, const int32_t& kiPicH,
+                    const int32_t& kiMbX, const int32_t& kiMbY, const int32_t& kiMBWidth, const int32_t& kiMBHeight) {
+  // Pads one chroma plane outward around one MB that lies on the picture border.
+  // pDst is the plane origin; (kiMbX, kiMbY) is the MB position in a kiMBWidth x kiMBHeight grid.
+  // Every border is tested on its own: in a picture that is a single MB wide (or high) the same MB
+  // lies on two opposite borders, and an else-if chain would leave one side and its corners unpadded.
+  const bool kbLeft = (kiMbX == 0);
+  const bool kbRight = (kiMbX == kiMBWidth - 1);
+  const bool kbTop = (kiMbY == 0);
+  const bool kbBottom = (kiMbY == kiMBHeight - 1);
+  const bool kbInnerColumn = (kiMbX > 0) && (kiMbX < kiMBWidth - 1);
+  // corners: each helper also pads the 8-pixel strip directly above/below the MB
+  if (kbTop && kbLeft) { MBPadTopLeftChroma_c (pDst, kiStride); }
+  if (kbTop && kbRight) { MBPadTopRightChroma_c (pDst, kiStride, kiPicW); }
+  if (kbBottom && kbLeft) { MBPadBottomLeftChroma_c (pDst, kiStride, kiPicH); }
+  if (kbBottom && kbRight) { MBPadBottomRightChroma_c (pDst, kiStride, kiPicW, kiPicH); }
+  // left/right extension of the MB's own 8 chroma rows
+  if (kbLeft) { MBPadLeftChroma_c (pDst, kiStride, kiMbY); }
+  if (kbRight) { MBPadRightChroma_c (pDst, kiStride, kiMbY, kiPicW); }
+  if (kbTop && kbInnerColumn) { MBPadTopChroma_c (pDst, kiStride, kiMbX); }
+  if (kbBottom && kbInnerColumn) { MBPadBottomChroma_c (pDst, kiStride, kiMbX, kiPicH); }
+}
+
 // rewrite it (split into luma & chroma) that is helpful for mmx/sse2 optimization perform, 9/27/2009
 static inline void ExpandPictureLuma_c (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW,
                                         const int32_t kiPicH) {
--- a/codec/decoder/core/inc/deblocking.h
+++ b/codec/decoder/core/inc/deblocking.h
@@ -67,6 +67,26 @@
 void WelsDeblockingFilterSlice (PWelsDecoderContext pCtx, PDeblockingFilterMbFunc pDeblockMb);
 
 /*!
+* \brief   AVC slice init deblocking filtering target layer
+*
+* \in and out param   SDeblockingFilter
+* \in and out param   iFilterIdc
+*
+* \return  NONE
+*/
+void WelsDeblockingInitFilter (PWelsDecoderContext pCtx, SDeblockingFilter& pFilter, int32_t& iFilterIdc);
+
+/*!
+* \brief   AVC MB deblocking filtering target layer
+*
+* \param   DqLayer which has the current location of MB to be deblocked.
+*
+* \return  NONE
+*/
+void WelsDeblockingFilterMB (PDqLayer pCurDqLayer, SDeblockingFilter& pFilter, int32_t& iFilterIdc,
+                             PDeblockingFilterMbFunc pDeblockMb);
+
+/*!
  * \brief   pixel deblocking filtering
  *
  * \param   filter                deblocking filter
@@ -77,7 +97,8 @@
  * \return  NONE
  */
 
-uint32_t DeblockingBsMarginalMBAvcbase (PDeblockingFilter  pFilter, PDqLayer pCurDqLayer, int32_t iEdge, int32_t iNeighMb, int32_t iMbXy);
+uint32_t DeblockingBsMarginalMBAvcbase (PDeblockingFilter  pFilter, PDqLayer pCurDqLayer, int32_t iEdge,
+                                        int32_t iNeighMb, int32_t iMbXy);
 uint32_t DeblockingBSliceBsMarginalMBAvcbase (PDqLayer pCurDqLayer, int32_t iEdge, int32_t iNeighMb, int32_t iMbXy);
 
 int32_t DeblockingAvailableNoInterlayer (PDqLayer pCurDqLayer, int32_t iFilterIdc);
--- a/codec/decoder/core/inc/decode_slice.h
+++ b/codec/decoder/core/inc/decode_slice.h
@@ -58,6 +58,7 @@
 int32_t WelsTargetSliceConstruction (PWelsDecoderContext pCtx); //construction based on slice
 
 int32_t WelsDecodeSlice (PWelsDecoderContext pCtx, bool bFirstSliceInLayer, PNalUnit pNalCur);
+int32_t WelsDecodeAndConstructSlice (PWelsDecoderContext pCtx);
 
 int32_t WelsTargetMbConstruction (PWelsDecoderContext pCtx);
 
--- a/codec/decoder/core/inc/decoder.h
+++ b/codec/decoder/core/inc/decoder.h
@@ -69,6 +69,10 @@
 */
 void WelsDecoderSpsPpsDefaults (SWelsDecoderSpsPpsCTX& sSpsPpsCtx);
 
+/*!
+* \brief   copy SpsPps from one Ctx to another ctx for threaded code
+*/
+void CopySpsPps (PWelsDecoderContext pFromCtx, PWelsDecoderContext pToCtx);
 
 /*!
  *************************************************************************************
--- a/codec/decoder/core/inc/decoder_core.h
+++ b/codec/decoder/core/inc/decoder_core.h
@@ -129,6 +129,34 @@
  */
 bool PrefetchNalHeaderExtSyntax (PWelsDecoderContext pCtx, PNalUnit const kpDst, PNalUnit const kpSrc);
 
+/*
+* WelsDecodeInitAccessUnitStart
+* NOTE(review): the header text originally placed here was copied from other functions: it listed
+*  bit_len, buf_len and coded_au, none of which are parameters of this function, and repeated the
+*  AllocPicBuffOnNewSeqBegin summary. Confirm the real purpose against the implementation.
+* parameters visible in the prototype: pCtx (decoder context), pDstInfo (SBufferInfo*)
+* return:
+*  0 - success; otherwise returned error_no defined in error_no.h
+*/
+int32_t WelsDecodeInitAccessUnitStart (PWelsDecoderContext pCtx, SBufferInfo* pDstInfo);
+/*
+* AllocPicBuffOnNewSeqBegin
+* check and (re)allocate picture buffers on new sequence begin
+* return:
+*  0 - success; otherwise returned error_no defined in error_no.h
+*/
+int32_t AllocPicBuffOnNewSeqBegin (PWelsDecoderContext pCtx);
+
+/*
+* InitConstructAccessUnit
+* Init before constructing an access unit for given input bitstream, maybe partial NAL Unit, one or more Units are involved to
+* joint a collective access unit.
+* parameter\
+*  SBufferInfo:    Buffer info
+* return:
+*  0 - success; otherwise returned error_no defined in error_no.h
+*/
+int32_t InitConstructAccessUnit (PWelsDecoderContext pCtx, SBufferInfo* pDstInfo);
 
 /*
  * ConstructAccessUnit
--- a/codec/decoder/core/inc/manage_dec_ref.h
+++ b/codec/decoder/core/inc/manage_dec_ref.h
@@ -48,11 +48,12 @@
 namespace WelsDec {
 
 void  WelsResetRefPic (PWelsDecoderContext pCtx);
+void  WelsResetRefPicWithoutUnRef (PWelsDecoderContext pCtx);
 int32_t WelsInitRefList (PWelsDecoderContext pCtx, int32_t iPoc);
 int32_t WelsInitBSliceRefList (PWelsDecoderContext pCtx, int32_t iPoc);
 int32_t WelsReorderRefList (PWelsDecoderContext pCtx);
 int32_t WelsReorderRefList2 (PWelsDecoderContext pCtx);
-int32_t WelsMarkAsRef (PWelsDecoderContext pCtx);
+int32_t WelsMarkAsRef (PWelsDecoderContext pCtx, PPicture pLastDec = NULL);
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/pic_queue.h
+++ b/codec/decoder/core/inc/pic_queue.h
@@ -53,6 +53,7 @@
  */
 
 PPicture PrefetchPic (PPicBuff pPicBuff);  // To get current node applicable
+PPicture PrefetchPicForThread (PPicBuff pPicBuff); // To get current node applicable in the case of threaded mode
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/picture.h
+++ b/codec/decoder/core/inc/picture.h
@@ -37,6 +37,7 @@
 #include "typedefs.h"
 #include "wels_common_defs.h"
 #include "wels_const_common.h"
+#include "wels_decoder_thread.h"
 
 using namespace WelsCommon;
 
@@ -96,6 +97,7 @@
   int16_t (*pMv[LIST_A])[MB_BLOCK4x4_NUM][MV_A]; // used for direct mode
   int8_t (*pRefIndex[LIST_A])[MB_BLOCK4x4_NUM]; //used for direct mode
   struct SPicture* pRefPic[LIST_A][17];  //ref pictures used for direct mode
+  SWelsDecEvent* pReadyEvent;  //MB line ready event
 
 };// "Picture" declaration is comflict with Mac system
 
--- a/codec/decoder/core/inc/rec_mb.h
+++ b/codec/decoder/core/inc/rec_mb.h
@@ -74,7 +74,8 @@
   int32_t iPicHeight;
 } sMCRefMember;
 
-void BaseMC (sMCRefMember* pMCRefMem, int32_t iXOffset, int32_t iYOffset, SMcFunc* pMCFunc,
+void BaseMC (PWelsDecoderContext pCtx, sMCRefMember* pMCRefMem, const int32_t& listIdx, const int8_t& iRefIdx,
+             int32_t iXOffset, int32_t iYOffset, SMcFunc* pMCFunc,
              int32_t iBlkWidth, int32_t iBlkHeight, int16_t iMVs[2]);
 
 void WelsFillRecNeededMbInfo (PWelsDecoderContext pCtx, bool bOutput, PDqLayer pCurDqLayer);
--- a/codec/decoder/core/src/au_parser.cpp
+++ b/codec/decoder/core/src/au_parser.cpp
@@ -608,7 +608,7 @@
         pCtx->iErrorCode |= dsBitstreamError;
       return iErr;
     }
-
+    pCtx->bHasNewSps = true;
     break;
 
   case NAL_UNIT_PPS:
@@ -628,11 +628,12 @@
         pCtx->iErrorCode |= dsNoParamSets;
       else
         pCtx->iErrorCode |= dsBitstreamError;
+      pCtx->bHasNewSps = false;
       return iErr;
     }
 
     pCtx->sSpsPpsCtx.bPpsExistAheadFlag = true;
-
+    ++ (pCtx->sSpsPpsCtx.iSeqId);
     break;
 
   case NAL_UNIT_SEI:
--- a/codec/decoder/core/src/deblocking.cpp
+++ b/codec/decoder/core/src/deblocking.cpp
@@ -1377,6 +1377,55 @@
 }
 
 /*!
+* \brief   AVC slice init deblocking filtering target layer
+*
+* \in and out param   SDeblockingFilter
+* \in and out param   iFilterIdc
+*
+* \return  NONE
+*/
+void WelsDeblockingInitFilter (PWelsDecoderContext pCtx, SDeblockingFilter& pFilter, int32_t& iFilterIdc) {
+  // Zeroes pFilter, then fills it from the current layer's slice header and from the picture in
+  // pCtx->pDec; iFilterIdc receives the slice header's uiDisableDeblockingFilterIdc.
+  PDqLayer pLayer = pCtx->pCurDqLayer;
+  PSliceHeaderExt pHeaderExt = &pLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt;
+
+  memset (&pFilter, 0, sizeof (pFilter));
+  iFilterIdc = pHeaderExt->sSliceHeader.uiDisableDeblockingFilterIdc;
+
+  // the three plane pointers of the picture being decoded
+  for (int32_t iPlane = 0; iPlane < 3; ++iPlane) {
+    pFilter.pCsData[iPlane] = pCtx->pDec->pData[iPlane];
+  }
+  // only the first two strides are recorded, together with both reference lists
+  for (int32_t iIdx = 0; iIdx < 2; ++iIdx) {
+    pFilter.iCsStride[iIdx] = pCtx->pDec->iLinesize[iIdx];
+    pFilter.pRefPics[iIdx] = pCtx->sRefPic.pRefList[iIdx];
+  }
+
+  // slice-level filter parameters and the filter function table
+  pFilter.eSliceType = (EWelsSliceType)pLayer->sLayerInfo.sSliceInLayer.eSliceType;
+  pFilter.iSliceAlphaC0Offset = pHeaderExt->sSliceHeader.iSliceAlphaC0Offset;
+  pFilter.iSliceBetaOffset = pHeaderExt->sSliceHeader.iSliceBetaOffset;
+  pFilter.pLoopf = &pCtx->sDeblockingFunc;
+}
+
+/*!
+* \brief   AVC MB deblocking filtering target layer
+*
+* \param   DqLayer which has the current location of MB to be deblocked.
+*
+* \return  NONE
+*/
+void WelsDeblockingFilterMB (PDqLayer pCurDqLayer, SDeblockingFilter& pFilter, int32_t& iFilterIdc,
+                             PDeblockingFilterMbFunc pDeblockMb) {
+  // Deblocks the current MB of pCurDqLayer; nothing is filtered unless iFilterIdc is 0 or 2.
+  if (iFilterIdc != 0 && iFilterIdc != 2) {
+    return;
+  }
+  pDeblockMb (pCurDqLayer, &pFilter, DeblockingAvailableNoInterlayer (pCurDqLayer, iFilterIdc));
+}
+/*!
  * \brief   deblocking module initialize
  *
  * \param   pf
--- a/codec/decoder/core/src/decode_slice.cpp
+++ b/codec/decoder/core/src/decode_slice.cpp
@@ -93,7 +93,7 @@
 
   int32_t iTotalNumMb = pCurSlice->iTotalMbInCurSlice;
   int32_t iCountNumMb = 0;
-  PDeblockingFilterMbFunc pDeblockMb;
+  PDeblockingFilterMbFunc pDeblockMb = WelsDeblockingMb;
 
   if (!pCtx->sSpsPpsCtx.bAvcBasedFlag && iCurLayerWidth != pCtx->iCurSeqIntervalMaxPicWidth) {
     return ERR_INFO_WIDTH_MISMATCH;
@@ -163,8 +163,6 @@
   if (pCtx->pParam->bParseOnly) //for parse only, deblocking should not go on
     return ERR_NONE;
 
-  pDeblockMb = WelsDeblockingMb;
-
   if (1 == pSliceHeader->uiDisableDeblockingFilterIdc
       || pCtx->pCurDqLayer->sLayerInfo.sSliceInLayer.iTotalMbInCurSlice <= 0) {
     return ERR_NONE;//NO_SUPPORTED_FILTER_IDX
@@ -238,8 +236,10 @@
   }
   WelsMbInterSampleConstruction (pCtx, pCurDqLayer, pDstY, pDstCb, pDstCr, iLumaStride, iChromaStride);
 
-  pCtx->sBlockFunc.pWelsSetNonZeroCountFunc (
-    pCurDqLayer->pNzc[pCurDqLayer->iMbXyIndex]); // set all none-zero nzc to 1; dbk can be opti!
+  if (pCtx->pThreadCtx == NULL) {
+    pCtx->sBlockFunc.pWelsSetNonZeroCountFunc (
+      pCurDqLayer->pNzc[pCurDqLayer->iMbXyIndex]); // set all none-zero nzc to 1; dbk can be opti!
+  }
   return ERR_NONE;
 }
 
@@ -723,7 +723,7 @@
     pCurDqLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp;
     for (i = 0; i < 2; i++) {
       pCurDqLayer->pChromaQp[iMbXy][i] = g_kuiChromaQpTable[WELS_CLIP3 ((pCurDqLayer->pLumaQp[iMbXy] +
-                                       pSliceHeader->pPps->iChromaQpIndexOffset[i]), 0, 51)];
+                                         pSliceHeader->pPps->iChromaQpIndexOffset[i]), 0, 51)];
     }
   }
 
@@ -738,7 +738,7 @@
     pSlice->iLastMbQp = pCurDqLayer->pLumaQp[iMbXy];
     for (i = 0; i < 2; i++) {
       pCurDqLayer->pChromaQp[iMbXy][i] = g_kuiChromaQpTable[WELS_CLIP3 ((pSlice->iLastMbQp +
-                                       pSliceHeader->pPps->iChromaQpIndexOffset[i]), 0, 51)];
+                                         pSliceHeader->pPps->iChromaQpIndexOffset[i]), 0, 51)];
     }
     if (MB_TYPE_INTRA16x16 == pCurDqLayer->pDec->pMbType[iMbXy]) {
       //step1: Luma DC
@@ -973,7 +973,7 @@
     pSlice->iLastMbQp = pCurDqLayer->pLumaQp[iMbXy];
     for (i = 0; i < 2; i++) {
       pCurDqLayer->pChromaQp[iMbXy][i] = g_kuiChromaQpTable[WELS_CLIP3 (pSlice->iLastMbQp +
-                                       pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
+                                         pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
     }
 
     if (MB_TYPE_INTRA16x16 == pCurDqLayer->pDec->pMbType[iMbXy]) {
@@ -1080,7 +1080,7 @@
     pCurDqLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp;
     for (i = 0; i < 2; i++) {
       pCurDqLayer->pChromaQp[iMbXy][i] = g_kuiChromaQpTable[WELS_CLIP3 (pCurDqLayer->pLumaQp[iMbXy] +
-                                       pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
+                                         pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
     }
   }
 
@@ -1214,7 +1214,7 @@
     pSlice->iLastMbQp = pCurDqLayer->pLumaQp[iMbXy];
     for (i = 0; i < 2; i++) {
       pCurDqLayer->pChromaQp[iMbXy][i] = g_kuiChromaQpTable[WELS_CLIP3 (pSlice->iLastMbQp +
-                                       pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
+                                         pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
     }
 
     if (MB_TYPE_INTRA16x16 == pCurDqLayer->pDec->pMbType[iMbXy]) {
@@ -1321,7 +1321,7 @@
     pCurDqLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp;
     for (i = 0; i < 2; i++) {
       pCurDqLayer->pChromaQp[iMbXy][i] = g_kuiChromaQpTable[WELS_CLIP3 (pCurDqLayer->pLumaQp[iMbXy] +
-                                       pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
+                                         pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
     }
   }
 
@@ -1365,7 +1365,9 @@
 
     pCurDqLayer->pInterPredictionDoneFlag[iMbXy] = 0;
     memset (pCurDqLayer->pDec->pRefIndex[0][iMbXy], 0, sizeof (int8_t) * 16);
-    pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[0] && ppRefPic[0]->bIsComplete);
+    bool bIsPending = pCtx->pThreadCtx != NULL;
+    pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[0] && (ppRefPic[0]->bIsComplete
+                            || bIsPending));
     //predict mv
     PredPSkipMvFromNeighbor (pCurDqLayer, pMv);
     for (i = 0; i < 16; i++) {
@@ -1381,7 +1383,7 @@
     pCurDqLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp; //??????????????? dqaunt of previous mb
     for (i = 0; i < 2; i++) {
       pCurDqLayer->pChromaQp[iMbXy][i] = g_kuiChromaQpTable[WELS_CLIP3 (pCurDqLayer->pLumaQp[iMbXy] +
-                                       pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
+                                         pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
     }
 
     //for neighboring CABAC usage
@@ -1419,6 +1421,8 @@
 
   memset (pCurDqLayer->pDirect[iMbXy], 0, sizeof (int8_t) * 16);
 
+  bool bIsPending = pCtx->pThreadCtx != NULL;
+
   if (uiCode) {
     int16_t pMv[LIST_A][2] = { {0, 0}, { 0, 0 } };
     int8_t  ref[LIST_A] = { 0 };
@@ -1433,8 +1437,8 @@
     pCurDqLayer->pInterPredictionDoneFlag[iMbXy] = 0;
     memset (pCurDqLayer->pDec->pRefIndex[LIST_0][iMbXy], 0, sizeof (int8_t) * 16);
     memset (pCurDqLayer->pDec->pRefIndex[LIST_1][iMbXy], 0, sizeof (int8_t) * 16);
-    pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPicL0[0] && ppRefPicL0[0]->bIsComplete)
-                            || ! (ppRefPicL1[0] && ppRefPicL1[0]->bIsComplete);
+    pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPicL0[0] && (ppRefPicL0[0]->bIsComplete
+                            || bIsPending)) || ! (ppRefPicL1[0] && (ppRefPicL1[0]->bIsComplete || bIsPending));
 
     if (pCtx->bMbRefConcealed) {
       SLogContext* pLogCtx = & (pCtx->sLogCtx);
@@ -1463,7 +1467,7 @@
     pCurDqLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp; //??????????????? dqaunt of previous mb
     for (i = 0; i < 2; i++) {
       pCurDqLayer->pChromaQp[iMbXy][i] = g_kuiChromaQpTable[WELS_CLIP3 (pCurDqLayer->pLumaQp[iMbXy] +
-                                       pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
+                                         pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
     }
 
     //for neighboring CABAC usage
@@ -1613,6 +1617,161 @@
   return ERR_NONE;
 }
 
+int32_t WelsDecodeAndConstructSlice (PWelsDecoderContext pCtx) {
+  // Decodes one slice MB by MB: each MB is parsed, reconstructed, passed to the deblocking step and,
+  // when pCtx->uiNalRefIdc > 0 and the MB is on the picture border, padded. Returns ERR_NONE or the first error.
+  PNalUnit pNalCur = pCtx->pNalCur;
+  PDqLayer pCurDqLayer = pCtx->pCurDqLayer;
+  PFmo pFmo = pCtx->pFmo;
+  int32_t iRet;
+  int32_t iNextMbXyIndex, iSliceIdc;
+  PSlice pSlice = &pCurDqLayer->sLayerInfo.sSliceInLayer;
+  PSliceHeaderExt pSliceHeaderExt = &pSlice->sSliceHeaderExt;
+  PSliceHeader pSliceHeader = &pSliceHeaderExt->sSliceHeader;
+  int32_t iMbX, iMbY;
+  const int32_t kiCountNumMb = pSliceHeader->pSps->uiTotalMbCount; //need to be correct when fmo or multi slice
+  int32_t iTotalMbTargetLayer = kiCountNumMb;
+  uint32_t uiEosFlag = 0;
+  PWelsDecMbFunc pDecMbFunc;
+  pSlice->iTotalMbInCurSlice = 0; //initialize at the starting of slice decoding.
+  // select the MB parser from the entropy coding mode and the slice type
+  if (pCtx->pPps->bEntropyCodingModeFlag) {
+    if (pSlice->sSliceHeaderExt.bAdaptiveMotionPredFlag ||
+        pSlice->sSliceHeaderExt.bAdaptiveBaseModeFlag ||
+        pSlice->sSliceHeaderExt.bAdaptiveResidualPredFlag) {
+      WelsLog (& (pCtx->sLogCtx), WELS_LOG_ERROR,
+               "WelsDecodeAndConstructSlice()::::ILP flag exist, not supported with CABAC enabled!");
+      pCtx->iErrorCode |= dsBitstreamError;
+      return dsBitstreamError;
+    }
+    if (P_SLICE == pSliceHeader->eSliceType)
+      pDecMbFunc = WelsDecodeMbCabacPSlice;
+    else if (B_SLICE == pSliceHeader->eSliceType)
+      pDecMbFunc = WelsDecodeMbCabacBSlice;
+    else //I_SLICE. B_SLICE is being supported
+      pDecMbFunc = WelsDecodeMbCabacISlice;
+  } else {
+    if (P_SLICE == pSliceHeader->eSliceType) {
+      pDecMbFunc = WelsDecodeMbCavlcPSlice;
+    } else if (B_SLICE == pSliceHeader->eSliceType) {
+      pDecMbFunc = WelsDecodeMbCavlcBSlice;
+    } else { //I_SLICE
+      pDecMbFunc = WelsDecodeMbCavlcISlice;
+    }
+  }
+  // pick the intra cache / neighbour-mapping callbacks according to bConstainedIntraPredFlag
+  if (pSliceHeader->pPps->bConstainedIntraPredFlag) {
+    pCtx->pFillInfoCacheIntraNxNFunc = WelsFillCacheConstrain1IntraNxN;
+    pCtx->pMapNxNNeighToSampleFunc = WelsMapNxNNeighToSampleConstrain1;
+    pCtx->pMap16x16NeighToSampleFunc = WelsMap16x16NeighToSampleConstrain1;
+  } else {
+    pCtx->pFillInfoCacheIntraNxNFunc = WelsFillCacheConstrain0IntraNxN;
+    pCtx->pMapNxNNeighToSampleFunc = WelsMapNxNNeighToSampleNormal;
+    pCtx->pMap16x16NeighToSampleFunc = WelsMap16x16NeighToSampleNormal;
+  }
+
+  pCtx->eSliceType = pSliceHeader->eSliceType;
+  if (pCurDqLayer->sLayerInfo.pPps->bEntropyCodingModeFlag == 1) {
+    int32_t iQp = pSlice->sSliceHeaderExt.sSliceHeader.iSliceQp;
+    int32_t iCabacInitIdc = pSlice->sSliceHeaderExt.sSliceHeader.iCabacInitIdc;
+    WelsCabacContextInit (pCtx, pSlice->eSliceType, iCabacInitIdc, iQp);
+    // start-of-slice CABAC state: last delta QP cleared, then the engine is initialised from pBitStringAux
+    pSlice->iLastDeltaQp = 0;
+    WELS_READ_VERIFY (InitCabacDecEngineFromBS (pCtx->pCabacDecEngine, pCtx->pCurDqLayer->pBitStringAux));
+  }
+  // compute the dequantisation coefficients / scaling lists for this slice
+  WelsCalcDeqCoeffScalingList (pCtx);
+
+  iNextMbXyIndex = pSliceHeader->iFirstMbInSlice;
+  iMbX = iNextMbXyIndex % pCurDqLayer->iMbWidth;
+  iMbY = iNextMbXyIndex / pCurDqLayer->iMbWidth; // error is introduced by multiple slices case, 11/23/2009
+  pSlice->iMbSkipRun = -1;
+  iSliceIdc = (pSliceHeader->iFirstMbInSlice << 7) + pCurDqLayer->uiLayerDqId;
+
+  pCurDqLayer->iMbX = iMbX;
+  pCurDqLayer->iMbY = iMbY;
+  pCurDqLayer->iMbXyIndex = iNextMbXyIndex;
+
+  PDeblockingFilterMbFunc pDeblockMb = WelsDeblockingMb;
+  // iFilterIdc stays 1 (nothing filtered, pFilter unused) unless a thread context exists and the slice enables deblocking
+  SDeblockingFilter pFilter;
+  int32_t iFilterIdc = 1;
+  if (pCtx->pThreadCtx && pSliceHeader->uiDisableDeblockingFilterIdc != 1) {
+    WelsDeblockingInitFilter (pCtx, pFilter, iFilterIdc);
+  }
+
+  do {
+    if ((-1 == iNextMbXyIndex) || (iNextMbXyIndex >= kiCountNumMb)) { // slice group boundary or end of a frame
+      break;
+    }
+
+    pCurDqLayer->pSliceIdc[iNextMbXyIndex] = iSliceIdc;
+    pCtx->bMbRefConcealed = false;
+    iRet = pDecMbFunc (pCtx, pNalCur, uiEosFlag);
+    pCurDqLayer->pMbRefConcealedFlag[iNextMbXyIndex] = pCtx->bMbRefConcealed;
+    if (iRet != ERR_NONE) {
+      return iRet;
+    }
+    if (WelsTargetMbConstruction (pCtx)) {
+      WelsLog (& (pCtx->sLogCtx), WELS_LOG_WARNING,
+               "WelsDecodeAndConstructSlice():::MB(%d, %d) construction error. pCurSlice_type:%d",
+               pCurDqLayer->iMbX, pCurDqLayer->iMbY, pSlice->eSliceType);
+
+      return ERR_INFO_MB_RECON_FAIL;
+    }
+    WelsDeblockingFilterMB (pCurDqLayer, pFilter, iFilterIdc, pDeblockMb);
+    if (pCtx->uiNalRefIdc > 0) {
+      if (pCurDqLayer->iMbX == 0 || pCurDqLayer->iMbX == pCurDqLayer->iMbWidth - 1 || pCurDqLayer->iMbY == 0
+          || pCurDqLayer->iMbY == pCurDqLayer->iMbHeight - 1) {
+        PadMBLuma_c (pCurDqLayer->pDec->pData[0], pCurDqLayer->pDec->iLinesize[0], pCurDqLayer->pDec->iWidthInPixel,
+                     pCurDqLayer->pDec->iHeightInPixel, pCurDqLayer->iMbX, pCurDqLayer->iMbY, pCurDqLayer->iMbWidth, pCurDqLayer->iMbHeight);
+        PadMBChroma_c (pCurDqLayer->pDec->pData[1], pCurDqLayer->pDec->iLinesize[1], pCurDqLayer->pDec->iWidthInPixel / 2,
+                       pCurDqLayer->pDec->iHeightInPixel / 2, pCurDqLayer->iMbX, pCurDqLayer->iMbY, pCurDqLayer->iMbWidth,
+                       pCurDqLayer->iMbHeight);
+        PadMBChroma_c (pCurDqLayer->pDec->pData[2], pCurDqLayer->pDec->iLinesize[2], pCurDqLayer->pDec->iWidthInPixel / 2,
+                       pCurDqLayer->pDec->iHeightInPixel / 2, pCurDqLayer->iMbX, pCurDqLayer->iMbY, pCurDqLayer->iMbWidth,
+                       pCurDqLayer->iMbHeight);
+      }
+    }
+    if (!pCurDqLayer->pMbCorrectlyDecodedFlag[iNextMbXyIndex]) { //already con-ed, overwrite
+      pCurDqLayer->pMbCorrectlyDecodedFlag[iNextMbXyIndex] = true;
+      pCtx->pDec->iMbEcedPropNum += (pCurDqLayer->pMbRefConcealedFlag[iNextMbXyIndex] ? 1 : 0);
+      ++pCtx->iTotalNumMbRec;
+    }
+
+    if (pCtx->iTotalNumMbRec > iTotalMbTargetLayer) {
+      WelsLog (& (pCtx->sLogCtx), WELS_LOG_WARNING,
+               "WelsDecodeAndConstructSlice():::pCtx->iTotalNumMbRec:%d, iTotalMbTargetLayer:%d",
+               pCtx->iTotalNumMbRec, iTotalMbTargetLayer);
+
+      return ERR_INFO_MB_NUM_EXCEED_FAIL;
+    }
+
+    ++pSlice->iTotalMbInCurSlice;
+    if (uiEosFlag) { //end of slice
+      SET_EVENT (&pCtx->pDec->pReadyEvent[pCurDqLayer->iMbY]);
+      break;
+    }
+    if (pSliceHeader->pPps->uiNumSliceGroups > 1) {
+      iNextMbXyIndex = FmoNextMb (pFmo, iNextMbXyIndex);
+    } else {
+      ++iNextMbXyIndex;
+    }
+    const int32_t iLastMby = iMbY, iLastMbx = iMbX; // position of the MB just finished
+    iMbX = iNextMbXyIndex % pCurDqLayer->iMbWidth;
+    iMbY = iNextMbXyIndex / pCurDqLayer->iMbWidth;
+    pCurDqLayer->iMbX = iMbX;
+    pCurDqLayer->iMbY = iMbY;
+    pCurDqLayer->iMbXyIndex = iNextMbXyIndex;
+    if ((iMbY > iLastMby) && (iLastMbx == pCurDqLayer->iMbWidth - 1)) {
+      SET_EVENT (&pCtx->pDec->pReadyEvent[iLastMby]);
+    }
+  } while (1);
+  // NOTE(review): if the loop ended on iNextMbXyIndex >= kiCountNumMb, iMbY may equal iMbHeight here -- confirm pReadyEvent has that entry
+  SET_EVENT (&pCtx->pDec->pReadyEvent[pCurDqLayer->iMbY]);
+  return ERR_NONE;
+}
+
 int32_t WelsActualDecodeMbCavlcISlice (PWelsDecoderContext pCtx) {
   SVlcTable* pVlcTable     = pCtx->pVlcTable;
   PDqLayer pCurDqLayer             = pCtx->pCurDqLayer;
@@ -1762,7 +1921,7 @@
     pCurDqLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp;
     for (i = 0; i < 2; i++) {
       pCurDqLayer->pChromaQp[iMbXy][i] = g_kuiChromaQpTable[WELS_CLIP3 (pCurDqLayer->pLumaQp[iMbXy] +
-                                       pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
+                                         pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
     }
 
   }
@@ -1782,8 +1941,8 @@
     pSlice->iLastMbQp = pCurDqLayer->pLumaQp[iMbXy];
     for (i = 0; i < 2; i++) {
       pCurDqLayer->pChromaQp[iMbXy][i] = g_kuiChromaQpTable[WELS_CLIP3 (pSlice->iLastMbQp +
-                                       pSliceHeader->pPps->iChromaQpIndexOffset[i], 0,
-                                       51)];
+                                         pSliceHeader->pPps->iChromaQpIndexOffset[i], 0,
+                                         51)];
     }
 
 
@@ -1876,7 +2035,8 @@
         int32_t iIndex = 16 + (i << 2);
         for (iId4x4 = 0; iId4x4 < 4; iId4x4++) {
           if ((iRet = WelsResidualBlockCavlc (pVlcTable, pNonZeroCount, pBs, iIndex, iScanIdxEnd - WELS_MAX (iScanIdxStart,
-                                              1) + 1, g_kuiZigzagScan + WELS_MAX (iScanIdxStart, 1), iMbResProperty, pCurDqLayer->pScaledTCoeff[iMbXy] + (iIndex << 4),
+                                              1) + 1, g_kuiZigzagScan + WELS_MAX (iScanIdxStart, 1), iMbResProperty,
+                                              pCurDqLayer->pScaledTCoeff[iMbXy] + (iIndex << 4),
                                               pCurDqLayer->pChromaQp[iMbXy][i], pCtx)) != ERR_NONE) {
             return iRet;//abnormal
           }
@@ -2126,11 +2286,12 @@
   ST32A4 (&pNzc[12], 0);
   ST32A4 (&pNzc[16], 0);
   ST32A4 (&pNzc[20], 0);
-  if (pCurDqLayer->pCbp[iMbXy] == 0 && !IS_INTRA16x16 (pCurDqLayer->pDec->pMbType[iMbXy]) && !IS_I_BL (pCurDqLayer->pDec->pMbType[iMbXy])) {
+  if (pCurDqLayer->pCbp[iMbXy] == 0 && !IS_INTRA16x16 (pCurDqLayer->pDec->pMbType[iMbXy])
+      && !IS_I_BL (pCurDqLayer->pDec->pMbType[iMbXy])) {
     pCurDqLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp;
     for (i = 0; i < 2; i++) {
       pCurDqLayer->pChromaQp[iMbXy][i] = g_kuiChromaQpTable[WELS_CLIP3 (pCurDqLayer->pLumaQp[iMbXy] +
-                                       pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
+                                         pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
     }
   }
 
@@ -2148,8 +2309,8 @@
     pSlice->iLastMbQp = pCurDqLayer->pLumaQp[iMbXy];
     for (i = 0; i < 2; i++) {
       pCurDqLayer->pChromaQp[iMbXy][i] = g_kuiChromaQpTable[WELS_CLIP3 (pSlice->iLastMbQp +
-                                       pSliceHeader->pPps->iChromaQpIndexOffset[i], 0,
-                                       51)];
+                                         pSliceHeader->pPps->iChromaQpIndexOffset[i], 0,
+                                         51)];
     }
 
     BsStartCavlc (pBs);
@@ -2251,7 +2412,8 @@
         int32_t iIndex = 16 + (i << 2);
         for (iId4x4 = 0; iId4x4 < 4; iId4x4++) {
           if ((iRet = WelsResidualBlockCavlc (pVlcTable, pNonZeroCount, pBs, iIndex, iScanIdxEnd - WELS_MAX (iScanIdxStart,
-                                              1) + 1, g_kuiZigzagScan + WELS_MAX (iScanIdxStart, 1), iMbResProperty, pCurDqLayer->pScaledTCoeff[iMbXy] + (iIndex << 4),
+                                              1) + 1, g_kuiZigzagScan + WELS_MAX (iScanIdxStart, 1), iMbResProperty,
+                                              pCurDqLayer->pScaledTCoeff[iMbXy] + (iIndex << 4),
                                               pCurDqLayer->pChromaQp[iMbXy][i], pCtx)) != ERR_NONE) {
             return iRet;//abnormal
           }
@@ -2305,7 +2467,9 @@
 
     pCurDqLayer->pInterPredictionDoneFlag[iMbXy] = 0;
     memset (pCurDqLayer->pDec->pRefIndex[0][iMbXy], 0, sizeof (int8_t) * 16);
-    pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[0] && ppRefPic[0]->bIsComplete);
+    bool bIsPending = pCtx->pThreadCtx != NULL;
+    pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[0] && (ppRefPic[0]->bIsComplete
+                            || bIsPending));
     //predict iMv
     PredPSkipMvFromNeighbor (pCurDqLayer, iMv);
     for (i = 0; i < 16; i++) {
@@ -2322,7 +2486,7 @@
       pCurDqLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp;
       for (i = 0; i < 2; i++) {
         pCurDqLayer->pChromaQp[iMbXy][i] = g_kuiChromaQpTable[WELS_CLIP3 (pCurDqLayer->pLumaQp[iMbXy] +
-                                         pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
+                                           pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
       }
     }
 
@@ -2400,8 +2564,9 @@
     pCurDqLayer->pInterPredictionDoneFlag[iMbXy] = 0;
     memset (pCurDqLayer->pDec->pRefIndex[LIST_0][iMbXy], 0, sizeof (int8_t) * 16);
     memset (pCurDqLayer->pDec->pRefIndex[LIST_1][iMbXy], 0, sizeof (int8_t) * 16);
-    pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPicL0[0] && ppRefPicL0[0]->bIsComplete)
-                            || ! (ppRefPicL1[0] && ppRefPicL1[0]->bIsComplete);
+    bool bIsPending = pCtx->pThreadCtx != NULL;
+    pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPicL0[0] && (ppRefPicL0[0]->bIsComplete
+                            || bIsPending)) || ! (ppRefPicL1[0] && (ppRefPicL1[0]->bIsComplete || bIsPending));
 
     /*if (pCtx->bMbRefConcealed) {
       SLogContext* pLogCtx = & (pCtx->sLogCtx);
@@ -2435,7 +2600,7 @@
       pCurDqLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp;
       for (i = 0; i < 2; i++) {
         pCurDqLayer->pChromaQp[iMbXy][i] = g_kuiChromaQpTable[WELS_CLIP3 (pCurDqLayer->pLumaQp[iMbXy] +
-                                         pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
+                                           pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
       }
     }
 
@@ -2665,11 +2830,12 @@
   ST32A4 (&pNzc[12], 0);
   ST32A4 (&pNzc[16], 0);
   ST32A4 (&pNzc[20], 0);
-  if (pCurDqLayer->pCbp[iMbXy] == 0 && !IS_INTRA16x16 (pCurDqLayer->pDec->pMbType[iMbXy]) && !IS_I_BL (pCurDqLayer->pDec->pMbType[iMbXy])) {
+  if (pCurDqLayer->pCbp[iMbXy] == 0 && !IS_INTRA16x16 (pCurDqLayer->pDec->pMbType[iMbXy])
+      && !IS_I_BL (pCurDqLayer->pDec->pMbType[iMbXy])) {
     pCurDqLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp;
     for (i = 0; i < 2; i++) {
       pCurDqLayer->pChromaQp[iMbXy][i] = g_kuiChromaQpTable[WELS_CLIP3 (pCurDqLayer->pLumaQp[iMbXy] +
-                                       pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
+                                         pSliceHeader->pPps->iChromaQpIndexOffset[i], 0, 51)];
     }
   }
 
@@ -2687,8 +2853,8 @@
     pSlice->iLastMbQp = pCurDqLayer->pLumaQp[iMbXy];
     for (i = 0; i < 2; i++) {
       pCurDqLayer->pChromaQp[iMbXy][i] = g_kuiChromaQpTable[WELS_CLIP3 (pSlice->iLastMbQp +
-                                       pSliceHeader->pPps->iChromaQpIndexOffset[i], 0,
-                                       51)];
+                                         pSliceHeader->pPps->iChromaQpIndexOffset[i], 0,
+                                         51)];
     }
 
     BsStartCavlc (pBs);
@@ -2790,7 +2956,8 @@
         int32_t iIndex = 16 + (i << 2);
         for (iId4x4 = 0; iId4x4 < 4; iId4x4++) {
           if ((iRet = WelsResidualBlockCavlc (pVlcTable, pNonZeroCount, pBs, iIndex, iScanIdxEnd - WELS_MAX (iScanIdxStart,
-                                              1) + 1, g_kuiZigzagScan + WELS_MAX (iScanIdxStart, 1), iMbResProperty, pCurDqLayer->pScaledTCoeff[iMbXy] + (iIndex << 4),
+                                              1) + 1, g_kuiZigzagScan + WELS_MAX (iScanIdxStart, 1), iMbResProperty,
+                                              pCurDqLayer->pScaledTCoeff[iMbXy] + (iIndex << 4),
                                               pCurDqLayer->pChromaQp[iMbXy][i], pCtx)) != ERR_NONE) {
             return iRet;//abnormal
           }
--- a/codec/decoder/core/src/decoder.cpp
+++ b/codec/decoder/core/src/decoder.cpp
@@ -52,6 +52,7 @@
 #include "decode_slice.h"
 #include "error_concealment.h"
 #include "memory_align.h"
+#include "wels_decoder_thread.h"
 
 namespace WelsDec {
 
@@ -321,12 +322,12 @@
 
   pCtx->pPicBuff          = NULL;
 
-  pCtx->sSpsPpsCtx.bAvcBasedFlag             = true;
+  //pCtx->sSpsPpsCtx.bAvcBasedFlag             = true;
   pCtx->pLastDecPicInfo->pPreviousDecodedPictureInDpb = NULL;
   pCtx->pDecoderStatistics->iAvgLumaQp = -1;
   pCtx->pDecoderStatistics->iStatisticsLogInterval = 1000;
   pCtx->bUseScalingList = false;
-  pCtx->sSpsPpsCtx.iSpsErrorIgnored = 0;
+  /*pCtx->sSpsPpsCtx.iSpsErrorIgnored = 0;
   pCtx->sSpsPpsCtx.iSubSpsErrorIgnored = 0;
   pCtx->sSpsPpsCtx.iPpsErrorIgnored = 0;
   pCtx->sSpsPpsCtx.iPPSInvalidNum = 0;
@@ -335,6 +336,7 @@
   pCtx->sSpsPpsCtx.iSPSLastInvalidId = -1;
   pCtx->sSpsPpsCtx.iSubSPSInvalidNum = 0;
   pCtx->sSpsPpsCtx.iSubSPSLastInvalidId = -1;
+  */
   pCtx->iFeedbackNalRefIdc = -1; //initialize
   pCtx->pLastDecPicInfo->iPrevPicOrderCntMsb = 0;
   pCtx->pLastDecPicInfo->iPrevPicOrderCntLsb = 0;
@@ -372,6 +374,33 @@
   sLastDecPicInfo.bLastHasMmco5 = false;
 }
 
+/*!
+* \brief   duplicate the SPS/PPS state of pFromCtx into pToCtx for threaded code; active layer SPS
+*          pointers used by the current AU are redirected to the matching slots of pToCtx's sSpsBuffer
+*/
+void CopySpsPps (PWelsDecoderContext pFromCtx, PWelsDecoderContext pToCtx) {
+  PSps pLayerSps[MAX_LAYER_NUM] = { NULL }; // indexed by dependency id, NULL when the layer is not in the AU
+  PAccessUnit pSrcAu = pFromCtx->pAccessUnitList;
+  pToCtx->sSpsPpsCtx = pFromCtx->sSpsPpsCtx;
+  // walk the NAL units of the current AU and record the SPS each dependency layer refers to
+  for (unsigned int uiNal = pSrcAu->uiStartPos; uiNal <= pSrcAu->uiEndPos; ++uiNal) {
+    const uint32_t kuiDid = pSrcAu->pNalUnitsList[uiNal]->sNalHeaderExt.uiDependencyId;
+    PSps pSps = pSrcAu->pNalUnitsList[uiNal]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.pSps;
+    // an SPS stored in the source sSpsBuffer is replaced by the slot with the same index in the destination
+    for (unsigned int uiSlot = 0; uiSlot < MAX_SPS_COUNT + 1; ++uiSlot) {
+      if (pSps == &pFromCtx->sSpsPpsCtx.sSpsBuffer[uiSlot]) {
+        pSps = &pToCtx->sSpsPpsCtx.sSpsBuffer[uiSlot];
+        break;
+      }
+    }
+    pLayerSps[kuiDid] = pSps;
+  }
+  for (int iDid = 0; iDid < MAX_LAYER_NUM; ++iDid) { // layers absent from the AU keep the copied pointer
+    if (pLayerSps[iDid] != NULL) {
+      pToCtx->sSpsPpsCtx.pActiveLayerSps[iDid] = pLayerSps[iDid];
+    }
+  }
+}
 
 /*
  *  destory_mb_blocks
--- a/codec/decoder/core/src/decoder_core.cpp
+++ b/codec/decoder/core/src/decoder_core.cpp
@@ -2212,6 +2212,89 @@
 }
 
 /*
+* WelsDecodeInitAccessUnitStart
+* reset the per-AU flags of pCtx, refresh bNewSeqBegin and run WelsDecodeAccessUnitStart for the current access unit
+*  pCtx:       decoder context holding the access unit list
+*  pDstInfo:   buffer info; iBufferStatus is set to 0 on failure unless pCtx->pParam->bParseOnly is set
+* on success pCtx->pSps and pCtx->pPps are taken from the NAL unit at uiStartPos of the access unit
+* return:
+*  0 - success; otherwise returned error_no defined in error_no.h
+*/
+int32_t WelsDecodeInitAccessUnitStart (PWelsDecoderContext pCtx, SBufferInfo* pDstInfo) {
+  PAccessUnit pCurAu = pCtx->pAccessUnitList;
+  pCtx->bAuReadyFlag = false;
+  pCtx->pLastDecPicInfo->bLastHasMmco5 = false;
+  // the call is the left operand so that it is never short-circuited away when the flag is already set
+  pCtx->bNewSeqBegin = CheckNewSeqBeginAndUpdateActiveLayerSps (pCtx) || pCtx->bNewSeqBegin;
+  const int32_t kiErr = WelsDecodeAccessUnitStart (pCtx);
+  GetVclNalTemporalId (pCtx); // runs whether or not kiErr reports a failure
+
+  if (ERR_NONE == kiErr) {
+    pCtx->pSps = pCurAu->pNalUnitsList[pCurAu->uiStartPos]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.pSps;
+    pCtx->pPps = pCurAu->pNalUnitsList[pCurAu->uiStartPos]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.pPps;
+    return kiErr;
+  }
+  // failure path: reset the AU, clear the output status and fold bNextNewSeqBegin into bNewSeqBegin
+  ForceResetCurrentAccessUnit (pCtx->pAccessUnitList);
+  if (!pCtx->pParam->bParseOnly) {
+    pDstInfo->iBufferStatus = 0;
+  }
+  pCtx->bNewSeqBegin = pCtx->bNewSeqBegin || pCtx->bNextNewSeqBegin;
+  pCtx->bNextNewSeqBegin = false;
+  if (pCtx->bNewSeqBegin) {
+    ResetActiveSPSForEachLayer (pCtx);
+  }
+  return kiErr;
+}
+
+/*
+* AllocPicBuffOnNewSeqBegin
+* sync the picture buffers with the resolution (iMbWidth x iMbHeight) of pCtx->pSps on new sequence begin;
+* the reference pictures are reset first unless pCtx carries a thread context
+* return: 0 - success; otherwise returned error_no defined in error_no.h
+*/
+int32_t AllocPicBuffOnNewSeqBegin (PWelsDecoderContext pCtx) {
+  if (NULL == pCtx->pThreadCtx) {
+    // no thread context: clear the reference pictures here before the buffers are synced
+    WelsResetRefPic (pCtx);
+  }
+
+  // try to allocate or relocate DPB memory for the new sequence
+  const int32_t kiRet = SyncPictureResolutionExt (pCtx, pCtx->pSps->iMbWidth, pCtx->pSps->iMbHeight);
+  if (kiRet != ERR_NONE) {
+    // the error code is returned unchanged below; a failure is only logged in addition
+    WelsLog (& (pCtx->sLogCtx), WELS_LOG_WARNING, "sync picture resolution ext failed,  the error is %d", kiRet);
+  }
+  return kiRet;
+}
+
+/*
+* InitConstructAccessUnit
+* Initialization performed before an access unit is constructed: runs WelsDecodeInitAccessUnitStart and,
+* when pCtx->bNewSeqBegin is set afterwards, AllocPicBuffOnNewSeqBegin.
+* parameter:
+*  pCtx:       decoder context
+*  pDstInfo:   buffer info handed on to WelsDecodeInitAccessUnitStart
+* return: 0 - success; otherwise returned error_no defined in error_no.h
+*/
+int32_t InitConstructAccessUnit (PWelsDecoderContext pCtx, SBufferInfo* pDstInfo) {
+  // step 1: per-AU initialization; a failure is handed back to the caller unchanged
+  const int32_t kiStartRet = WelsDecodeInitAccessUnitStart (pCtx, pDstInfo);
+  if (kiStartRet != ERR_NONE) {
+    return kiStartRet;
+  }
+
+  // step 2: only a new sequence needs the picture buffers to be (re)allocated;
+  // otherwise the successful result of step 1 is the result of the whole call
+  if (!pCtx->bNewSeqBegin) {
+    return kiStartRet;
+  }
+
+  // new sequence: the outcome of the buffer allocation is the outcome of the call
+  return AllocPicBuffOnNewSeqBegin (pCtx);
+}
+
+/*
  * ConstructAccessUnit
  * construct an access unit for given input bitstream, maybe partial NAL Unit, one or more Units are involved to
  * joint a collective access unit.
--- a/codec/decoder/core/src/error_concealment.cpp
+++ b/codec/decoder/core/src/error_concealment.cpp
@@ -245,7 +245,7 @@
     }
     iMVs[0] = iFullMVx - (iMbXInPix << 2);
     iMVs[1] = iFullMVy - (iMbYInPix << 2);
-    BaseMC (pMCRefMem, iMbXInPix, iMbYInPix, &pCtx->sMcFunc, 16, 16, iMVs);
+    BaseMC (pCtx, pMCRefMem, -1, -1, iMbXInPix, iMbYInPix, &pCtx->sMcFunc, 16, 16, iMVs);
   }
   return;
 }
--- a/codec/decoder/core/src/manage_dec_ref.cpp
+++ b/codec/decoder/core/src/manage_dec_ref.cpp
@@ -42,6 +42,7 @@
 #include "manage_dec_ref.h"
 #include "error_concealment.h"
 #include "error_code.h"
+#include "decoder.h"
 
 namespace WelsDec {
 
@@ -50,10 +51,10 @@
 static PPicture WelsDelShortFromListSetUnref (PRefPic pRefPic, int32_t iFrameNum);
 static PPicture WelsDelLongFromListSetUnref (PRefPic pRefPic, uint32_t uiLongTermFrameIdx);
 
-static int32_t MMCO (PWelsDecoderContext pCtx, PRefPicMarking pRefPicMarking);
-static int32_t MMCOProcess (PWelsDecoderContext pCtx, uint32_t uiMmcoType,
+static int32_t MMCO (PWelsDecoderContext pCtx, PRefPic pRefPic, PRefPicMarking pRefPicMarking);
+static int32_t MMCOProcess (PWelsDecoderContext pCtx, PRefPic pRefPic, uint32_t uiMmcoType,
                             int32_t iShortFrameNum, uint32_t uiLongTermPicNum, int32_t iLongTermFrameIdx, int32_t iMaxLongTermFrameIdx);
-static int32_t SlidingWindow (PWelsDecoderContext pCtx);
+static int32_t SlidingWindow (PWelsDecoderContext pCtx, PRefPic pRefPic);
 
 static int32_t AddShortTermToList (PRefPic pRefPic, PPicture pPic);
 static int32_t AddLongTermToList (PRefPic pRefPic, PPicture pPic, int32_t iLongTermFrameIdx, uint32_t uiLongTermPicNum);
@@ -63,7 +64,7 @@
 #ifdef LONG_TERM_REF
 int32_t GetLTRFrameIndex (PRefPic pRefPic, int32_t iAncLTRFrameNum);
 #endif
-static int32_t RemainOneBufferInDpbForEC (PWelsDecoderContext pCtx);
+static int32_t RemainOneBufferInDpbForEC (PWelsDecoderContext pCtx, PRefPic pRefPic);
 
 static void SetUnRef (PPicture pRef) {
   if (NULL != pRef) {
@@ -80,8 +81,14 @@
     pRef->iSpsId = -1;
     pRef->bIsComplete = false;
     for (int32_t i = 0; i < MAX_DPB_COUNT; ++i) {
-      pRef->pRefPic[LIST_0][i] = NULL;
-      pRef->pRefPic[LIST_1][i] = NULL;
+      if (pRef->pRefPic[LIST_0][i] != NULL) {
+        pRef->pRefPic[LIST_0][i]->bAvailableFlag = true;
+        pRef->pRefPic[LIST_0][i] = NULL;
+      }
+      if (pRef->pRefPic[LIST_1][i] != NULL) {
+        pRef->pRefPic[LIST_1][i]->bAvailableFlag = true;
+        pRef->pRefPic[LIST_1][i] = NULL;
+      }
     }
   }
 }
@@ -115,12 +122,32 @@
   pRefPic->uiLongRefCount[LIST_0] = 0;
 }
 
+void WelsResetRefPicWithoutUnRef (PWelsDecoderContext pCtx) {
+  // Empty the LIST_0 short/long term reference lists of pCtx->sRefPic and zero its reference counters.
+  // Only the list slots are set to NULL; the pictures they pointed to are not modified here.
+  PRefPic pRefPic = &pCtx->sRefPic;
+
+  // counters: both uiRefCount entries, plus the LIST_0 short and long term counts
+  pRefPic->uiRefCount[LIST_0] = 0;
+  pRefPic->uiRefCount[LIST_1] = 0;
+  pRefPic->uiShortRefCount[LIST_0] = 0;
+  pRefPic->uiLongRefCount[LIST_0] = 0;
+
+  // list slots: the short and long term lists have the same MAX_DPB_COUNT capacity,
+  // so a single pass clears both of them
+  for (int32_t iIdx = 0; iIdx < MAX_DPB_COUNT; ++iIdx) {
+    pRefPic->pShortRefList[LIST_0][iIdx] = NULL;
+    pRefPic->pLongRefList[LIST_0][iIdx] = NULL;
+  }
+}
+
 static int32_t WelsCheckAndRecoverForFutureDecoding (PWelsDecoderContext pCtx) {
-  if ((pCtx->sRefPic.uiShortRefCount[LIST_0] + pCtx->sRefPic.uiLongRefCount[LIST_0] <= 0) && (pCtx->eSliceType != I_SLICE
-      && pCtx->eSliceType != SI_SLICE)) {
+  if ((pCtx->sRefPic.uiShortRefCount[LIST_0] + pCtx->sRefPic.uiLongRefCount[LIST_0] <= 0)
+      && (pCtx->eSliceType != I_SLICE
+          && pCtx->eSliceType != SI_SLICE)) {
     if (pCtx->pParam->eEcActiveIdc !=
         ERROR_CON_DISABLE) { //IDR lost!, recover it for future decoding with data all set to 0
-      PPicture pRef = PrefetchPic (pCtx->pPicBuff);
+      PPicture pRef = pCtx->pThreadCtx != NULL ? PrefetchPicForThread (pCtx->pPicBuff) : PrefetchPic (pCtx->pPicBuff);
       if (pRef != NULL) {
         // IDR lost, set new
         pRef->bIsComplete = false; // Set complete flag to false for lost IDR ref picture
@@ -536,13 +563,20 @@
 
     for (i = WELS_MAX (1, WELS_MAX (iCount, pCtx->sRefPic.uiRefCount[listIdx])); i < iRefCount; i++)
       ppRefList[i] = ppRefList[i - 1];
-    pCtx->sRefPic.uiRefCount[listIdx] = (uint8_t)WELS_MIN (WELS_MAX (iCount, pCtx->sRefPic.uiRefCount[listIdx]), iRefCount);
+    pCtx->sRefPic.uiRefCount[listIdx] = (uint8_t)WELS_MIN (WELS_MAX (iCount, pCtx->sRefPic.uiRefCount[listIdx]),
+                                        iRefCount);
   }
   return ERR_NONE;
 }
 
-int32_t WelsMarkAsRef (PWelsDecoderContext pCtx) {
-  PRefPic pRefPic = &pCtx->sRefPic;
+int32_t WelsMarkAsRef (PWelsDecoderContext pCtx, PPicture pLastDec) {
+  PPicture pDec = pLastDec;
+  bool isThreadCtx = true;
+  if (pDec == NULL) {
+    pDec = pCtx->pDec;
+    isThreadCtx = false;
+  }
+  PRefPic pRefPic = isThreadCtx ? &pCtx->sTmpRefPic : &pCtx->sRefPic;
   PRefPicMarking pRefPicMarking = pCtx->pCurDqLayer->pRefPicMarking;
   PAccessUnit pCurAU = pCtx->pAccessUnitList;
   bool bIsIDRAU = false;
@@ -550,10 +584,10 @@
 
   int32_t iRet = ERR_NONE;
 
-  pCtx->pDec->uiQualityId = pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt.uiQualityId;
-  pCtx->pDec->uiTemporalId = pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt.uiTemporalId;
-  pCtx->pDec->iSpsId = pCtx->pSps->iSpsId;
-  pCtx->pDec->iPpsId = pCtx->pPps->iPpsId;
+  pDec->uiQualityId = pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt.uiQualityId;
+  pDec->uiTemporalId = pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt.uiTemporalId;
+  pDec->iSpsId = pCtx->pSps->iSpsId;
+  pDec->iPpsId = pCtx->pPps->iPpsId;
 
   for (j = pCurAU->uiStartPos; j <= pCurAU->uiEndPos; j++) {
     if (pCurAU->pNalUnitsList[j]->sNalHeaderExt.sNalUnitHeader.eNalUnitType == NAL_UNIT_CODED_SLICE_IDR
@@ -564,17 +598,17 @@
   }
   if (bIsIDRAU) {
     if (pRefPicMarking->bLongTermRefFlag) {
-      pCtx->sRefPic.iMaxLongTermFrameIdx = 0;
-      AddLongTermToList (pRefPic, pCtx->pDec, 0, 0);
+      pRefPic->iMaxLongTermFrameIdx = 0;
+      AddLongTermToList (pRefPic, pDec, 0, 0);
     } else {
-      pCtx->sRefPic.iMaxLongTermFrameIdx = -1;
+      pRefPic->iMaxLongTermFrameIdx = -1;
     }
   } else {
     if (pRefPicMarking->bAdaptiveRefPicMarkingModeFlag) {
-      iRet = MMCO (pCtx, pRefPicMarking);
+      iRet = MMCO (pCtx, pRefPic, pRefPicMarking);
       if (iRet != ERR_NONE) {
         if (pCtx->pParam->eEcActiveIdc != ERROR_CON_DISABLE) {
-          iRet = RemainOneBufferInDpbForEC (pCtx);
+          iRet = RemainOneBufferInDpbForEC (pCtx, pRefPic);
           WELS_VERIFY_RETURN_IF (iRet, iRet);
         } else {
           return iRet;
@@ -582,15 +616,15 @@
       }
 
       if (pCtx->pLastDecPicInfo->bLastHasMmco5) {
-        pCtx->pDec->iFrameNum = 0;
-        pCtx->pDec->iFramePoc = 0;
+        pDec->iFrameNum = 0;
+        pDec->iFramePoc = 0;
       }
 
     } else {
-      iRet = SlidingWindow (pCtx);
+      iRet = SlidingWindow (pCtx, pRefPic);
       if (iRet != ERR_NONE) {
         if (pCtx->pParam->eEcActiveIdc != ERROR_CON_DISABLE) {
-          iRet = RemainOneBufferInDpbForEC (pCtx);
+          iRet = RemainOneBufferInDpbForEC (pCtx, pRefPic);
           WELS_VERIFY_RETURN_IF (iRet, iRet);
         } else {
           return iRet;
@@ -599,22 +633,22 @@
     }
   }
 
-  if (!pCtx->pDec->bIsLongRef) {
+  if (!pDec->bIsLongRef) {
     if (pRefPic->uiLongRefCount[LIST_0] + pRefPic->uiShortRefCount[LIST_0] >= WELS_MAX (1, pCtx->pSps->iNumRefFrames)) {
       if (pCtx->pParam->eEcActiveIdc != ERROR_CON_DISABLE) {
-        iRet = RemainOneBufferInDpbForEC (pCtx);
+        iRet = RemainOneBufferInDpbForEC (pCtx, pRefPic);
         WELS_VERIFY_RETURN_IF (iRet, iRet);
       } else {
         return ERR_INFO_INVALID_MMCO_REF_NUM_OVERFLOW;
       }
     }
-    iRet = AddShortTermToList (pRefPic, pCtx->pDec);
+    iRet = AddShortTermToList (pRefPic, pDec);
   }
 
   return iRet;
 }
 
-static int32_t MMCO (PWelsDecoderContext pCtx, PRefPicMarking pRefPicMarking) {
+static int32_t MMCO (PWelsDecoderContext pCtx, PRefPic pRefPic, PRefPicMarking pRefPicMarking) {
   PSps pSps = pCtx->pCurDqLayer->sLayerInfo.pSps;
   int32_t i = 0;
   int32_t iRet = ERR_NONE;
@@ -628,7 +662,8 @@
     if (uiMmcoType > MMCO_LONG) {
       return ERR_INFO_INVALID_MMCO_OPCODE_BASE;
     }
-    iRet = MMCOProcess (pCtx, uiMmcoType, iShortFrameNum, uiLongTermPicNum, iLongTermFrameIdx, iMaxLongTermFrameIdx);
+    iRet = MMCOProcess (pCtx, pRefPic, uiMmcoType, iShortFrameNum, uiLongTermPicNum, iLongTermFrameIdx,
+                        iMaxLongTermFrameIdx);
     if (iRet != ERR_NONE) {
       return iRet;
     }
@@ -639,9 +674,8 @@
 
   return ERR_NONE;
 }
-static int32_t MMCOProcess (PWelsDecoderContext pCtx, uint32_t uiMmcoType,
+static int32_t MMCOProcess (PWelsDecoderContext pCtx, PRefPic pRefPic, uint32_t uiMmcoType,
                             int32_t iShortFrameNum, uint32_t uiLongTermPicNum, int32_t iLongTermFrameIdx, int32_t iMaxLongTermFrameIdx) {
-  PRefPic pRefPic = &pCtx->sRefPic;
   PPicture pPic = NULL;
   int32_t i = 0;
   int32_t iRet = ERR_NONE;
@@ -713,13 +747,12 @@
   return iRet;
 }
 
-static int32_t SlidingWindow (PWelsDecoderContext pCtx) {
-  PRefPic pRefPic = &pCtx->sRefPic;
+static int32_t SlidingWindow (PWelsDecoderContext pCtx, PRefPic pRefPic) {
   PPicture pPic = NULL;
   int32_t i = 0;
 
-  if (pCtx->sRefPic.uiShortRefCount[LIST_0] + pCtx->sRefPic.uiLongRefCount[LIST_0] >= pCtx->pSps->iNumRefFrames) {
-    if (pCtx->sRefPic.uiShortRefCount[LIST_0] == 0) {
+  if (pRefPic->uiShortRefCount[LIST_0] + pRefPic->uiLongRefCount[LIST_0] >= pCtx->pSps->iNumRefFrames) {
+    if (pRefPic->uiShortRefCount[LIST_0] == 0) {
       WelsLog (& (pCtx->sLogCtx), WELS_LOG_ERROR, "No reference picture in short term list when sliding window");
       return ERR_INFO_INVALID_MMCO_REF_NUM_NOT_ENOUGH;
     }
@@ -756,7 +789,6 @@
       break;
     }
   }
-
   return pPic;
 }
 
@@ -882,14 +914,13 @@
 }
 #endif
 
-static int32_t RemainOneBufferInDpbForEC (PWelsDecoderContext pCtx) {
+static int32_t RemainOneBufferInDpbForEC (PWelsDecoderContext pCtx, PRefPic pRefPic) {
   int32_t iRet = ERR_NONE;
-  PRefPic pRefPic = &pCtx->sRefPic;
   if (pRefPic->uiShortRefCount[0] + pRefPic->uiLongRefCount[0] < pCtx->pSps->iNumRefFrames)
     return iRet;
 
   if (pRefPic->uiShortRefCount[0] > 0) {
-    iRet = SlidingWindow (pCtx);
+    iRet = SlidingWindow (pCtx, pRefPic);
   } else { //all LTR, remove the smallest long_term_frame_idx
     int32_t iLongTermFrameIdx = 0;
     int32_t iMaxLongTermFrameIdx = pRefPic->iMaxLongTermFrameIdx;
--- a/codec/decoder/core/src/mv_pred.cpp
+++ b/codec/decoder/core/src/mv_pred.cpp
@@ -315,6 +315,14 @@
   mbType = GetMbType (pCurDqLayer)[iMbXy];
 
   PPicture colocPic = pCtx->sRefPic.pRefList[LIST_1][0];
+  // threaded decoding: wait until the co-located picture has signalled the ready event of the current MB row;
+  // skipped when colocPic is NULL so that the NULL check right below reports the error instead of crashing here
+  if (pCtx->pThreadCtx != NULL && colocPic != NULL && 16 * pCurDqLayer->iMbY > pCtx->lastReadyHeightOffset[1][0]) {
+    if (colocPic->pReadyEvent[pCurDqLayer->iMbY].isSignaled != 1) {
+      WAIT_EVENT (&colocPic->pReadyEvent[pCurDqLayer->iMbY], WELS_DEC_THREAD_WAIT_INFINITE);
+    }
+    pCtx->lastReadyHeightOffset[1][0] = 16 * pCurDqLayer->iMbY; // remember the height already waited for
+  }
 
   if (colocPic == NULL) {
     SLogContext* pLogCtx = & (pCtx->sLogCtx);
--- a/codec/decoder/core/src/parse_mb_syn_cabac.cpp
+++ b/codec/decoder/core/src/parse_mb_syn_cabac.cpp
@@ -35,6 +35,7 @@
 #include "mv_pred.h"
 #include "error_code.h"
 #include <stdio.h>
+
 namespace WelsDec {
 #define IDX_UNUSED -1
 
@@ -534,6 +535,8 @@
   pRefCount[0] = pSliceHeader->uiRefCount[0];
   pRefCount[1] = pSliceHeader->uiRefCount[1];
 
+  bool bIsPending = pCtx->pThreadCtx != NULL;
+
   switch (pCurDqLayer->pDec->pMbType[iMbXy]) {
   case MB_TYPE_16x16: {
     iPartIdx = 0;
@@ -549,7 +552,7 @@
       }
     }
     pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[iRef[0]]
-                            && ppRefPic[iRef[0]]->bIsComplete);
+                            && (ppRefPic[iRef[0]]->bIsComplete || bIsPending));
     PredMv (pMotionVector, pRefIndex, LIST_0, 0, 4, iRef[0], pMv);
     WELS_READ_VERIFY (ParseMvdInfoCabac (pCtx, pNeighAvail, pRefIndex, pMvdCache, iPartIdx, LIST_0, 0, pMvd[0]));
     WELS_READ_VERIFY (ParseMvdInfoCabac (pCtx, pNeighAvail, pRefIndex, pMvdCache, iPartIdx, LIST_0, 1, pMvd[1]));
@@ -575,7 +578,7 @@
         }
       }
       pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[iRef[i]]
-                              && ppRefPic[iRef[i]]->bIsComplete);
+                              && (ppRefPic[iRef[i]]->bIsComplete || bIsPending));
       UpdateP16x8RefIdxCabac (pCurDqLayer, pRefIndex, iPartIdx, iRef[i], LIST_0);
     }
     for (i = 0; i < 2; i++) {
@@ -605,7 +608,7 @@
         }
       }
       pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[iRef[i]]
-                              && ppRefPic[iRef[i]]->bIsComplete);
+                              && (ppRefPic[iRef[i]]->bIsComplete || bIsPending));
       UpdateP8x16RefIdxCabac (pCurDqLayer, pRefIndex, iPartIdx, iRef[i], LIST_0);
     }
     for (i = 0; i < 2; i++) {
@@ -653,7 +656,7 @@
         }
       }
       pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[pRefIdx[i]]
-                              && ppRefPic[pRefIdx[i]]->bIsComplete);
+                              && (ppRefPic[pRefIdx[i]]->bIsComplete || bIsPending));
       UpdateP8x8RefIdxCabac (pCurDqLayer, pRefIndex, iIdx8, pRefIdx[i], LIST_0);
     }
     //mv
@@ -738,6 +741,8 @@
 
   MbType mbType = pCurDqLayer->pDec->pMbType[iMbXy];
 
+  bool bIsPending = pCtx->pThreadCtx != NULL;
+
   if (IS_DIRECT (mbType)) {
 
     int16_t pMvDirect[LIST_A][2] = { { 0, 0 }, { 0, 0 } };
@@ -774,7 +779,7 @@
           }
         }
         pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (pCtx->sRefPic.pRefList[listIdx][iRef[listIdx]]
-                                && pCtx->sRefPic.pRefList[listIdx][iRef[listIdx]]->bIsComplete);
+                                && (pCtx->sRefPic.pRefList[listIdx][iRef[listIdx]]->bIsComplete || bIsPending));
       }
     }
     for (int32_t listIdx = LIST_0; listIdx < LIST_A; ++listIdx) {
@@ -811,7 +816,7 @@
             }
           }
           pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (pCtx->sRefPic.pRefList[listIdx][ref_idx]
-                                  && pCtx->sRefPic.pRefList[listIdx][ref_idx]->bIsComplete);
+                                  && (pCtx->sRefPic.pRefList[listIdx][ref_idx]->bIsComplete || bIsPending));
         }
         UpdateP16x8RefIdxCabac (pCurDqLayer, pRefIndex, iPartIdx, ref_idx, listIdx);
         ref_idx_list[listIdx][i] = ref_idx;
@@ -855,7 +860,7 @@
             }
           }
           pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (pCtx->sRefPic.pRefList[listIdx][ref_idx]
-                                  && pCtx->sRefPic.pRefList[listIdx][ref_idx]->bIsComplete);
+                                  && (pCtx->sRefPic.pRefList[listIdx][ref_idx]->bIsComplete || bIsPending));
         }
         UpdateP8x16RefIdxCabac (pCurDqLayer, pRefIndex, iPartIdx, ref_idx, listIdx);
         ref_idx_list[listIdx][i] = ref_idx;
@@ -989,7 +994,7 @@
               }
             }
             pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (pCtx->sRefPic.pRefList[listIdx][iref]
-                                    && pCtx->sRefPic.pRefList[listIdx][iref]->bIsComplete);
+                                    && (pCtx->sRefPic.pRefList[listIdx][iref]->bIsComplete || bIsPending));
           }
           Update8x8RefIdx (pCurDqLayer, iIdx8, listIdx, iref);
           ref_idx_list[listIdx][i] = iref;
--- a/codec/decoder/core/src/parse_mb_syn_cavlc.cpp
+++ b/codec/decoder/core/src/parse_mb_syn_cavlc.cpp
@@ -1083,6 +1083,8 @@
   iRefCount[0] = pSliceHeader->uiRefCount[0];
   iRefCount[1] = pSliceHeader->uiRefCount[1];
 
+  bool bIsPending = pCtx->pThreadCtx != NULL;
+
   switch (pCurDqLayer->pDec->pMbType[iMbXy]) {
   case MB_TYPE_16x16: {
     int32_t iRefIdx = 0;
@@ -1105,7 +1107,7 @@
         }
       }
       pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[iRefIdx]
-                              && ppRefPic[iRefIdx]->bIsComplete);
+                              && (ppRefPic[iRefIdx]->bIsComplete || bIsPending));
     } else {
       WelsLog (& (pCtx->sLogCtx), WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. ");
       return GENERATE_ERROR_NO (ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
@@ -1146,7 +1148,7 @@
         }
       }
       pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[iRefIdx[i]]
-                              && ppRefPic[iRefIdx[i]]->bIsComplete);
+                              && (ppRefPic[iRefIdx[i]]->bIsComplete || bIsPending));
     }
     for (i = 0; i < 2; i++) {
       PredInter16x8Mv (iMvArray, iRefIdxArray, LIST_0, i << 3, iRefIdx[i], iMv);
@@ -1183,7 +1185,7 @@
           }
         }
         pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[iRefIdx[i]]
-                                && ppRefPic[iRefIdx[i]]->bIsComplete);
+                                && (ppRefPic[iRefIdx[i]]->bIsComplete || bIsPending));
       } else {
         WelsLog (& (pCtx->sLogCtx), WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. ");
         return GENERATE_ERROR_NO (ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
@@ -1255,10 +1257,11 @@
             }
           }
           pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[iRefIdx[i]]
-                                  && ppRefPic[iRefIdx[i]]->bIsComplete);
+                                  && (ppRefPic[iRefIdx[i]]->bIsComplete || bIsPending));
 
           pCurDqLayer->pDec->pRefIndex[0][iMbXy][uiScan4Idx  ] = pCurDqLayer->pDec->pRefIndex[0][iMbXy][uiScan4Idx + 1] =
-                pCurDqLayer->pDec->pRefIndex[0][iMbXy][uiScan4Idx + 4] = pCurDqLayer->pDec->pRefIndex[0][iMbXy][uiScan4Idx + 5] = iRefIdx[i];
+                pCurDqLayer->pDec->pRefIndex[0][iMbXy][uiScan4Idx + 4] = pCurDqLayer->pDec->pRefIndex[0][iMbXy][uiScan4Idx + 5] =
+                      iRefIdx[i];
         } else {
           WelsLog (& (pCtx->sLogCtx), WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. ");
           return GENERATE_ERROR_NO (ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
@@ -1345,6 +1348,8 @@
   iRefCount[0] = pSliceHeader->uiRefCount[0];
   iRefCount[1] = pSliceHeader->uiRefCount[1];
 
+  bool bIsPending = pCtx->pThreadCtx != NULL; // true when a thread context is attached; the checks below then accept a reference picture even if its bIsComplete flag is not set
+
   MbType mbType = pCurDqLayer->pDec->pMbType[iMbXy];
   if (IS_DIRECT (mbType)) {
 
@@ -1390,7 +1395,7 @@
             }
           }
           pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[listIdx][ref_idx_list[listIdx][0]]
-                                  && ppRefPic[listIdx][ref_idx_list[listIdx][0]]->bIsComplete);
+                                  && (ppRefPic[listIdx][ref_idx_list[listIdx][0]]->bIsComplete || bIsPending));
         } else {
           WelsLog (& (pCtx->sLogCtx), WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. ");
           return GENERATE_ERROR_NO (ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
@@ -1440,7 +1445,7 @@
             }
             ref_idx_list[listIdx][i] = iRefIdx;
             pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[listIdx][iRefIdx]
-                                    && ppRefPic[listIdx][iRefIdx]->bIsComplete);
+                                    && (ppRefPic[listIdx][iRefIdx]->bIsComplete || bIsPending));
           } else {
             WelsLog (& (pCtx->sLogCtx), WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. ");
             return GENERATE_ERROR_NO (ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
@@ -1499,7 +1504,7 @@
             }
             ref_idx_list[listIdx][i] = iRefIdx;
             pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[listIdx][iRefIdx]
-                                    && ppRefPic[listIdx][iRefIdx]->bIsComplete);
+                                    && (ppRefPic[listIdx][iRefIdx]->bIsComplete || bIsPending));
           } else {
             WelsLog (& (pCtx->sLogCtx), WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. ");
             return GENERATE_ERROR_NO (ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
@@ -1644,7 +1649,7 @@
                 }
               }
               pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[listIdx][iref]
-                                      && ppRefPic[listIdx][iref]->bIsComplete);
+                                      && (ppRefPic[listIdx][iref]->bIsComplete || bIsPending));
             } else {
               WelsLog (& (pCtx->sLogCtx), WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. ");
               return GENERATE_ERROR_NO (ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
--- a/codec/decoder/core/src/pic_queue.cpp
+++ b/codec/decoder/core/src/pic_queue.cpp
@@ -121,6 +121,15 @@
                               int8_t) * MB_BLOCK4x4_NUM, "pCtx->sMb.pRefIndex[]");
   pPic->pRefIndex[LIST_1] = (int8_t (*)[16])pMa->WelsMallocz (uiMbCount * sizeof (
                               int8_t) * MB_BLOCK4x4_NUM, "pCtx->sMb.pRefIndex[]");
+  if (pCtx->pThreadCtx != NULL) { // ready events are only created when a thread context is attached
+    pPic->pReadyEvent = (SWelsDecEvent*)pMa->WelsMallocz (uiMbHeight * sizeof (SWelsDecEvent), "pPic->pReadyEvent"); // array of uiMbHeight events
+    for (uint32_t i = 0; i < uiMbHeight; ++i) { // NOTE(review): the allocation result is used without a NULL check — confirm it cannot fail here or add handling
+      CREATE_EVENT (&pPic->pReadyEvent[i], 1, 0, NULL); // NOTE(review): presumably manual-reset and initially non-signaled — confirm against CREATE_EVENT's definition
+    }
+  } else {
+    pPic->pReadyEvent = NULL; // the teardown code in this file skips event cleanup when this is NULL
+  }
+
   return pPic;
 }
 
@@ -147,6 +156,14 @@
         pPic->pRefIndex[listIdx] = NULL;
       }
     }
+    if (pPic->pReadyEvent != NULL) { // NULL means no events were created for this picture
+      uint32_t uiMbHeight = (pPic->iHeightInPixel + 15) >> 4; // picture height in 16-pixel rows, rounded up; NOTE(review): must equal the count used when the events were created — confirm
+      for (uint32_t i = 0; i < uiMbHeight; ++i) {
+        CLOSE_EVENT (&pPic->pReadyEvent[i]); // close every event before the array holding them is freed
+      }
+      pMa->WelsFree (pPic->pReadyEvent, "pPic->pReadyEvent");
+      pPic->pReadyEvent = NULL; // cleared after the free so no stale pointer is left on the picture
+    }
     pMa->WelsFree (pPic, "pPic");
     pPic = NULL;
   }
@@ -182,6 +199,20 @@
   pPicBuf->iCurrentIdx = iPicIdx;
   if (pPic != NULL) {
     pPic->iPicBuffIdx = iPicIdx;
+  }
+  return pPic;
+}
+
+PPicture PrefetchPicForThread (PPicBuff pPicBuf) { // returns the picture at iCurrentIdx and advances iCurrentIdx with wrap-around; returns NULL when iCapacity is 0
+  PPicture pPic = NULL;
+
+  if (pPicBuf->iCapacity == 0) {
+    return NULL;
+  }
+  pPic = pPicBuf->ppPic[pPicBuf->iCurrentIdx]; // NOTE(review): slot is taken unconditionally and dereferenced on the next line — confirm callers guarantee it is non-NULL and free to reuse
+  pPic->iPicBuffIdx = pPicBuf->iCurrentIdx; // record the slot index on the picture itself
+  if (++pPicBuf->iCurrentIdx >= pPicBuf->iCapacity) { // wrap back to slot 0 after the last slot
+    pPicBuf->iCurrentIdx = 0;
   }
   return pPic;
 }
--- a/codec/decoder/core/src/rec_mb.cpp
+++ b/codec/decoder/core/src/rec_mb.cpp
@@ -214,11 +214,10 @@
 
 
 //according to current 8*8 block ref_index to gain reference picture
-static inline int32_t GetRefPic (sMCRefMember* pMCRefMem, PWelsDecoderContext pCtx, int8_t* pRefIdxList,
-                                 int32_t iIndex, int32_t listIdx) {
+static inline int32_t GetRefPic (sMCRefMember* pMCRefMem, PWelsDecoderContext pCtx, const int8_t& iRefIdx,
+                                 int32_t listIdx) {
   PPicture pRefPic;
 
-  int8_t iRefIdx = pRefIdxList[iIndex];
   if (iRefIdx >= 0) {
     pRefPic = pCtx->sRefPic.pRefList[listIdx][iRefIdx];
 
@@ -242,7 +241,9 @@
 #ifndef MC_FLOW_SIMPLE_JUDGE
 #define MC_FLOW_SIMPLE_JUDGE 1
 #endif //MC_FLOW_SIMPLE_JUDGE
-void BaseMC (sMCRefMember* pMCRefMem, int32_t iXOffset, int32_t iYOffset, SMcFunc* pMCFunc,
+void BaseMC (PWelsDecoderContext pCtx, sMCRefMember* pMCRefMem, const int32_t& listIdx, const int8_t& iRefIdx,
+             int32_t iXOffset, int32_t iYOffset,
+             SMcFunc* pMCFunc,
              int32_t iBlkWidth, int32_t iBlkHeight, int16_t iMVs[2]) {
   int32_t iFullMVx = (iXOffset << 2) + iMVs[0]; //quarter pixel
   int32_t iFullMVy = (iYOffset << 2) + iMVs[1];
@@ -251,6 +252,27 @@
   iFullMVy = WELS_CLIP3 (iFullMVy, ((-PADDING_LENGTH + 2) * (1 << 2)),
                          ((pMCRefMem->iPicHeight + PADDING_LENGTH - 19) * (1 << 2)));
 
+  if (pCtx->pThreadCtx != NULL && iRefIdx >= 0) { // synchronize only when a thread context is attached and the reference index is non-negative
+    // Wait until the reference rows this block reads are signaled ready; a margin of 3 + 16 lines is added to the offset below.
+    PPicture pRefPic = pCtx->sRefPic.pRefList[listIdx][iRefIdx];
+    if (pCtx->bNewSeqBegin && (pCtx->iErrorCode & dsRefLost)) {
+      // The reference picture is lost: signal all of its events so the wait below cannot block forever.
+      if (!pRefPic->pReadyEvent[0].isSignaled) { // event 0 is checked as the indicator that the events have not been signaled yet
+        for (uint32_t ln = 0; ln < pCtx->sMb.iMbHeight; ++ln) {
+          SET_EVENT (&pRefPic->pReadyEvent[ln]);
+        }
+      }
+    }
+    int32_t offset = (iFullMVy >> 2) + iBlkHeight + 3 + 16; // integer-pel row (quarter-pel iFullMVy / 4) + block height + margin; presumably 3 for interpolation taps and 16 for one MB row — confirm
+    if (offset > pCtx->lastReadyHeightOffset[listIdx][iRefIdx]) { // skip the wait when an offset at least this large was already handled for this reference
+      const int32_t down_line = WELS_MIN (offset >> 4, int32_t (pCtx->sMb.iMbHeight) - 1); // event index = offset >> 4, capped at iMbHeight - 1; NOTE(review): no lower clamp — confirm offset cannot be negative here
+      if (pRefPic->pReadyEvent[down_line].isSignaled != 1) {
+        WAIT_EVENT (&pRefPic->pReadyEvent[down_line], WELS_DEC_THREAD_WAIT_INFINITE);
+      }
+      pCtx->lastReadyHeightOffset[listIdx][iRefIdx] = offset; // remember the largest offset already handled for this list/index
+    }
+  }
+
   int32_t iSrcPixOffsetLuma = (iFullMVx >> 2) + (iFullMVy >> 2) * pMCRefMem->iSrcLineLuma;
   int32_t iSrcPixOffsetChroma = (iFullMVx >> 3) + (iFullMVy >> 3) * pMCRefMem->iSrcLineChroma;
 
@@ -466,7 +488,7 @@
   pMCRefMem.iDstLineLuma   = iDstLineLuma;
   pMCRefMem.iDstLineChroma = iDstLineChroma;
 
-  int32_t iRefIndex = 0;
+  int8_t iRefIndex = 0;
 
   switch (iMBType) {
   case MB_TYPE_SKIP:
@@ -473,8 +495,9 @@
   case MB_TYPE_16x16:
     iMVs[0] = pCurDqLayer->pDec->pMv[0][iMBXY][0][0];
     iMVs[1] = pCurDqLayer->pDec->pMv[0][iMBXY][0][1];
-    WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, pCurDqLayer->pDec->pRefIndex[0][iMBXY], 0, LIST_0));
-    BaseMC (&pMCRefMem, iMBOffsetX, iMBOffsetY, pMCFunc, 16, 16, iMVs);
+    iRefIndex = pCurDqLayer->pDec->pRefIndex[0][iMBXY][0];
+    WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, iRefIndex, LIST_0));
+    BaseMC (pCtx, &pMCRefMem, LIST_0, iRefIndex, iMBOffsetX, iMBOffsetY, pMCFunc, 16, 16, iMVs);
 
     if (pCurDqLayer->bUseWeightPredictionFlag) {
       iRefIndex = pCurDqLayer->pDec->pRefIndex[0][iMBXY][0];
@@ -484,24 +507,24 @@
   case MB_TYPE_16x8:
     iMVs[0] = pCurDqLayer->pDec->pMv[0][iMBXY][0][0];
     iMVs[1] = pCurDqLayer->pDec->pMv[0][iMBXY][0][1];
-    WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, pCurDqLayer->pDec->pRefIndex[0][iMBXY], 0, LIST_0));
-    BaseMC (&pMCRefMem, iMBOffsetX, iMBOffsetY, pMCFunc, 16, 8, iMVs);
+    iRefIndex = pCurDqLayer->pDec->pRefIndex[0][iMBXY][0];
+    WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, iRefIndex, LIST_0));
+    BaseMC (pCtx, &pMCRefMem, LIST_0, iRefIndex, iMBOffsetX, iMBOffsetY, pMCFunc, 16, 8, iMVs);
 
     if (pCurDqLayer->bUseWeightPredictionFlag) {
-      iRefIndex = pCurDqLayer->pDec->pRefIndex[0][iMBXY][0];
       WeightPrediction (pCurDqLayer, &pMCRefMem, LIST_0, iRefIndex, 16, 8);
     }
 
     iMVs[0] = pCurDqLayer->pDec->pMv[0][iMBXY][8][0];
     iMVs[1] = pCurDqLayer->pDec->pMv[0][iMBXY][8][1];
-    WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, pCurDqLayer->pDec->pRefIndex[0][iMBXY], 8, LIST_0));
+    iRefIndex = pCurDqLayer->pDec->pRefIndex[0][iMBXY][8];
+    WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, iRefIndex, LIST_0));
     pMCRefMem.pDstY = pPredY  + (iDstLineLuma << 3);
     pMCRefMem.pDstU = pPredCb + (iDstLineChroma << 2);
     pMCRefMem.pDstV = pPredCr + (iDstLineChroma << 2);
-    BaseMC (&pMCRefMem, iMBOffsetX, iMBOffsetY + 8, pMCFunc, 16, 8, iMVs);
+    BaseMC (pCtx, &pMCRefMem, LIST_0, iRefIndex, iMBOffsetX, iMBOffsetY + 8, pMCFunc, 16, 8, iMVs);
 
     if (pCurDqLayer->bUseWeightPredictionFlag) {
-      iRefIndex = pCurDqLayer->pDec->pRefIndex[0][iMBXY][8];
       WeightPrediction (pCurDqLayer, &pMCRefMem, LIST_0, iRefIndex, 16, 8);
     }
     break;
@@ -508,23 +531,23 @@
   case MB_TYPE_8x16:
     iMVs[0] = pCurDqLayer->pDec->pMv[0][iMBXY][0][0];
     iMVs[1] = pCurDqLayer->pDec->pMv[0][iMBXY][0][1];
-    WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, pCurDqLayer->pDec->pRefIndex[0][iMBXY], 0, LIST_0));
-    BaseMC (&pMCRefMem, iMBOffsetX, iMBOffsetY, pMCFunc, 8, 16, iMVs);
+    iRefIndex = pCurDqLayer->pDec->pRefIndex[0][iMBXY][0];
+    WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, iRefIndex, LIST_0));
+    BaseMC (pCtx, &pMCRefMem, LIST_0, iRefIndex, iMBOffsetX, iMBOffsetY, pMCFunc, 8, 16, iMVs);
     if (pCurDqLayer->bUseWeightPredictionFlag) {
-      iRefIndex = pCurDqLayer->pDec->pRefIndex[0][iMBXY][0];
       WeightPrediction (pCurDqLayer, &pMCRefMem, LIST_0, iRefIndex, 8, 16);
     }
 
     iMVs[0] = pCurDqLayer->pDec->pMv[0][iMBXY][2][0];
     iMVs[1] = pCurDqLayer->pDec->pMv[0][iMBXY][2][1];
-    WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, pCurDqLayer->pDec->pRefIndex[0][iMBXY], 2, LIST_0));
+    iRefIndex = pCurDqLayer->pDec->pRefIndex[0][iMBXY][2];
+    WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, iRefIndex, LIST_0));
     pMCRefMem.pDstY = pPredY + 8;
     pMCRefMem.pDstU = pPredCb + 4;
     pMCRefMem.pDstV = pPredCr + 4;
-    BaseMC (&pMCRefMem, iMBOffsetX + 8, iMBOffsetY, pMCFunc, 8, 16, iMVs);
+    BaseMC (pCtx, &pMCRefMem, LIST_0, iRefIndex, iMBOffsetX + 8, iMBOffsetY, pMCFunc, 8, 16, iMVs);
 
     if (pCurDqLayer->bUseWeightPredictionFlag) {
-      iRefIndex = pCurDqLayer->pDec->pRefIndex[0][iMBXY][2];
       WeightPrediction (pCurDqLayer, &pMCRefMem, LIST_0, iRefIndex, 8, 16);
     }
     break;
@@ -541,9 +564,8 @@
       iYOffset = iMBOffsetY + iBlk8Y;
 
       iIIdx = ((i >> 1) << 3) + ((i & 1) << 1);
-      WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, pCurDqLayer->pDec->pRefIndex[0][iMBXY], iIIdx, LIST_0));
-      iRefIndex = pCurDqLayer->bUseWeightPredictionFlag ? pCurDqLayer->pDec->pRefIndex[0][iMBXY][iIIdx] : 0;
-
+      iRefIndex = pCurDqLayer->pDec->pRefIndex[0][iMBXY][iIIdx];
+      WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, iRefIndex, LIST_0));
       pDstY = pPredY + iBlk8X + iBlk8Y * iDstLineLuma;
       pDstU = pPredCb + (iBlk8X >> 1) + (iBlk8Y >> 1) * iDstLineChroma;
       pDstV = pPredCr + (iBlk8X >> 1) + (iBlk8Y >> 1) * iDstLineChroma;
@@ -554,7 +576,7 @@
       case SUB_MB_TYPE_8x8:
         iMVs[0] = pCurDqLayer->pDec->pMv[0][iMBXY][iIIdx][0];
         iMVs[1] = pCurDqLayer->pDec->pMv[0][iMBXY][iIIdx][1];
-        BaseMC (&pMCRefMem, iXOffset, iYOffset, pMCFunc, 8, 8, iMVs);
+        BaseMC (pCtx, &pMCRefMem, LIST_0, iRefIndex, iXOffset, iYOffset, pMCFunc, 8, 8, iMVs);
         if (pCurDqLayer->bUseWeightPredictionFlag) {
 
           WeightPrediction (pCurDqLayer, &pMCRefMem, LIST_0, iRefIndex, 8, 8);
@@ -564,7 +586,7 @@
       case SUB_MB_TYPE_8x4:
         iMVs[0] = pCurDqLayer->pDec->pMv[0][iMBXY][iIIdx][0];
         iMVs[1] = pCurDqLayer->pDec->pMv[0][iMBXY][iIIdx][1];
-        BaseMC (&pMCRefMem, iXOffset, iYOffset, pMCFunc, 8, 4, iMVs);
+        BaseMC (pCtx, &pMCRefMem, LIST_0, iRefIndex, iXOffset, iYOffset, pMCFunc, 8, 4, iMVs);
         if (pCurDqLayer->bUseWeightPredictionFlag) {
 
           WeightPrediction (pCurDqLayer, &pMCRefMem, LIST_0, iRefIndex, 8, 4);
@@ -576,7 +598,7 @@
         pMCRefMem.pDstY += (iDstLineLuma << 2);
         pMCRefMem.pDstU += (iDstLineChroma << 1);
         pMCRefMem.pDstV += (iDstLineChroma << 1);
-        BaseMC (&pMCRefMem, iXOffset, iYOffset + 4, pMCFunc, 8, 4, iMVs);
+        BaseMC (pCtx, &pMCRefMem, LIST_0, iRefIndex, iXOffset, iYOffset + 4, pMCFunc, 8, 4, iMVs);
         if (pCurDqLayer->bUseWeightPredictionFlag) {
 
           WeightPrediction (pCurDqLayer, &pMCRefMem, LIST_0, iRefIndex, 8, 4);
@@ -586,7 +608,7 @@
       case SUB_MB_TYPE_4x8:
         iMVs[0] = pCurDqLayer->pDec->pMv[0][iMBXY][iIIdx][0];
         iMVs[1] = pCurDqLayer->pDec->pMv[0][iMBXY][iIIdx][1];
-        BaseMC (&pMCRefMem, iXOffset, iYOffset, pMCFunc, 4, 8, iMVs);
+        BaseMC (pCtx, &pMCRefMem, LIST_0, iRefIndex, iXOffset, iYOffset, pMCFunc, 4, 8, iMVs);
         if (pCurDqLayer->bUseWeightPredictionFlag) {
 
           WeightPrediction (pCurDqLayer, &pMCRefMem, LIST_0, iRefIndex, 4, 8);
@@ -598,7 +620,7 @@
         pMCRefMem.pDstY += 4;
         pMCRefMem.pDstU += 2;
         pMCRefMem.pDstV += 2;
-        BaseMC (&pMCRefMem, iXOffset + 4, iYOffset, pMCFunc, 4, 8, iMVs);
+        BaseMC (pCtx, &pMCRefMem, LIST_0, iRefIndex, iXOffset + 4, iYOffset, pMCFunc, 4, 8, iMVs);
         if (pCurDqLayer->bUseWeightPredictionFlag) {
 
           WeightPrediction (pCurDqLayer, &pMCRefMem, LIST_0, iRefIndex, 4, 8);
@@ -620,7 +642,7 @@
 
           iMVs[0] = pCurDqLayer->pDec->pMv[0][iMBXY][iIIdx + iJIdx][0];
           iMVs[1] = pCurDqLayer->pDec->pMv[0][iMBXY][iIIdx + iJIdx][1];
-          BaseMC (&pMCRefMem, iXOffset + iBlk4X, iYOffset + iBlk4Y, pMCFunc, 4, 4, iMVs);
+          BaseMC (pCtx, &pMCRefMem, LIST_0, iRefIndex, iXOffset + iBlk4X, iYOffset + iBlk4Y, pMCFunc, 4, 4, iMVs);
           if (pCurDqLayer->bUseWeightPredictionFlag) {
 
             WeightPrediction (pCurDqLayer, &pMCRefMem, LIST_0, iRefIndex, 4, 4);
@@ -677,8 +699,9 @@
   pTempMCRefMem.pDstV = pTempPredYCbCr[2];
 
 
-  int32_t iRefIndex1 = 0;
-  int32_t iRefIndex2 = 0;
+  int8_t iRefIndex0 = 0;
+  int8_t iRefIndex1 = 0;
+  int8_t iRefIndex = 0;
 
   bool bWeightedBipredIdcIs1 = pCurDqLayer->sLayerInfo.pPps->uiWeightedBipredIdc == 1;
 
@@ -686,17 +709,17 @@
     if (IS_TYPE_L0 (iMBType) && IS_TYPE_L1 (iMBType)) {
       iMVs[0] = pCurDqLayer->pDec->pMv[LIST_0][iMBXY][0][0];
       iMVs[1] = pCurDqLayer->pDec->pMv[LIST_0][iMBXY][0][1];
-      WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, pCurDqLayer->pDec->pRefIndex[LIST_0][iMBXY], 0, LIST_0));
-      BaseMC (&pMCRefMem, iMBOffsetX, iMBOffsetY, pMCFunc, 16, 16, iMVs);
+      iRefIndex0 = pCurDqLayer->pDec->pRefIndex[LIST_0][iMBXY][0];
+      WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, iRefIndex0, LIST_0));
+      BaseMC (pCtx, &pMCRefMem, LIST_0, iRefIndex0, iMBOffsetX, iMBOffsetY, pMCFunc, 16, 16, iMVs);
 
       iMVs[0] = pCurDqLayer->pDec->pMv[LIST_1][iMBXY][0][0];
       iMVs[1] = pCurDqLayer->pDec->pMv[LIST_1][iMBXY][0][1];
-      WELS_B_MB_REC_VERIFY (GetRefPic (&pTempMCRefMem, pCtx, pCurDqLayer->pDec->pRefIndex[LIST_1][iMBXY], 0, LIST_1));
-      BaseMC (&pTempMCRefMem, iMBOffsetX, iMBOffsetY, pMCFunc, 16, 16, iMVs);
-      iRefIndex1 = pCurDqLayer->pDec->pRefIndex[LIST_0][iMBXY][0];
-      iRefIndex2 = pCurDqLayer->pDec->pRefIndex[LIST_1][iMBXY][0];
+      iRefIndex1 = pCurDqLayer->pDec->pRefIndex[LIST_1][iMBXY][0];
+      WELS_B_MB_REC_VERIFY (GetRefPic (&pTempMCRefMem, pCtx, iRefIndex1, LIST_1));
+      BaseMC (pCtx, &pTempMCRefMem, LIST_1, iRefIndex1, iMBOffsetX, iMBOffsetY, pMCFunc, 16, 16, iMVs);
       if (pCurDqLayer->bUseWeightedBiPredIdc) {
-        BiWeightPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, iRefIndex1, iRefIndex2, bWeightedBipredIdcIs1, 16, 16);
+        BiWeightPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, iRefIndex0, iRefIndex1, bWeightedBipredIdcIs1, 16, 16);
       } else {
         BiPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem,  16, 16);
       }
@@ -704,10 +727,10 @@
       int32_t listIdx = (iMBType & MB_TYPE_P0L0) ? LIST_0 : LIST_1;
       iMVs[0] = pCurDqLayer->pDec->pMv[listIdx][iMBXY][0][0];
       iMVs[1] = pCurDqLayer->pDec->pMv[listIdx][iMBXY][0][1];
-      WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, pCurDqLayer->pDec->pRefIndex[listIdx][iMBXY], 0, listIdx));
-      BaseMC (&pMCRefMem, iMBOffsetX, iMBOffsetY, pMCFunc, 16, 16, iMVs);
+      iRefIndex = pCurDqLayer->pDec->pRefIndex[listIdx][iMBXY][0];
+      WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, iRefIndex, listIdx));
+      BaseMC (pCtx, &pMCRefMem, listIdx, iRefIndex, iMBOffsetX, iMBOffsetY, pMCFunc, 16, 16, iMVs);
       if (bWeightedBipredIdcIs1) {
-        int32_t iRefIndex = pCurDqLayer->pDec->pRefIndex[listIdx][iMBXY][0];
         WeightPrediction (pCurDqLayer, &pMCRefMem, listIdx, iRefIndex, 16, 16);
       }
     }
@@ -721,27 +744,29 @@
           lastListIdx = listIdx;
           iMVs[0] = pCurDqLayer->pDec->pMv[listIdx][iMBXY][iPartIdx][0];
           iMVs[1] = pCurDqLayer->pDec->pMv[listIdx][iMBXY][iPartIdx][1];
-          WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, pCurDqLayer->pDec->pRefIndex[listIdx][iMBXY], iPartIdx, listIdx));
+          iRefIndex = pCurDqLayer->pDec->pRefIndex[listIdx][iMBXY][iPartIdx];
+          WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, iRefIndex, listIdx));
           if (i) {
             pMCRefMem.pDstY += (iDstLineLuma << 3);
             pMCRefMem.pDstU += (iDstLineChroma << 2);
             pMCRefMem.pDstV += (iDstLineChroma << 2);
           }
-          BaseMC (&pMCRefMem, iMBOffsetX, iMBOffsetY + iPartIdx, pMCFunc, 16, 8, iMVs);
+          BaseMC (pCtx, &pMCRefMem, listIdx, iRefIndex, iMBOffsetX, iMBOffsetY + iPartIdx, pMCFunc, 16, 8, iMVs);
           if (++listCount == 2) {
             iMVs[0] = pCurDqLayer->pDec->pMv[LIST_1][iMBXY][iPartIdx][0];
             iMVs[1] = pCurDqLayer->pDec->pMv[LIST_1][iMBXY][iPartIdx][1];
-            WELS_B_MB_REC_VERIFY (GetRefPic (&pTempMCRefMem, pCtx, pCurDqLayer->pDec->pRefIndex[LIST_1][iMBXY], iPartIdx, LIST_1));
+            iRefIndex1 = pCurDqLayer->pDec->pRefIndex[LIST_1][iMBXY][iPartIdx];
+            WELS_B_MB_REC_VERIFY (GetRefPic (&pTempMCRefMem, pCtx, iRefIndex1, LIST_1));
             if (i) {
               pTempMCRefMem.pDstY += (iDstLineLuma << 3);
               pTempMCRefMem.pDstU += (iDstLineChroma << 2);
               pTempMCRefMem.pDstV += (iDstLineChroma << 2);
             }
-            BaseMC (&pTempMCRefMem, iMBOffsetX, iMBOffsetY + iPartIdx, pMCFunc, 16, 8, iMVs);
+            BaseMC (pCtx, &pTempMCRefMem, LIST_1, iRefIndex1, iMBOffsetX, iMBOffsetY + iPartIdx, pMCFunc, 16, 8, iMVs);
             if (pCurDqLayer->bUseWeightedBiPredIdc) {
-              iRefIndex1 = pCurDqLayer->pDec->pRefIndex[LIST_0][iMBXY][iPartIdx];
-              iRefIndex2 = pCurDqLayer->pDec->pRefIndex[LIST_1][iMBXY][iPartIdx];
-              BiWeightPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, iRefIndex1, iRefIndex2, bWeightedBipredIdcIs1, 16, 8);
+              iRefIndex0 = pCurDqLayer->pDec->pRefIndex[LIST_0][iMBXY][iPartIdx];
+              iRefIndex1 = pCurDqLayer->pDec->pRefIndex[LIST_1][iMBXY][iPartIdx];
+              BiWeightPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, iRefIndex0, iRefIndex1, bWeightedBipredIdcIs1, 16, 8);
             } else {
               BiPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, 16, 8);
             }
@@ -750,7 +775,7 @@
       }
       if (listCount == 1) {
         if (bWeightedBipredIdcIs1) {
-          int32_t iRefIndex = pCurDqLayer->pDec->pRefIndex[lastListIdx][iMBXY][iPartIdx];
+          iRefIndex = pCurDqLayer->pDec->pRefIndex[lastListIdx][iMBXY][iPartIdx];
           WeightPrediction (pCurDqLayer, &pMCRefMem, lastListIdx, iRefIndex, 16, 8);
         }
       }
@@ -764,27 +789,29 @@
           lastListIdx = listIdx;
           iMVs[0] = pCurDqLayer->pDec->pMv[listIdx][iMBXY][i << 1][0];
           iMVs[1] = pCurDqLayer->pDec->pMv[listIdx][iMBXY][i << 1][1];
-          WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, pCurDqLayer->pDec->pRefIndex[listIdx][iMBXY], i << 1, listIdx));
+          iRefIndex = pCurDqLayer->pDec->pRefIndex[listIdx][iMBXY][i << 1];
+          WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, iRefIndex, listIdx));
           if (i) {
             pMCRefMem.pDstY += 8;
             pMCRefMem.pDstU += 4;
             pMCRefMem.pDstV += 4;
           }
-          BaseMC (&pMCRefMem, iMBOffsetX + (i ? 8 : 0), iMBOffsetY, pMCFunc, 8, 16, iMVs);
+          BaseMC (pCtx, &pMCRefMem, listIdx, iRefIndex, iMBOffsetX + (i ? 8 : 0), iMBOffsetY, pMCFunc, 8, 16, iMVs);
           if (++listCount == 2) {
             iMVs[0] = pCurDqLayer->pDec->pMv[LIST_1][iMBXY][i << 1][0];
             iMVs[1] = pCurDqLayer->pDec->pMv[LIST_1][iMBXY][i << 1][1];
-            WELS_B_MB_REC_VERIFY (GetRefPic (&pTempMCRefMem, pCtx, pCurDqLayer->pDec->pRefIndex[LIST_1][iMBXY], i << 1, LIST_1));
+            iRefIndex1 = pCurDqLayer->pDec->pRefIndex[LIST_1][iMBXY][i << 1];
+            WELS_B_MB_REC_VERIFY (GetRefPic (&pTempMCRefMem, pCtx, iRefIndex1, LIST_1));
             if (i) {
               pTempMCRefMem.pDstY += 8;
               pTempMCRefMem.pDstU += 4;
               pTempMCRefMem.pDstV += 4;
             }
-            BaseMC (&pTempMCRefMem, iMBOffsetX + (i ? 8 : 0), iMBOffsetY, pMCFunc, 8, 16, iMVs);
+            BaseMC (pCtx, &pTempMCRefMem, LIST_1, iRefIndex1, iMBOffsetX + (i ? 8 : 0), iMBOffsetY, pMCFunc, 8, 16, iMVs);
             if (pCurDqLayer->bUseWeightedBiPredIdc) {
-              iRefIndex1 = pCurDqLayer->pDec->pRefIndex[LIST_0][iMBXY][i << 1];
-              iRefIndex2 = pCurDqLayer->pDec->pRefIndex[LIST_1][iMBXY][i << 1];
-              BiWeightPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, iRefIndex1, iRefIndex2, bWeightedBipredIdcIs1, 8, 16);
+              iRefIndex0 = pCurDqLayer->pDec->pRefIndex[LIST_0][iMBXY][i << 1];
+              iRefIndex1 = pCurDqLayer->pDec->pRefIndex[LIST_1][iMBXY][i << 1];
+              BiWeightPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, iRefIndex0, iRefIndex1, bWeightedBipredIdcIs1, 8, 16);
             } else {
               BiPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, 8, 16);
             }
@@ -793,7 +820,7 @@
       }
       if (listCount == 1) {
         if (bWeightedBipredIdcIs1) {
-          int32_t iRefIndex = pCurDqLayer->pDec->pRefIndex[lastListIdx][iMBXY][i << 1];
+          iRefIndex = pCurDqLayer->pDec->pRefIndex[lastListIdx][iMBXY][i << 1];
           WeightPrediction (pCurDqLayer, &pMCRefMem, lastListIdx, iRefIndex, 8, 16);
         }
       }
@@ -830,15 +857,15 @@
       pTempMCRefMem.pDstV = pDstV2;
 
       if ((IS_TYPE_L0 (iSubMBType) && IS_TYPE_L1 (iSubMBType))) {
-        iRefIndex1 = pCurDqLayer->pDec->pRefIndex[LIST_0][iMBXY][iIIdx];
-        WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, pCurDqLayer->pDec->pRefIndex[LIST_0][iMBXY], iIIdx, LIST_0));
+        iRefIndex0 = pCurDqLayer->pDec->pRefIndex[LIST_0][iMBXY][iIIdx];
+        WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, iRefIndex0, LIST_0));
 
-        iRefIndex2 = pCurDqLayer->pDec->pRefIndex[LIST_1][iMBXY][iIIdx];
-        WELS_B_MB_REC_VERIFY (GetRefPic (&pTempMCRefMem, pCtx, pCurDqLayer->pDec->pRefIndex[LIST_1][iMBXY], iIIdx, LIST_1));
+        iRefIndex1 = pCurDqLayer->pDec->pRefIndex[LIST_1][iMBXY][iIIdx];
+        WELS_B_MB_REC_VERIFY (GetRefPic (&pTempMCRefMem, pCtx, iRefIndex1, LIST_1));
       } else {
         int32_t listIdx = IS_TYPE_L0 (iSubMBType) ? LIST_0 : LIST_1;
-        iRefIndex1 = pCurDqLayer->pDec->pRefIndex[listIdx][iMBXY][iIIdx];
-        WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, pCurDqLayer->pDec->pRefIndex[listIdx][iMBXY], iIIdx, listIdx));
+        iRefIndex = pCurDqLayer->pDec->pRefIndex[listIdx][iMBXY][iIIdx];
+        WELS_B_MB_REC_VERIFY (GetRefPic (&pMCRefMem, pCtx, iRefIndex, listIdx));
       }
 
       if (IS_SUB_8x8 (iSubMBType)) {
@@ -845,14 +872,14 @@
         if (IS_TYPE_L0 (iSubMBType) && IS_TYPE_L1 (iSubMBType)) {
           iMVs[0] = pCurDqLayer->pDec->pMv[LIST_0][iMBXY][iIIdx][0];
           iMVs[1] = pCurDqLayer->pDec->pMv[LIST_0][iMBXY][iIIdx][1];
-          BaseMC (&pMCRefMem, iXOffset, iYOffset, pMCFunc, 8, 8, iMVs);
+          BaseMC (pCtx, &pMCRefMem, LIST_0, iRefIndex0, iXOffset, iYOffset, pMCFunc, 8, 8, iMVs);
 
           iMVs[0] = pCurDqLayer->pDec->pMv[LIST_1][iMBXY][iIIdx][0];
           iMVs[1] = pCurDqLayer->pDec->pMv[LIST_1][iMBXY][iIIdx][1];
-          BaseMC (&pTempMCRefMem, iXOffset, iYOffset, pMCFunc, 8, 8, iMVs);
+          BaseMC (pCtx, &pTempMCRefMem, LIST_1, iRefIndex1, iXOffset, iYOffset, pMCFunc, 8, 8, iMVs);
 
           if (pCurDqLayer->bUseWeightedBiPredIdc) {
-            BiWeightPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, iRefIndex1, iRefIndex2, bWeightedBipredIdcIs1, 8, 8);
+            BiWeightPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, iRefIndex0, iRefIndex1, bWeightedBipredIdcIs1, 8, 8);
           } else {
             BiPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem,  8, 8);
           }
@@ -860,9 +887,9 @@
           int32_t listIdx = IS_TYPE_L0 (iSubMBType) ? LIST_0 : LIST_1;
           iMVs[0] = pCurDqLayer->pDec->pMv[listIdx][iMBXY][iIIdx][0];
           iMVs[1] = pCurDqLayer->pDec->pMv[listIdx][iMBXY][iIIdx][1];
-          BaseMC (&pMCRefMem, iXOffset, iYOffset, pMCFunc, 8, 8, iMVs);
+          iRefIndex = pCurDqLayer->pDec->pRefIndex[listIdx][iMBXY][iIIdx];
+          BaseMC (pCtx, &pMCRefMem, listIdx, iRefIndex, iXOffset, iYOffset, pMCFunc, 8, 8, iMVs);
           if (bWeightedBipredIdcIs1) {
-            int32_t iRefIndex = pCurDqLayer->pDec->pRefIndex[listIdx][iMBXY][iIIdx];
             WeightPrediction (pCurDqLayer, &pMCRefMem, listIdx, iRefIndex, 8, 8);
           }
         }
@@ -870,13 +897,13 @@
         if (IS_TYPE_L0 (iSubMBType) && IS_TYPE_L1 (iSubMBType)) { //B_Bi_8x4
           iMVs[0] = pCurDqLayer->pDec->pMv[LIST_0][iMBXY][iIIdx][0];
           iMVs[1] = pCurDqLayer->pDec->pMv[LIST_0][iMBXY][iIIdx][1];
-          BaseMC (&pMCRefMem, iXOffset, iYOffset, pMCFunc, 8, 4, iMVs);
+          BaseMC (pCtx, &pMCRefMem, LIST_0, iRefIndex0, iXOffset, iYOffset, pMCFunc, 8, 4, iMVs);
           iMVs[0] = pCurDqLayer->pDec->pMv[LIST_1][iMBXY][iIIdx][0];
           iMVs[1] = pCurDqLayer->pDec->pMv[LIST_1][iMBXY][iIIdx][1];
-          BaseMC (&pTempMCRefMem, iXOffset, iYOffset, pMCFunc, 8, 4, iMVs);
+          BaseMC (pCtx, &pTempMCRefMem, LIST_1, iRefIndex1, iXOffset, iYOffset, pMCFunc, 8, 4, iMVs);
 
           if (pCurDqLayer->bUseWeightedBiPredIdc) {
-            BiWeightPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, iRefIndex1, iRefIndex2, bWeightedBipredIdcIs1, 8, 4);
+            BiWeightPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, iRefIndex0, iRefIndex1, bWeightedBipredIdcIs1, 8, 4);
           } else {
             BiPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem,  8, 4);
           }
@@ -886,7 +913,7 @@
           pMCRefMem.pDstV += (iDstLineChroma << 1);
           iMVs[0] = pCurDqLayer->pDec->pMv[LIST_0][iMBXY][iIIdx + 4][0];
           iMVs[1] = pCurDqLayer->pDec->pMv[LIST_0][iMBXY][iIIdx + 4][1];
-          BaseMC (&pMCRefMem, iXOffset, iYOffset + 4, pMCFunc, 8, 4, iMVs);
+          BaseMC (pCtx, &pMCRefMem, LIST_0, iRefIndex0, iXOffset, iYOffset + 4, pMCFunc, 8, 4, iMVs);
 
           pTempMCRefMem.pDstY += (iDstLineLuma << 2);
           pTempMCRefMem.pDstU += (iDstLineChroma << 1);
@@ -893,10 +920,10 @@
           pTempMCRefMem.pDstV += (iDstLineChroma << 1);
           iMVs[0] = pCurDqLayer->pDec->pMv[LIST_1][iMBXY][iIIdx + 4][0];
           iMVs[1] = pCurDqLayer->pDec->pMv[LIST_1][iMBXY][iIIdx + 4][1];
-          BaseMC (&pTempMCRefMem, iXOffset, iYOffset + 4, pMCFunc, 8, 4, iMVs);
+          BaseMC (pCtx, &pTempMCRefMem, LIST_1, iRefIndex1, iXOffset, iYOffset + 4, pMCFunc, 8, 4, iMVs);
 
           if (pCurDqLayer->bUseWeightedBiPredIdc) {
-            BiWeightPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, iRefIndex1, iRefIndex2, bWeightedBipredIdcIs1, 8, 4);
+            BiWeightPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, iRefIndex0, iRefIndex1, bWeightedBipredIdcIs1, 8, 4);
           } else {
             BiPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem,  8, 4);
           }
@@ -904,15 +931,15 @@
           int32_t listIdx = IS_TYPE_L0 (iSubMBType) ? LIST_0 : LIST_1;
           iMVs[0] = pCurDqLayer->pDec->pMv[listIdx][iMBXY][iIIdx][0];
           iMVs[1] = pCurDqLayer->pDec->pMv[listIdx][iMBXY][iIIdx][1];
-          BaseMC (&pMCRefMem, iXOffset, iYOffset, pMCFunc, 8, 4, iMVs);
+          iRefIndex = pCurDqLayer->pDec->pRefIndex[listIdx][iMBXY][iIIdx];
+          BaseMC (pCtx, &pMCRefMem, listIdx, iRefIndex, iXOffset, iYOffset, pMCFunc, 8, 4, iMVs);
           pMCRefMem.pDstY += (iDstLineLuma << 2);
           pMCRefMem.pDstU += (iDstLineChroma << 1);
           pMCRefMem.pDstV += (iDstLineChroma << 1);
           iMVs[0] = pCurDqLayer->pDec->pMv[listIdx][iMBXY][iIIdx + 4][0];
           iMVs[1] = pCurDqLayer->pDec->pMv[listIdx][iMBXY][iIIdx + 4][1];
-          BaseMC (&pMCRefMem, iXOffset, iYOffset + 4, pMCFunc, 8, 4, iMVs);
+          BaseMC (pCtx, &pMCRefMem, listIdx, iRefIndex, iXOffset, iYOffset + 4, pMCFunc, 8, 4, iMVs);
           if (bWeightedBipredIdcIs1) {
-            int32_t iRefIndex = pCurDqLayer->pDec->pRefIndex[listIdx][iMBXY][iIIdx];
             WeightPrediction (pCurDqLayer, &pMCRefMem, listIdx, iRefIndex, 8, 4);
           }
         }
@@ -920,13 +947,13 @@
         if (IS_TYPE_L0 (iSubMBType) && IS_TYPE_L1 (iSubMBType)) { //B_Bi_4x8
           iMVs[0] = pCurDqLayer->pDec->pMv[LIST_0][iMBXY][iIIdx][0];
           iMVs[1] = pCurDqLayer->pDec->pMv[LIST_0][iMBXY][iIIdx][1];
-          BaseMC (&pMCRefMem, iXOffset, iYOffset, pMCFunc, 4, 8, iMVs);
+          BaseMC (pCtx, &pMCRefMem, LIST_0, iRefIndex0, iXOffset, iYOffset, pMCFunc, 4, 8, iMVs);
           iMVs[0] = pCurDqLayer->pDec->pMv[LIST_1][iMBXY][iIIdx][0];
           iMVs[1] = pCurDqLayer->pDec->pMv[LIST_1][iMBXY][iIIdx][1];
-          BaseMC (&pTempMCRefMem, iXOffset, iYOffset, pMCFunc, 4, 8, iMVs);
+          BaseMC (pCtx, &pTempMCRefMem, LIST_1, iRefIndex1, iXOffset, iYOffset, pMCFunc, 4, 8, iMVs);
 
           if (pCurDqLayer->bUseWeightedBiPredIdc) {
-            BiWeightPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, iRefIndex1, iRefIndex2, bWeightedBipredIdcIs1, 4, 8);
+            BiWeightPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, iRefIndex0, iRefIndex1, bWeightedBipredIdcIs1, 4, 8);
           } else {
             BiPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem,  4, 8);
           }
@@ -936,7 +963,7 @@
           pMCRefMem.pDstV += 2;
           iMVs[0] = pCurDqLayer->pDec->pMv[LIST_0][iMBXY][iIIdx + 1][0];
           iMVs[1] = pCurDqLayer->pDec->pMv[LIST_0][iMBXY][iIIdx + 1][1];
-          BaseMC (&pMCRefMem, iXOffset + 4, iYOffset, pMCFunc, 4, 8, iMVs);
+          BaseMC (pCtx, &pMCRefMem, LIST_0, iRefIndex0, iXOffset + 4, iYOffset, pMCFunc, 4, 8, iMVs);
 
           pTempMCRefMem.pDstY += 4;
           pTempMCRefMem.pDstU += 2;
@@ -943,10 +970,10 @@
           pTempMCRefMem.pDstV += 2;
           iMVs[0] = pCurDqLayer->pDec->pMv[LIST_1][iMBXY][iIIdx + 1][0];
           iMVs[1] = pCurDqLayer->pDec->pMv[LIST_1][iMBXY][iIIdx + 1][1];
-          BaseMC (&pTempMCRefMem, iXOffset + 4, iYOffset, pMCFunc, 4, 8, iMVs);
+          BaseMC (pCtx, &pTempMCRefMem, LIST_1, iRefIndex1, iXOffset + 4, iYOffset, pMCFunc, 4, 8, iMVs);
 
           if (pCurDqLayer->bUseWeightedBiPredIdc) {
-            BiWeightPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, iRefIndex1, iRefIndex2, bWeightedBipredIdcIs1, 4, 8);
+            BiWeightPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, iRefIndex0, iRefIndex1, bWeightedBipredIdcIs1, 4, 8);
           } else {
             BiPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, 4, 8);
           }
@@ -954,15 +981,15 @@
           int32_t listIdx = IS_TYPE_L0 (iSubMBType) ? LIST_0 : LIST_1;
           iMVs[0] = pCurDqLayer->pDec->pMv[listIdx][iMBXY][iIIdx][0];
           iMVs[1] = pCurDqLayer->pDec->pMv[listIdx][iMBXY][iIIdx][1];
-          BaseMC (&pMCRefMem, iXOffset, iYOffset, pMCFunc, 4, 8, iMVs);
+          iRefIndex = pCurDqLayer->pDec->pRefIndex[listIdx][iMBXY][iIIdx];
+          BaseMC (pCtx, &pMCRefMem, listIdx, iRefIndex, iXOffset, iYOffset, pMCFunc, 4, 8, iMVs);
           pMCRefMem.pDstY += 4;
           pMCRefMem.pDstU += 2;
           pMCRefMem.pDstV += 2;
           iMVs[0] = pCurDqLayer->pDec->pMv[listIdx][iMBXY][iIIdx + 1][0];
           iMVs[1] = pCurDqLayer->pDec->pMv[listIdx][iMBXY][iIIdx + 1][1];
-          BaseMC (&pMCRefMem, iXOffset + 4, iYOffset, pMCFunc, 4, 8, iMVs);
+          BaseMC (pCtx, &pMCRefMem, listIdx, iRefIndex, iXOffset + 4, iYOffset, pMCFunc, 4, 8, iMVs);
           if (bWeightedBipredIdcIs1) {
-            int32_t iRefIndex = pCurDqLayer->pDec->pRefIndex[listIdx][iMBXY][iIIdx];
             WeightPrediction (pCurDqLayer, &pMCRefMem, listIdx, iRefIndex, 4, 8);
           }
         }
@@ -982,7 +1009,7 @@
 
             iMVs[0] = pCurDqLayer->pDec->pMv[LIST_0][iMBXY][iIIdx + iJIdx][0];
             iMVs[1] = pCurDqLayer->pDec->pMv[LIST_0][iMBXY][iIIdx + iJIdx][1];
-            BaseMC (&pMCRefMem, iXOffset + iBlk4X, iYOffset + iBlk4Y, pMCFunc, 4, 4, iMVs);
+            BaseMC (pCtx, &pMCRefMem, LIST_0, iRefIndex0, iXOffset + iBlk4X, iYOffset + iBlk4Y, pMCFunc, 4, 4, iMVs);
 
             pTempMCRefMem.pDstY = pDstY2 + iBlk8X + iBlk8Y * iDstLineLuma;
             pTempMCRefMem.pDstU = pDstU2 + iUVLineStride;
@@ -990,10 +1017,10 @@
 
             iMVs[0] = pCurDqLayer->pDec->pMv[LIST_1][iMBXY][iIIdx + iJIdx][0];
             iMVs[1] = pCurDqLayer->pDec->pMv[LIST_1][iMBXY][iIIdx + iJIdx][1];
-            BaseMC (&pTempMCRefMem, iXOffset + iBlk4X, iYOffset + iBlk4Y, pMCFunc, 4, 4, iMVs);
+            BaseMC (pCtx, &pTempMCRefMem, LIST_1, iRefIndex1, iXOffset + iBlk4X, iYOffset + iBlk4Y, pMCFunc, 4, 4, iMVs);
 
             if (pCurDqLayer->bUseWeightedBiPredIdc) {
-              BiWeightPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, iRefIndex1, iRefIndex2, bWeightedBipredIdcIs1, 4, 4);
+              BiWeightPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem, iRefIndex0, iRefIndex1, bWeightedBipredIdcIs1, 4, 4);
             } else {
               BiPrediction (pCurDqLayer, &pMCRefMem, &pTempMCRefMem,  4, 4);
             }
@@ -1000,7 +1027,7 @@
           }
         } else {
           int32_t listIdx = IS_TYPE_L0 (iSubMBType) ? LIST_0 : LIST_1;
-          int32_t iRefIndex = pCurDqLayer->pDec->pRefIndex[listIdx][iMBXY][iIIdx];
+          iRefIndex = pCurDqLayer->pDec->pRefIndex[listIdx][iMBXY][iIIdx];
           for (int32_t j = 0; j < 4; j++) {
             int32_t iUVLineStride;
             iJIdx = ((j >> 1) << 2) + (j & 1);
@@ -1015,7 +1042,7 @@
 
             iMVs[0] = pCurDqLayer->pDec->pMv[listIdx][iMBXY][iIIdx + iJIdx][0];
             iMVs[1] = pCurDqLayer->pDec->pMv[listIdx][iMBXY][iIIdx + iJIdx][1];
-            BaseMC (&pMCRefMem, iXOffset + iBlk4X, iYOffset + iBlk4Y, pMCFunc, 4, 4, iMVs);
+            BaseMC (pCtx, &pMCRefMem, listIdx, iRefIndex, iXOffset + iBlk4X, iYOffset + iBlk4Y, pMCFunc, 4, 4, iMVs);
             if (bWeightedBipredIdcIs1) {
               WeightPrediction (pCurDqLayer, &pMCRefMem, listIdx, iRefIndex, 4, 4);
             }