shithub: openh264

Download patch

ref: 4b8913f3390fd6d1b7276de32248fdd3261ed0aa
parent: 94ffa1f51776e8ec5b035ca765aa6e1d928cecc4
author: xiaotiansf <xiaotianshimail@gmail.com>
date: Thu Jan 3 09:34:43 EST 2019

Update and fix temporal direct prediction and fix multi-reference picture re-ordering.

--- a/codec/decoder/core/inc/mv_pred.h
+++ b/codec/decoder/core/inc/mv_pred.h
@@ -111,7 +111,8 @@
 /*!
 * \brief   get the motion predictor for B-slice temporal direct mode 16x16
 */
-int32_t PredBDirectTemporal (PWelsDecoderContext pCtx, int16_t iMvp[LIST_A][2], int8_t ref[LIST_A]);
+int32_t PredBDirectTemporal (PWelsDecoderContext pCtx, int16_t iMvp[LIST_A][2], int8_t ref[LIST_A],
+                             SubMbType& subMbType);
 
 /*!
 * \brief   get the motion params for B-slice spatial direct mode
@@ -142,6 +143,39 @@
  */
 void PredInter8x16Mv (int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30],
                       int32_t listIdx, int32_t iPartIdx, int8_t iRef, int16_t iMVP[2]);
+
+/*!
+* \brief   Fill the spatial direct motion vectors for 8x8 direct MB
+* \param   pMvDirect, iRef: direct MV and ref index per list; a list whose ref is 0 gets a zero MV when the co-located block is not intra, bIsLongRef is false, its ref index is 0 and its MV components are within [-1, 1]
+* \param   pMotionVector, pMvdCache: output motion vector cache and motion vector deviation cache; either may be NULL, in which case it is not written
+*/
+void FillSpatialDirect8x8Mv (PDqLayer pCurLayer, const int16_t& iIdx8, const int8_t& iPartCount, const int8_t& iPartW,
+                             const SubMbType& subMbType, const bool& bIsLongRef, int16_t pMvDirect[LIST_A][2], int8_t iRef[LIST_A],
+                             int16_t pMotionVector[LIST_A][30][MV_A], int16_t pMvdCache[LIST_A][30][MV_A]);
+
+/*!
+* \brief   Fill the temporal direct motion vectors for 8x8 direct MB
+* \param   iRef, mvColoc: ref index per list and co-located MVs; the LIST_0 MV is the co-located MV scaled by iMvScale[LIST_0][iRef[LIST_0]], the LIST_1 MV is the LIST_0 MV minus the co-located MV
+* \param   pMotionVector, pMvdCache: output motion vector cache and motion vector deviation cache; either may be NULL, in which case it is not written
+*/
+void FillTemporalDirect8x8Mv (PDqLayer pCurLayer, const int16_t& iIdx8, const int8_t& iPartCount, const int8_t& iPartW,
+                              const SubMbType& subMbType, int8_t iRef[LIST_A], int16_t (*mvColoc)[2],
+                              int16_t pMotionVector[LIST_A][30][MV_A], int16_t pMvdCache[LIST_A][30][MV_A]);
+
+/*!
+* \brief   returns ref_index in List_0 from the colocated ref_index in LIST_0.
+* \param   colocRefIndexL0: index into pRefPic[LIST_0] of the first LIST_1 picture; ref0Count: number of LIST_0 entries searched
+*  returns index of the LIST_0 picture whose iFramePoc equals that of the indexed picture, or 0 when none matches
+*/
+int8_t MapColToList0 (PWelsDecoderContext& pCtx, const int8_t& colocRefIndexL0,
+                      const int32_t& ref0Count); //ISO/IEC 14496-10:2009(E) (8-193)
+
+/*!
+* \brief     update ref_index cache for current MB, for 8x8
+* \param     iPartIdx: looked up in g_kuiScan4 to find the first 4x4 block written; listIdx: which pRefIndex list is written
+* \param     iRef: value stored at that block and at offsets +1, +4 and +5 from it
+*/
+void Update8x8RefIdx (PDqLayer& pCurDqLayer, const int16_t& iPartIdx, const int32_t& listIdx, const int8_t& iRef);
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/parse_mb_syn_cabac.h
+++ b/codec/decoder/core/inc/parse_mb_syn_cabac.h
@@ -82,6 +82,8 @@
                                const int8_t iListIdx);
 void    UpdateP8x8DirectCabac (PDqLayer pCurDqLayer, int32_t iPartIdx);
 void    UpdateP16x16DirectCabac (PDqLayer pCurDqLayer);
+void    UpdateP8x8RefCacheIdxCabac (int8_t pRefIndex[LIST_A][30], const int16_t& iPartIdx, const int32_t& listIdx,
+                                    const int8_t& iRef);
 }
 //#pragma pack()
 #endif
--- a/codec/decoder/core/inc/slice.h
+++ b/codec/decoder/core/inc/slice.h
@@ -122,7 +122,8 @@
   PPps            pPps;
   int32_t         iSpsId;
   int32_t         iPpsId;
-  bool bIdrFlag;
+  bool            bIdrFlag;
+  bool            bIsRefPic;
 
   /*********************got from other layer for efficency if possible*********************/
   SRefPicListReorderSyn   pRefPicListReordering;  // Reference picture list reordering syntaxs
--- a/codec/decoder/core/src/decode_slice.cpp
+++ b/codec/decoder/core/src/decode_slice.cpp
@@ -54,12 +54,6 @@
 
 extern void FreePicture (PPicture pPic, CMemoryAlign* pMa);
 
-static inline int32_t iAbs (int32_t x) {
-  static const int32_t INT_BITS = (sizeof (int) * CHAR_BIT) - 1;
-  int32_t y = x >> INT_BITS;
-  return (x ^ y) - y;
-}
-
 extern PPicture AllocPicture (PWelsDecoderContext pCtx, const int32_t kiPicWidth, const int32_t kiPicHeight);
 
 int32_t WelsTargetSliceConstruction (PWelsDecoderContext pCtx) {
@@ -1423,11 +1417,10 @@
     pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPicL0[0] && ppRefPicL0[0]->bIsComplete)
                             || ! (ppRefPicL1[0] && ppRefPicL1[0]->bIsComplete);
 
-
+    SubMbType subMbType;
     if (pSliceHeader->iDirectSpatialMvPredFlag) {
 
       //predict direct spatial mv
-      SubMbType subMbType;
       int32_t ret = PredMvBDirectSpatial (pCtx, pMv, ref, subMbType);
       if (ret != ERR_NONE) {
         return ret;
@@ -1434,8 +1427,7 @@
       }
     } else {
       //temporal direct mode
-      ComputeColocated (pCtx);
-      int32_t ret = PredBDirectTemporal (pCtx, pMv, ref);
+      int32_t ret = PredBDirectTemporal (pCtx, pMv, ref, subMbType);
       if (ret != ERR_NONE) {
         return ret;
       }
@@ -2386,10 +2378,10 @@
     pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPicL0[0] && ppRefPicL0[0]->bIsComplete)
                             || ! (ppRefPicL1[0] && ppRefPicL1[0]->bIsComplete);
     //predict iMv
+    SubMbType subMbType;
     if (pSliceHeader->iDirectSpatialMvPredFlag) {
 
       //predict direct spatial mv
-      SubMbType subMbType;
       int32_t ret = PredMvBDirectSpatial (pCtx, iMv, ref, subMbType);
       if (ret != ERR_NONE) {
         return ret;
@@ -2396,8 +2388,7 @@
       }
     } else {
       //temporal direct mode
-      ComputeColocated (pCtx);
-      int32_t ret = PredBDirectTemporal (pCtx, iMv, ref);
+      int32_t ret = PredBDirectTemporal (pCtx, iMv, ref, subMbType);
       if (ret != ERR_NONE) {
         return ret;
       }
@@ -2838,17 +2829,21 @@
   PSlice pCurSlice = &pCurLayer->sLayerInfo.sSliceInLayer;
   PSliceHeader pSliceHeader = &pCurSlice->sSliceHeaderExt.sSliceHeader;
   if (!pSliceHeader->iDirectSpatialMvPredFlag) {
-    uint32_t uiShortRefCount = pCtx->sRefPic.uiShortRefCount[LIST_0];
-    for (int32_t listIdx = LIST_0; listIdx < LIST_A; ++listIdx) {
-      for (uint32_t i = 0; i < uiShortRefCount; ++i) {
-        int32_t iTRb = WELS_CLIP3 (-128, 127, pSliceHeader->iPicOrderCntLsb - pCtx->sRefPic.pRefList[listIdx][i]->iFramePoc);
-        int32_t iTRp = WELS_CLIP3 (-128, 127,
-                                   pCtx->sRefPic.pRefList[LIST_1][i]->iFramePoc - pCtx->sRefPic.pRefList[LIST_0][i]->iFramePoc);
-        if (iTRp != 0) {
-          int32_t prescale = (16384 + iAbs (iTRp / 2)) / iTRp;
-          pCurSlice->iMvScale[listIdx][i] = WELS_CLIP3 (-1024, 1023, (iTRb * prescale + 32) >> 6);
-        } else {
-          pCurSlice->iMvScale[listIdx][i] = 0x03FFF;
+    uint32_t uiRefCount = pSliceHeader->uiRefCount[LIST_0];
+    if (pCtx->sRefPic.pRefList[LIST_1][0] != NULL) {
+      for (uint32_t i = 0; i < uiRefCount; ++i) {
+        if (pCtx->sRefPic.pRefList[LIST_0][i] != NULL) {
+          const int32_t poc0 = pCtx->sRefPic.pRefList[LIST_0][i]->iFramePoc;
+          const int32_t poc1 = pCtx->sRefPic.pRefList[LIST_1][0]->iFramePoc;
+          const int32_t poc = pSliceHeader->iPicOrderCntLsb;
+          const int32_t td = WELS_CLIP3 (poc1 - poc0, -128, 127);
+          if (td == 0) {
+            pCurSlice->iMvScale[LIST_0][i] = 1 << 8;
+          } else {
+            int32_t tb = WELS_CLIP3 (poc - poc0, -128, 127);
+            int32_t tx = (16384 + (abs (td) >> 1)) / td;
+            pCurSlice->iMvScale[LIST_0][i] = WELS_CLIP3 ((tb * tx + 32) >> 6, -1024, 1023);
+          }
         }
       }
     }
--- a/codec/decoder/core/src/decoder_core.cpp
+++ b/codec/decoder/core/src/decoder_core.cpp
@@ -873,6 +873,8 @@
 
   pSliceHeadExt = &kpCurNal->sNalData.sVclNal.sSliceHeaderExt;
 
+  pSliceHead->bIsRefPic = false;
+
   if (pSliceHeadExt) {
     SRefBasePicMarking sBaseMarking;
     const bool kbStoreRefBaseFlag = pSliceHeadExt->bStoreRefBasePicFlag;
@@ -2473,6 +2475,7 @@
       dq_cur->pBitStringAux = &pNalCur->sNalData.sVclNal.sSliceBitsRead;
 
       uiNalRefIdc = pNalCur->sNalHeaderExt.sNalUnitHeader.uiNalRefIdc;
+      pSh->bIsRefPic = uiNalRefIdc > 0;
 
       iPpsId = pSh->iPpsId;
 
@@ -2547,6 +2550,9 @@
             }
           }
         }
+        //calculate Colocated mv scaler factor for temporal direct prediction
+        if (pSh->eSliceType == B_SLICE && !pSh->iDirectSpatialMvPredFlag)
+          ComputeColocated (pCtx);
 
         iRet = WelsDecodeSlice (pCtx, bFreshSliceAvailable, pNalCur);
 
@@ -2645,8 +2651,10 @@
         memcpy (pCtx->pDec->pRefIndex[LIST_1], pCtx->pCurDqLayer->pRefIndex[LIST_1],
                 pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t) * MB_BLOCK4x4_NUM);
         for (int32_t listIdx = LIST_0; listIdx < LIST_A; ++listIdx) {
-          for (uint32_t i = 0; i < pCtx->sRefPic.uiRefCount[listIdx]; ++i) {
+          uint32_t i = 0;
+          while (pCtx->sRefPic.pRefList[listIdx][i]) {
             pCtx->pDec->pRefPic[listIdx][i] = pCtx->sRefPic.pRefList[listIdx][i];
+            ++i;
           }
         }
         iRet = WelsMarkAsRef (pCtx);
--- a/codec/decoder/core/src/manage_dec_ref.cpp
+++ b/codec/decoder/core/src/manage_dec_ref.cpp
@@ -346,8 +346,8 @@
   for (int32_t listIdx = 0; listIdx < ListCount; ++listIdx) {
     PPicture pPic = NULL;
     PPicture* ppRefList = pCtx->sRefPic.pRefList[listIdx];
-    int32_t iMaxRefIdx = pCtx->pSps->iNumRefFrames;
-    int32_t iRefCount = pCtx->sRefPic.uiRefCount[listIdx];
+    int32_t iMaxRefIdx = pCtx->iPicQueueNumber;
+    int32_t iRefCount = pSliceHeader->uiRefCount[listIdx];
     int32_t iPredFrameNum = pSliceHeader->iFrameNum;
     int32_t iMaxPicNum = 1 << pSliceHeader->pSps->uiLog2MaxFrameNum;
     int32_t iAbsDiffPicNum = -1;
--- a/codec/decoder/core/src/mv_pred.cpp
+++ b/codec/decoder/core/src/mv_pred.cpp
@@ -584,75 +584,7 @@
           pSubPartCount[i] = 4;
           pPartW[i] = 1;
         }
-
-        int8_t iPartCount = pSubPartCount[i];
-        int16_t iPartIdx, iBlockW = pPartW[i];
-
-        for (int32_t j = 0; j < iPartCount; j++) {
-          iPartIdx = iIdx8 + j * iBlockW;
-          uint8_t iScan4Idx = g_kuiScan4[iPartIdx];
-          uint8_t iColocIdx = g_kuiScan4[iPartIdx];
-          //uint8_t iCacheIdx = g_kuiCache30ScanIdx[iPartIdx];
-
-          int16_t pMV[4] = { 0 };
-          if (IS_SUB_8x8 (subMbType)) {
-            * (uint32_t*)pMV = * (uint32_t*)iMvp[LIST_0];
-            ST32 ((pMV + 2), LD32 (pMV));
-            ST64 (pCurLayer->pMv[LIST_0][iMbXy][iScan4Idx], LD64 (pMV));
-            ST64 (pCurLayer->pMv[LIST_0][iMbXy][iScan4Idx + 4], LD64 (pMV));
-            ST64 (pCurLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
-            ST64 (pCurLayer->pMvd[LIST_0][iMbXy][iScan4Idx + 4], 0);
-            * (uint32_t*)pMV = * (uint32_t*)iMvp[LIST_1];
-            ST32 ((pMV + 2), LD32 (pMV));
-            ST64 (pCurLayer->pMv[LIST_1][iMbXy][iScan4Idx], LD64 (pMV));
-            ST64 (pCurLayer->pMv[LIST_1][iMbXy][iScan4Idx + 4], LD64 (pMV));
-            ST64 (pCurLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
-            ST64 (pCurLayer->pMvd[LIST_1][iMbXy][iScan4Idx + 4], 0);
-          } else { //SUB_4x4
-            * (uint32_t*)pMV = * (uint32_t*)iMvp[LIST_0];
-            ST32 (pCurLayer->pMv[LIST_0][iMbXy][iScan4Idx], LD32 (pMV));
-            ST32 (pCurLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
-            * (uint32_t*)pMV = * (uint32_t*)iMvp[LIST_1];
-            ST32 (pCurLayer->pMv[LIST_1][iMbXy][iScan4Idx], LD32 (pMV));
-            ST32 (pCurLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
-          }
-          if ((* (int32_t*)iMvp[LIST_0] | * (int32_t*)iMvp[LIST_1])) {
-            uint32_t uiColZeroFlag = (0 == pCurLayer->iColocIntra[iColocIdx]) && !bIsLongRef &&
-                                     (pCurLayer->iColocRefIndex[LIST_0][iColocIdx] == 0 || (pCurLayer->iColocRefIndex[LIST_0][iColocIdx] < 0
-                                         && pCurLayer->iColocRefIndex[LIST_1][iColocIdx] == 0));
-            const int16_t (*mvColoc)[2] = 0 == pCurLayer->iColocRefIndex[LIST_0][iColocIdx] ? pCurLayer->iColocMv[LIST_0] :
-                                          pCurLayer->iColocMv[LIST_1];
-            const int16_t* mv = mvColoc[iColocIdx];
-            if (IS_SUB_8x8 (subMbType)) {
-              if (uiColZeroFlag && ((unsigned) (mv[0] + 1) <= 2 && (unsigned) (mv[1] + 1) <= 2)) {
-                if (ref[LIST_0] == 0) {
-                  ST64 (pCurLayer->pMv[LIST_0][iMbXy][iScan4Idx], 0);
-                  ST64 (pCurLayer->pMv[LIST_0][iMbXy][iScan4Idx + 4], 0);
-                  ST64 (pCurLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
-                  ST64 (pCurLayer->pMvd[LIST_0][iMbXy][iScan4Idx + 4], 0);
-                }
-
-                if (ref[LIST_1] == 0) {
-                  ST64 (pCurLayer->pMv[LIST_1][iMbXy][iScan4Idx], 0);
-                  ST64 (pCurLayer->pMv[LIST_1][iMbXy][iScan4Idx + 4], 0);
-                  ST64 (pCurLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
-                  ST64 (pCurLayer->pMvd[LIST_1][iMbXy][iScan4Idx + 4], 0);
-                }
-              }
-            } else {
-              if (uiColZeroFlag && ((unsigned) (mv[0] + 1) <= 2 && (unsigned) (mv[1] + 1) <= 2)) {
-                if (ref[LIST_0] == 0) {
-                  ST32 (pCurLayer->pMv[LIST_0][iMbXy][iScan4Idx], 0);
-                  ST32 (pCurLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
-                }
-                if (ref[LIST_1] == 0) {
-                  ST32 (pCurLayer->pMv[LIST_1][iMbXy][iScan4Idx], 0);
-                  ST32 (pCurLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
-                }
-              }
-            }
-          }
-        }
+        FillSpatialDirect8x8Mv (pCurLayer, iIdx8, pSubPartCount[i], pPartW[i], subMbType, bIsLongRef, iMvp, ref, NULL, NULL);
       }
     }
   }
@@ -659,21 +591,29 @@
   return ret;
 }
 
-int32_t PredBDirectTemporal (PWelsDecoderContext pCtx, int16_t iMvp[LIST_A][2], int8_t ref[LIST_A]) {
+int32_t PredBDirectTemporal (PWelsDecoderContext pCtx, int16_t iMvp[LIST_A][2], int8_t ref[LIST_A],
+                             SubMbType& subMbType) {
   int32_t ret = ERR_NONE;
   PDqLayer pCurLayer = pCtx->pCurDqLayer;
   int32_t iMbXy = pCurLayer->iMbXyIndex;
   bool bSkipOrDirect = (IS_SKIP (pCurLayer->pMbType[iMbXy]) | IS_DIRECT (pCurLayer->pMbType[iMbXy])) > 0;
+
   MbType mbType;
-  SubMbType subMbType;
   ret = GetColocatedMb (pCtx, mbType, subMbType);
   if (ret != ERR_NONE) {
     return ret;
   }
+
+  pCurLayer->pMbType[iMbXy] = mbType;
+
   PSlice pSlice = &pCurLayer->sLayerInfo.sSliceInLayer;
+  PSliceHeader pSliceHeader = &pSlice->sSliceHeaderExt.sSliceHeader;
+  int16_t pMvd[4] = { 0 };
+  const int32_t ref0Count = WELS_MIN (pSliceHeader->uiRefCount[LIST_0], pCtx->sRefPic.uiRefCount[LIST_0]);
   if (IS_INTER_16x16 (mbType)) {
     ref[LIST_0] = 0;
     ref[LIST_1] = 0;
+    UpdateP16x16DirectCabac (pCurLayer);
     UpdateP16x16RefIdx (pCurLayer, LIST_1, ref[LIST_1]);
     ST64 (iMvp,  0);
     if (pCurLayer->iColocIntra[0]) {
@@ -681,10 +621,14 @@
       UpdateP16x16MotionOnly (pCurLayer, LIST_1, iMvp[LIST_1]);
       UpdateP16x16RefIdx (pCurLayer, LIST_0, ref[LIST_0]);
     } else {
-      ref[LIST_0] = pCurLayer->iColocRefIndex[LIST_0][0] >= 0 ? pCurLayer->iColocRefIndex[LIST_0][0] :
-                    pCurLayer->iColocRefIndex[LIST_1][0];
-      const int16_t (*mvColoc)[2] = 0 == ref[LIST_0] ? pCurLayer->iColocMv[LIST_0] : pCurLayer->iColocMv[LIST_1];
-      const int16_t* mv = mvColoc[0];
+      ref[LIST_0] = 0;
+      int16_t* mv = pCurLayer->iColocMv[LIST_0][0];
+      int8_t colocRefIndexL0 = pCurLayer->iColocRefIndex[LIST_0][0];
+      if (colocRefIndexL0 >= 0) {
+        ref[LIST_0] = MapColToList0 (pCtx, colocRefIndexL0, ref0Count);
+      } else {
+        mv = pCurLayer->iColocMv[LIST_1][0];
+      }
       UpdateP16x16RefIdx (pCurLayer, LIST_0, ref[LIST_0]);
 
       iMvp[LIST_0][0] = (pSlice->iMvScale[LIST_0][ref[LIST_0]] * mv[0] + 128) >> 8;
@@ -694,26 +638,35 @@
       iMvp[LIST_1][1] = iMvp[LIST_0][1] - mv[1];
       UpdateP16x16MotionOnly (pCurLayer, LIST_1, iMvp[LIST_1]);
     }
+    UpdateP16x16MvdCabac (pCurLayer, pMvd, LIST_0);
+    UpdateP16x16MvdCabac (pCurLayer, pMvd, LIST_1);
   } else {
     if (bSkipOrDirect) {
       int8_t pSubPartCount[4], pPartW[4];
+      int8_t pRefIndex[LIST_A][30];
       for (int32_t i = 0; i < 4; i++) {
         int16_t iIdx8 = i << 2;
+        const uint8_t iScan4Idx = g_kuiScan4[iIdx8];
         pCurLayer->pSubMbType[iMbXy][i] = subMbType;
 
+        int16_t (*mvColoc)[2] = pCurLayer->iColocMv[LIST_0];
+
         ref[LIST_1] = 0;
-        if (pCurLayer->iColocIntra[g_kuiScan4[iIdx8]]) {
+        UpdateP8x8RefIdxCabac (pCurLayer, pRefIndex, iIdx8, ref[LIST_1], LIST_1);
+        if (pCurLayer->iColocIntra[iScan4Idx]) {
           ref[LIST_0] = 0;
+          UpdateP8x8RefIdxCabac (pCurLayer, pRefIndex, iIdx8, ref[LIST_0], LIST_0);
+          ST64 (iMvp, 0);
         } else {
-          if (pCurLayer->iColocRefIndex[LIST_0][iIdx8] >= 0) {
-            ref[LIST_0] = pCurLayer->iColocRefIndex[LIST_0][iIdx8];
+          ref[LIST_0] = 0;
+          int8_t colocRefIndexL0 = pCurLayer->iColocRefIndex[LIST_0][iScan4Idx];
+          if (colocRefIndexL0 >= 0) {
+            ref[LIST_0] = MapColToList0 (pCtx, colocRefIndexL0, ref0Count);
           } else {
-            ref[LIST_0] = pCurLayer->iColocRefIndex[LIST_1][iIdx8];
+            mvColoc = pCurLayer->iColocMv[LIST_1];
           }
+          UpdateP8x8RefIdxCabac (pCurLayer, pRefIndex, iIdx8, ref[LIST_0], LIST_0);
         }
-        int8_t pRefIndex[LIST_A][30];
-        UpdateP8x8RefIdxCabac (pCurLayer, pRefIndex, iIdx8, ref[LIST_0], LIST_0);
-        UpdateP8x8RefIdxCabac (pCurLayer, pRefIndex, iIdx8, ref[LIST_1], LIST_1);
         UpdateP8x8DirectCabac (pCurLayer, iIdx8);
 
         pSubPartCount[i] = g_ksInterBSubMbTypeInfo[0].iPartCount;
@@ -723,46 +676,7 @@
           pSubPartCount[i] = 4;
           pPartW[i] = 1;
         }
-
-        int8_t iPartCount = pSubPartCount[i];
-        int16_t iPartIdx, iBlockW = pPartW[i];
-        for (int32_t j = 0; j < iPartCount; j++) {
-          iPartIdx = iIdx8 + j * iBlockW;
-          uint8_t iScan4Idx = g_kuiScan4[iPartIdx];
-          uint8_t iColocIdx = g_kuiScan4[iPartIdx];
-
-          int16_t (*mvColoc)[2] = pCurLayer->iColocMv[LIST_0];
-          int16_t* mv = mvColoc[iColocIdx];
-
-          int16_t pMV[4] = { 0 };
-          if (IS_SUB_8x8 (subMbType)) {
-            iMvp[LIST_0][0] = (pSlice->iMvScale[LIST_0][ref[LIST_0]] * mv[0] + 128) >> 8;
-            iMvp[LIST_0][1] = (pSlice->iMvScale[LIST_0][ref[LIST_0]] * mv[1] + 128) >> 8;
-            ST32 (pMV, LD32 (iMvp[LIST_0]));
-            ST32 ((pMV + 2), LD32 (iMvp[LIST_0]));
-            ST64 (pCurLayer->pMv[LIST_0][iMbXy][iScan4Idx], LD64 (pMV));
-            ST64 (pCurLayer->pMv[LIST_0][iMbXy][iScan4Idx + 4], LD64 (pMV));
-            ST64 (pCurLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
-            ST64 (pCurLayer->pMvd[LIST_0][iMbXy][iScan4Idx + 4], 0);
-            iMvp[LIST_1][0] -= iMvp[LIST_0][0] - mv[0];
-            iMvp[LIST_1][1] -= iMvp[LIST_0][0] - mv[1];
-            ST32 (pMV, LD32 (iMvp[LIST_1]));
-            ST32 ((pMV + 2), LD32 (iMvp[LIST_1]));
-            ST64 (pCurLayer->pMv[LIST_1][iMbXy][iScan4Idx], LD64 (pMV));
-            ST64 (pCurLayer->pMv[LIST_1][iMbXy][iScan4Idx + 4], LD64 (pMV));
-            ST64 (pCurLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
-            ST64 (pCurLayer->pMvd[LIST_1][iMbXy][iScan4Idx + 4], 0);
-          } else { //SUB_4x4
-            iMvp[LIST_0][0] = (pSlice->iMvScale[LIST_0][ref[LIST_0]] * mv[0] + 128) >> 8;
-            iMvp[LIST_0][1] = (pSlice->iMvScale[LIST_0][ref[LIST_0]] * mv[1] + 128) >> 8;
-            ST32 (pCurLayer->pMv[LIST_0][iMbXy][iScan4Idx], LD32 (iMvp[LIST_0]));
-            ST32 (pCurLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
-            iMvp[LIST_1][0] -= iMvp[LIST_0][0] - mv[0];
-            iMvp[LIST_1][1] -= iMvp[LIST_0][0] - mv[1];
-            ST32 (pCurLayer->pMv[LIST_1][iMbXy][iScan4Idx], LD32 (iMvp[LIST_1]));
-            ST32 (pCurLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
-          }
-        }
+        FillTemporalDirect8x8Mv (pCurLayer, iIdx8, pSubPartCount[i], pPartW[i], subMbType, ref, mvColoc, NULL, NULL);
       }
     }
   }
@@ -981,4 +895,231 @@
   }
 }
 
+void FillSpatialDirect8x8Mv (PDqLayer pCurLayer, const int16_t& iIdx8, const int8_t& iPartCount, const int8_t& iPartW,
+                             const SubMbType& subMbType, const bool& bIsLongRef, int16_t pMvDirect[LIST_A][2], int8_t iRef[LIST_A],
+                             int16_t pMotionVector[LIST_A][30][MV_A], int16_t pMvdCache[LIST_A][30][MV_A]) { // stores pMvDirect for every partition of one 8x8 block; pMotionVector / pMvdCache are optional and skipped when NULL
+  int32_t iMbXy = pCurLayer->iMbXyIndex;
+  for (int32_t j = 0; j < iPartCount; j++) { // partitions start at iIdx8 and advance by iPartW
+    int8_t iPartIdx = iIdx8 + j * iPartW;
+    uint8_t iScan4Idx = g_kuiScan4[iPartIdx]; // index into the per-MB pMv / pMvd arrays
+    uint8_t iColocIdx = g_kuiScan4[iPartIdx]; // same lookup as iScan4Idx; used for the co-located arrays
+    uint8_t iCacheIdx = g_kuiCache30ScanIdx[iPartIdx]; // index into the optional 30-entry caches
+
+    int16_t pMV[4] = { 0 };
+    if (IS_SUB_8x8 (subMbType)) { // 8-byte stores at iScan4Idx and iScan4Idx + 4 (caches: iCacheIdx and iCacheIdx + 6)
+      * (uint32_t*)pMV = * (uint32_t*)pMvDirect[LIST_0];
+      ST32 ((pMV + 2), LD32 (pMV)); // duplicate the MV so pMV holds it twice
+      ST64 (pCurLayer->pMv[LIST_0][iMbXy][iScan4Idx], LD64 (pMV));
+      ST64 (pCurLayer->pMv[LIST_0][iMbXy][iScan4Idx + 4], LD64 (pMV));
+      ST64 (pCurLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0); // MVD is always stored as zero here
+      ST64 (pCurLayer->pMvd[LIST_0][iMbXy][iScan4Idx + 4], 0);
+      if (pMotionVector != NULL) {
+        ST64 (pMotionVector[LIST_0][iCacheIdx], LD64 (pMV));
+        ST64 (pMotionVector[LIST_0][iCacheIdx + 6], LD64 (pMV));
+      }
+      if (pMvdCache != NULL) {
+        ST64 (pMvdCache[LIST_0][iCacheIdx], 0);
+        ST64 (pMvdCache[LIST_0][iCacheIdx + 6], 0);
+      }
+      * (uint32_t*)pMV = * (uint32_t*)pMvDirect[LIST_1]; // same sequence again for LIST_1
+      ST32 ((pMV + 2), LD32 (pMV));
+      ST64 (pCurLayer->pMv[LIST_1][iMbXy][iScan4Idx], LD64 (pMV));
+      ST64 (pCurLayer->pMv[LIST_1][iMbXy][iScan4Idx + 4], LD64 (pMV));
+      ST64 (pCurLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
+      ST64 (pCurLayer->pMvd[LIST_1][iMbXy][iScan4Idx + 4], 0);
+      if (pMotionVector != NULL) {
+        ST64 (pMotionVector[LIST_1][iCacheIdx], LD64 (pMV));
+        ST64 (pMotionVector[LIST_1][iCacheIdx + 6], LD64 (pMV));
+      }
+      if (pMvdCache != NULL) {
+        ST64 (pMvdCache[LIST_1][iCacheIdx], 0);
+        ST64 (pMvdCache[LIST_1][iCacheIdx + 6], 0);
+      }
+    } else { //SUB_4x4: a single 4-byte store per list at iScan4Idx / iCacheIdx
+      * (uint32_t*)pMV = * (uint32_t*)pMvDirect[LIST_0];
+      ST32 (pCurLayer->pMv[LIST_0][iMbXy][iScan4Idx], LD32 (pMV));
+      ST32 (pCurLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
+      if (pMotionVector != NULL) {
+        ST32 (pMotionVector[LIST_0][iCacheIdx], LD32 (pMV));
+      }
+      if (pMvdCache != NULL) {
+        ST32 (pMvdCache[LIST_0][iCacheIdx], 0);
+      }
+      * (uint32_t*)pMV = * (uint32_t*)pMvDirect[LIST_1];
+      ST32 (pCurLayer->pMv[LIST_1][iMbXy][iScan4Idx], LD32 (pMV));
+      ST32 (pCurLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
+      if (pMotionVector != NULL) {
+        ST32 (pMotionVector[LIST_1][iCacheIdx], LD32 (pMV));
+      }
+      if (pMvdCache != NULL) {
+        ST32 (pMvdCache[LIST_1][iCacheIdx], 0);
+      }
+    }
+    if ((* (int32_t*)pMvDirect[LIST_0] | * (int32_t*)pMvDirect[LIST_1])) { // the zeroing pass below only matters when some direct MV is non-zero
+      uint32_t uiColZeroFlag = (0 == pCurLayer->iColocIntra[iColocIdx]) && !bIsLongRef &&
+                               (pCurLayer->iColocRefIndex[LIST_0][iColocIdx] == 0 || (pCurLayer->iColocRefIndex[LIST_0][iColocIdx] < 0
+                                   && pCurLayer->iColocRefIndex[LIST_1][iColocIdx] == 0)); // co-located block is not intra, bIsLongRef is false, and its ref index (LIST_0, else LIST_1) is 0
+      const int16_t (*mvColoc)[2] = 0 == pCurLayer->iColocRefIndex[LIST_0][iColocIdx] ? pCurLayer->iColocMv[LIST_0] :
+                                    pCurLayer->iColocMv[LIST_1]; // LIST_0 co-located MVs when its ref index is 0, otherwise LIST_1
+      const int16_t* mv = mvColoc[iColocIdx];
+      if (IS_SUB_8x8 (subMbType)) {
+        if (uiColZeroFlag && ((unsigned) (mv[0] + 1) <= 2 && (unsigned) (mv[1] + 1) <= 2)) { // both co-located MV components lie in [-1, 1]
+          if (iRef[LIST_0] == 0) { // overwrite the LIST_0 MV (and caches) with zero
+            ST64 (pCurLayer->pMv[LIST_0][iMbXy][iScan4Idx], 0);
+            ST64 (pCurLayer->pMv[LIST_0][iMbXy][iScan4Idx + 4], 0);
+            ST64 (pCurLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
+            ST64 (pCurLayer->pMvd[LIST_0][iMbXy][iScan4Idx + 4], 0);
+            if (pMotionVector != NULL) {
+              ST64 (pMotionVector[LIST_0][iCacheIdx], 0);
+              ST64 (pMotionVector[LIST_0][iCacheIdx + 6], 0);
+            }
+            if (pMvdCache != NULL) {
+              ST64 (pMvdCache[LIST_0][iCacheIdx], 0);
+              ST64 (pMvdCache[LIST_0][iCacheIdx + 6], 0);
+            }
+          }
+
+          if (iRef[LIST_1] == 0) { // overwrite the LIST_1 MV (and caches) with zero
+            ST64 (pCurLayer->pMv[LIST_1][iMbXy][iScan4Idx], 0);
+            ST64 (pCurLayer->pMv[LIST_1][iMbXy][iScan4Idx + 4], 0);
+            ST64 (pCurLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
+            ST64 (pCurLayer->pMvd[LIST_1][iMbXy][iScan4Idx + 4], 0);
+            if (pMotionVector != NULL) {
+              ST64 (pMotionVector[LIST_1][iCacheIdx], 0);
+              ST64 (pMotionVector[LIST_1][iCacheIdx + 6], 0);
+            }
+            if (pMvdCache != NULL) {
+              ST64 (pMvdCache[LIST_1][iCacheIdx], 0);
+              ST64 (pMvdCache[LIST_1][iCacheIdx + 6], 0);
+            }
+          }
+        }
+      } else { // 4x4 partitions: same test, 4-byte stores
+        if (uiColZeroFlag && ((unsigned) (mv[0] + 1) <= 2 && (unsigned) (mv[1] + 1) <= 2)) {
+          if (iRef[LIST_0] == 0) {
+            ST32 (pCurLayer->pMv[LIST_0][iMbXy][iScan4Idx], 0);
+            ST32 (pCurLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
+            if (pMotionVector != NULL) {
+              ST32 (pMotionVector[LIST_0][iCacheIdx], 0);
+            }
+            if (pMvdCache != NULL) {
+              ST32 (pMvdCache[LIST_0][iCacheIdx], 0);
+            }
+          }
+          if (iRef[LIST_1] == 0) {
+            ST32 (pCurLayer->pMv[LIST_1][iMbXy][iScan4Idx], 0);
+            ST32 (pCurLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
+            if (pMotionVector != NULL) {
+              ST32 (pMotionVector[LIST_1][iCacheIdx], 0);
+            }
+            if (pMvdCache != NULL) {
+              ST32 (pMvdCache[LIST_1][iCacheIdx], 0);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void FillTemporalDirect8x8Mv (PDqLayer pCurLayer, const int16_t& iIdx8, const int8_t& iPartCount, const int8_t& iPartW,
+                              const SubMbType& subMbType, int8_t iRef[LIST_A], int16_t (*mvColoc)[2], int16_t pMotionVector[LIST_A][30][MV_A],
+                              int16_t pMvdCache[LIST_A][30][MV_A]) { // derives and stores temporal direct MVs for every partition of one 8x8 block; pMotionVector / pMvdCache are optional and skipped when NULL
+  PSlice pSlice = &pCurLayer->sLayerInfo.sSliceInLayer;
+  int32_t iMbXy = pCurLayer->iMbXyIndex;
+  int16_t pMvDirect[LIST_A][2] = { { 0, 0 }, { 0, 0 } }; // declared outside the loop, so values carry over between partitions
+  for (int32_t j = 0; j < iPartCount; j++) { // partitions start at iIdx8 and advance by iPartW
+    int8_t iPartIdx = iIdx8 + j * iPartW;
+    uint8_t iScan4Idx = g_kuiScan4[iPartIdx]; // index into the per-MB pMv / pMvd arrays
+    uint8_t iColocIdx = g_kuiScan4[iPartIdx]; // same lookup as iScan4Idx; used for the co-located arrays
+    uint8_t iCacheIdx = g_kuiCache30ScanIdx[iPartIdx]; // index into the optional 30-entry caches
+
+    int16_t* mv = mvColoc[iColocIdx]; // co-located MV for this partition
+
+    int16_t pMV[4] = { 0 };
+    if (IS_SUB_8x8 (subMbType)) { // 8-byte stores at iScan4Idx and iScan4Idx + 4 (caches: iCacheIdx and iCacheIdx + 6)
+      if (!pCurLayer->iColocIntra[iColocIdx]) { // when the co-located block is intra, pMvDirect keeps its previous value (initially zero)
+        pMvDirect[LIST_0][0] = (pSlice->iMvScale[LIST_0][iRef[LIST_0]] * mv[0] + 128) >> 8; // scale by iMvScale for the LIST_0 ref, rounding with +128 before >> 8
+        pMvDirect[LIST_0][1] = (pSlice->iMvScale[LIST_0][iRef[LIST_0]] * mv[1] + 128) >> 8;
+      }
+      ST32 (pMV, LD32 (pMvDirect[LIST_0]));
+      ST32 ((pMV + 2), LD32 (pMvDirect[LIST_0])); // pMV now holds the LIST_0 MV twice
+      ST64 (pCurLayer->pMv[LIST_0][iMbXy][iScan4Idx], LD64 (pMV));
+      ST64 (pCurLayer->pMv[LIST_0][iMbXy][iScan4Idx + 4], LD64 (pMV));
+      ST64 (pCurLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0); // MVD is always stored as zero here
+      ST64 (pCurLayer->pMvd[LIST_0][iMbXy][iScan4Idx + 4], 0);
+      if (pMotionVector != NULL) {
+        ST64 (pMotionVector[LIST_0][iCacheIdx], LD64 (pMV));
+        ST64 (pMotionVector[LIST_0][iCacheIdx + 6], LD64 (pMV));
+      }
+      if (pMvdCache != NULL) {
+        ST64 (pMvdCache[LIST_0][iCacheIdx], 0);
+        ST64 (pMvdCache[LIST_0][iCacheIdx + 6], 0);
+      }
+      if (!pCurLayer->iColocIntra[g_kuiScan4[iIdx8]]) { // NOTE(review): tests the index of iIdx8 while the other three checks use iColocIdx; the two differ once j * iPartW is non-zero - confirm intended
+        pMvDirect[LIST_1][0] = pMvDirect[LIST_0][0] - mv[0]; // LIST_1 MV = LIST_0 MV minus the co-located MV
+        pMvDirect[LIST_1][1] = pMvDirect[LIST_0][1] - mv[1];
+      }
+      ST32 (pMV, LD32 (pMvDirect[LIST_1]));
+      ST32 ((pMV + 2), LD32 (pMvDirect[LIST_1]));
+      ST64 (pCurLayer->pMv[LIST_1][iMbXy][iScan4Idx], LD64 (pMV));
+      ST64 (pCurLayer->pMv[LIST_1][iMbXy][iScan4Idx + 4], LD64 (pMV));
+      ST64 (pCurLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
+      ST64 (pCurLayer->pMvd[LIST_1][iMbXy][iScan4Idx + 4], 0);
+      if (pMotionVector != NULL) {
+        ST64 (pMotionVector[LIST_1][iCacheIdx], LD64 (pMV));
+        ST64 (pMotionVector[LIST_1][iCacheIdx + 6], LD64 (pMV));
+      }
+      if (pMvdCache != NULL) {
+        ST64 (pMvdCache[LIST_1][iCacheIdx], 0);
+        ST64 (pMvdCache[LIST_1][iCacheIdx + 6], 0);
+      }
+    } else { //SUB_4x4: same derivation, a single 4-byte store per list at iScan4Idx / iCacheIdx
+      if (!pCurLayer->iColocIntra[iColocIdx]) {
+        pMvDirect[LIST_0][0] = (pSlice->iMvScale[LIST_0][iRef[LIST_0]] * mv[0] + 128) >> 8;
+        pMvDirect[LIST_0][1] = (pSlice->iMvScale[LIST_0][iRef[LIST_0]] * mv[1] + 128) >> 8;
+      }
+      ST32 (pCurLayer->pMv[LIST_0][iMbXy][iScan4Idx], LD32 (pMvDirect[LIST_0]));
+      ST32 (pCurLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
+      if (pMotionVector != NULL) {
+        ST32 (pMotionVector[LIST_0][iCacheIdx], LD32 (pMvDirect[LIST_0]));
+      }
+      if (pMvdCache != NULL) {
+        ST32 (pMvdCache[LIST_0][iCacheIdx], 0);
+      }
+      if (!pCurLayer->iColocIntra[iColocIdx]) {
+        pMvDirect[LIST_1][0] = pMvDirect[LIST_0][0] - mv[0];
+        pMvDirect[LIST_1][1] = pMvDirect[LIST_0][1] - mv[1];
+      }
+      ST32 (pCurLayer->pMv[LIST_1][iMbXy][iScan4Idx], LD32 (pMvDirect[LIST_1]));
+      ST32 (pCurLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
+      if (pMotionVector != NULL) {
+        ST32 (pMotionVector[LIST_1][iCacheIdx], LD32 (pMvDirect[LIST_1]));
+      }
+      if (pMvdCache != NULL) {
+        ST32 (pMvdCache[LIST_1][iCacheIdx], 0);
+      }
+    }
+  }
+}
+int8_t MapColToList0 (PWelsDecoderContext& pCtx, const int8_t& colocRefIndexL0,
+                      const int32_t& ref0Count) { //ISO/IEC 14496-10:2009(E) (8-193)
+  // Returns the LIST_0 index whose iFramePoc equals that of the picture the co-located block referenced; 0 when none is found.
+  PPicture pic1 = pCtx->sRefPic.pRefList[LIST_1][0];
+  PPicture pColRef = (pic1 && colocRefIndexL0 >= 0) ? pic1->pRefPic[LIST_0][colocRefIndexL0] : NULL; // a negative index is never used as a subscript
+  for (int32_t i = 0; pColRef && i < ref0Count; i++) {
+    const PPicture pCand = pCtx->sRefPic.pRefList[LIST_0][i];
+    if (pCand && pCand->iFramePoc == pColRef->iFramePoc) { // empty list slots are skipped instead of dereferenced
+      return i;
+    }
+  }
+  return 0;
+}
+void Update8x8RefIdx (PDqLayer& pCurDqLayer, const int16_t& iPartIdx, const int32_t& listIdx, const int8_t& iRef) {
+  const int32_t kiMbIdx = pCurDqLayer->iMbXyIndex, kiFirst = g_kuiScan4[iPartIdx]; // iRef is written at scan offsets 0, 1, 4 and 5 from kiFirst
+  for (int32_t iRowOfs = 0; iRowOfs <= 4; iRowOfs += 4) {
+    pCurDqLayer->pRefIndex[listIdx][kiMbIdx][kiFirst + iRowOfs] = iRef;
+    pCurDqLayer->pRefIndex[listIdx][kiMbIdx][kiFirst + iRowOfs + 1] = iRef;
+  }
+}
 } // namespace WelsDec
--- a/codec/decoder/core/src/parse_mb_syn_cabac.cpp
+++ b/codec/decoder/core/src/parse_mb_syn_cabac.cpp
@@ -739,9 +739,9 @@
   if (IS_DIRECT (mbType)) {
 
     int16_t pMvDirect[LIST_A][2] = { { 0, 0 }, { 0, 0 } };
+    SubMbType subMbType;
     if (pSliceHeader->iDirectSpatialMvPredFlag) {
       //predict direct spatial mv
-      SubMbType subMbType;
       int32_t ret = PredMvBDirectSpatial (pCtx, pMvDirect, iRef, subMbType);
       if (ret != ERR_NONE) {
         return ret;
@@ -748,8 +748,7 @@
       }
     } else {
       //temporal direct 16x16 mode
-      ComputeColocated (pCtx);
-      int32_t ret = PredBDirectTemporal (pCtx, pMvDirect, iRef);
+      int32_t ret = PredBDirectTemporal (pCtx, pMvDirect, iRef, subMbType);
       if (ret != ERR_NONE) {
         return ret;
       }
@@ -883,6 +882,8 @@
     uint32_t uiSubMbType;
     //sub_mb_type, partition
     int16_t pMvDirect[LIST_A][2] = { {0, 0}, {0, 0} };
+    bool bIsLongRef = pCtx->sRefPic.pRefList[LIST_1][0]->bIsLongRef;
+    const int32_t ref0Count = WELS_MIN (pSliceHeader->uiRefCount[LIST_0], pCtx->sRefPic.uiRefCount[LIST_0]);
     bool has_direct_called = false;
     SubMbType directSubMbType = 0;
     for (int32_t i = 0; i < 4; i++) {
@@ -908,8 +909,7 @@
 
           } else {
             //temporal direct mode
-            ComputeColocated (pCtx);
-            int32_t ret = PredBDirectTemporal (pCtx, pMvDirect, iRef);
+            int32_t ret = PredBDirectTemporal (pCtx, pMvDirect, iRef, directSubMbType);
             if (ret != ERR_NONE) {
               return ret;
             }
@@ -926,152 +926,32 @@
       }
     }
     for (int32_t i = 0; i < 4; i++) { //Direct 8x8 Ref and mv
-
       int16_t iIdx8 = i << 2;
       if (IS_DIRECT (pCurDqLayer->pSubMbType[iMbXy][i])) {
-
-        int8_t iPartCount = pSubPartCount[i];
-        int16_t iPartIdx, iBlockW = pPartW[i];
-        uint8_t iScan4Idx, iCacheIdx, iColocIdx;
-        iCacheIdx = g_kuiCache30ScanIdx[iIdx8];
-
-        if (!pSliceHeader->iDirectSpatialMvPredFlag) {
+        if (pSliceHeader->iDirectSpatialMvPredFlag) {
+          FillSpatialDirect8x8Mv (pCurDqLayer, iIdx8, pSubPartCount[i], pPartW[i], directSubMbType, bIsLongRef, pMvDirect, iRef,
+                                  pMotionVector, pMvdCache);
+        } else {
+          int16_t (*mvColoc)[2] = pCurDqLayer->iColocMv[LIST_0];
           iRef[LIST_1] = 0;
-          if (pCurDqLayer->iColocIntra[g_kuiScan4[iIdx8]]) {
+          iRef[LIST_0] = 0;
+          const uint8_t uiColoc4Idx = g_kuiScan4[iIdx8];
+          if (!pCurDqLayer->iColocIntra[uiColoc4Idx]) {
             iRef[LIST_0] = 0;
-          } else {
-            if (pCurDqLayer->iColocRefIndex[LIST_0][iIdx8] >= 0) {
-              iRef[LIST_0] = pCurDqLayer->iColocRefIndex[LIST_0][iIdx8];
+            int8_t colocRefIndexL0 = pCurDqLayer->iColocRefIndex[LIST_0][uiColoc4Idx];
+            if (colocRefIndexL0 >= 0) {
+              iRef[LIST_0] = MapColToList0 (pCtx, colocRefIndexL0, ref0Count);
             } else {
-              iRef[LIST_0] = pCurDqLayer->iColocRefIndex[LIST_1][iIdx8];
+              mvColoc = pCurDqLayer->iColocMv[LIST_1];
             }
           }
+          Update8x8RefIdx (pCurDqLayer, iIdx8, LIST_0, iRef[LIST_0]);
+          Update8x8RefIdx (pCurDqLayer, iIdx8, LIST_1, iRef[LIST_1]);
+          UpdateP8x8RefCacheIdxCabac (pRefIndex, iIdx8, LIST_0, iRef[LIST_0]);
+          UpdateP8x8RefCacheIdxCabac (pRefIndex, iIdx8, LIST_1, iRef[LIST_1]);
+          FillTemporalDirect8x8Mv (pCurDqLayer, iIdx8, pSubPartCount[i], pPartW[i], directSubMbType, iRef, mvColoc, pMotionVector,
+                                   pMvdCache);
         }
-        for (int32_t j = 0; j < iPartCount; j++) {
-          iPartIdx = iIdx8 + j * iBlockW;
-          iColocIdx = g_kuiScan4[iPartIdx];
-          iScan4Idx = g_kuiScan4[iPartIdx];
-          iCacheIdx = g_kuiCache30ScanIdx[iPartIdx];
-
-          if (pSliceHeader->iDirectSpatialMvPredFlag) {
-            int16_t pMV[4] = { 0 };
-            if (IS_SUB_8x8 (pCurDqLayer->pSubMbType[iMbXy][i])) {
-              * (uint32_t*)pMV = * (uint32_t*)pMvDirect[LIST_0];
-              ST32 ((pMV + 2), LD32 (pMV));
-              ST64 (pCurDqLayer->pMv[LIST_0][iMbXy][iScan4Idx], LD64 (pMV));
-              ST64 (pCurDqLayer->pMv[LIST_0][iMbXy][iScan4Idx + 4], LD64 (pMV));
-              ST64 (pCurDqLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
-              ST64 (pCurDqLayer->pMvd[LIST_0][iMbXy][iScan4Idx + 4], 0);
-              ST64 (pMotionVector[LIST_0][iCacheIdx], LD64 (pMV));
-              ST64 (pMotionVector[LIST_0][iCacheIdx + 6], LD64 (pMV));
-              ST64 (pMvdCache[LIST_0][iCacheIdx], 0);
-              ST64 (pMvdCache[LIST_0][iCacheIdx + 6], 0);
-              * (uint32_t*)pMV = * (uint32_t*)pMvDirect[LIST_1];
-              ST32 ((pMV + 2), LD32 (pMV));
-              ST64 (pCurDqLayer->pMv[LIST_1][iMbXy][iScan4Idx], LD64 (pMV));
-              ST64 (pCurDqLayer->pMv[LIST_1][iMbXy][iScan4Idx + 4], LD64 (pMV));
-              ST64 (pCurDqLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
-              ST64 (pCurDqLayer->pMvd[LIST_1][iMbXy][iScan4Idx + 4], 0);
-              ST64 (pMotionVector[LIST_1][iCacheIdx], LD64 (pMV));
-              ST64 (pMotionVector[LIST_1][iCacheIdx + 6], LD64 (pMV));
-              ST64 (pMvdCache[LIST_1][iCacheIdx], 0);
-              ST64 (pMvdCache[LIST_1][iCacheIdx + 6], 0);
-            } else { //SUB_4x4
-              * (uint32_t*)pMV = * (uint32_t*)pMvDirect[LIST_0];
-              ST32 (pCurDqLayer->pMv[LIST_0][iMbXy][iScan4Idx], LD32 (pMV));
-              ST32 (pCurDqLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
-              ST32 (pMotionVector[LIST_0][iCacheIdx], LD32 (pMV));
-              ST32 (pMvdCache[LIST_0][iCacheIdx], 0);
-              * (uint32_t*)pMV = * (uint32_t*)pMvDirect[LIST_1];
-              ST32 (pCurDqLayer->pMv[LIST_1][iMbXy][iScan4Idx], LD32 (pMV));
-              ST32 (pCurDqLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
-              ST32 (pMotionVector[LIST_1][iCacheIdx], LD32 (pMV));
-              ST32 (pMvdCache[LIST_1][iCacheIdx], 0);
-            }
-
-            if ((* (int32_t*)pMvDirect[LIST_0] | * (int32_t*)pMvDirect[LIST_1])) {
-              bool bIsLongRef = pCtx->sRefPic.pRefList[LIST_1][0]->bIsLongRef;
-              uint32_t uiColZeroFlag = (0 == pCurDqLayer->iColocIntra[iColocIdx]) && !bIsLongRef &&
-                                       (pCurDqLayer->iColocRefIndex[LIST_0][iColocIdx] == 0 || (pCurDqLayer->iColocRefIndex[LIST_0][iColocIdx] < 0
-                                           && pCurDqLayer->iColocRefIndex[LIST_1][iColocIdx] == 0));
-              const int16_t (*mvColoc)[2] = pCurDqLayer->iColocRefIndex[LIST_0][iColocIdx] == 0 ? pCurDqLayer->iColocMv[LIST_0] :
-                                            pCurDqLayer->iColocMv[LIST_1];
-              const int16_t* mv = mvColoc[iColocIdx];
-              if (IS_SUB_8x8 (pCurDqLayer->pSubMbType[iMbXy][i])) {
-                if (uiColZeroFlag && ((unsigned) (mv[0] + 1) <= 2 && (unsigned) (mv[1] + 1) <= 2)) {
-                  if (iRef[LIST_0] == 0) {
-                    ST64 (pCurDqLayer->pMv[LIST_0][iMbXy][iScan4Idx], 0);
-                    ST64 (pCurDqLayer->pMv[LIST_0][iMbXy][iScan4Idx + 4], 0);
-                    ST64 (pCurDqLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
-                    ST64 (pCurDqLayer->pMvd[LIST_0][iMbXy][iScan4Idx + 4], 0);
-                    ST64 (pMotionVector[LIST_0][iCacheIdx], 0);
-                    ST64 (pMotionVector[LIST_0][iCacheIdx + 6], 0);
-                    ST64 (pMvdCache[LIST_0][iCacheIdx], 0);
-                    ST64 (pMvdCache[LIST_0][iCacheIdx + 6], 0);
-                  }
-
-                  if (iRef[LIST_1] == 0) {
-                    ST64 (pCurDqLayer->pMv[LIST_1][iMbXy][iScan4Idx], 0);
-                    ST64 (pCurDqLayer->pMv[LIST_1][iMbXy][iScan4Idx + 4], 0);
-                    ST64 (pCurDqLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
-                    ST64 (pCurDqLayer->pMvd[LIST_1][iMbXy][iScan4Idx + 4], 0);
-                    ST64 (pMotionVector[LIST_1][iCacheIdx], 0);
-                    ST64 (pMotionVector[LIST_1][iCacheIdx + 6], 0);
-                    ST64 (pMvdCache[LIST_1][iCacheIdx], 0);
-                    ST64 (pMvdCache[LIST_1][iCacheIdx + 6], 0);
-                  }
-                }
-              } else {
-                if (uiColZeroFlag && ((unsigned) (mv[0] + 1) <= 2 && (unsigned) (mv[1] + 1) <= 2)) {
-                  if (iRef[LIST_0] == 0) {
-                    ST32 (pCurDqLayer->pMv[LIST_0][iMbXy][iScan4Idx], 0);
-                    ST32 (pCurDqLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
-                    ST32 (pMotionVector[LIST_0][iCacheIdx], 0);
-                    ST32 (pMvdCache[LIST_0][iCacheIdx], 0);
-                  }
-                  if (iRef[LIST_1] == 0) {
-                    ST32 (pCurDqLayer->pMv[LIST_1][iMbXy][iScan4Idx], 0);
-                    ST32 (pCurDqLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
-                    ST32 (pMotionVector[LIST_1][iCacheIdx], 0);
-                    ST32 (pMvdCache[LIST_1][iCacheIdx], 0);
-                  }
-                }
-              }
-            }
-          } else {
-            int16_t (*mvColoc)[2] = pCurDqLayer->iColocMv[LIST_0];
-            int16_t* mv = mvColoc[iColocIdx];
-            int16_t pMV[4] = { 0 };
-            int16_t iMvp[LIST_A][2];
-            if (IS_SUB_8x8 (pCurDqLayer->pSubMbType[iMbXy][i])) {
-              iMvp[LIST_0][0] = (pSlice->iMvScale[LIST_0][iRef[LIST_0]] * mv[0] + 128) >> 8;
-              iMvp[LIST_0][1] = (pSlice->iMvScale[LIST_0][iRef[LIST_0]] * mv[1] + 128) >> 8;
-              ST32 (pMV, LD32 (iMvp[LIST_0]));
-              ST32 ((pMV + 2), LD32 (iMvp[LIST_0]));
-              ST64 (pCurDqLayer->pMv[LIST_0][iMbXy][iScan4Idx], LD64 (pMV));
-              ST64 (pCurDqLayer->pMv[LIST_0][iMbXy][iScan4Idx + 4], LD64 (pMV));
-              ST64 (pCurDqLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
-              ST64 (pCurDqLayer->pMvd[LIST_0][iMbXy][iScan4Idx + 4], 0);
-              iMvp[LIST_1][0] -= iMvp[LIST_0][0] - mv[0];
-              iMvp[LIST_1][1] -= iMvp[LIST_0][0] - mv[1];
-              ST32 (pMV, LD32 (iMvp[LIST_1]));
-              ST32 ((pMV + 2), LD32 (iMvp[LIST_1]));
-              ST64 (pCurDqLayer->pMv[LIST_1][iMbXy][iScan4Idx], LD64 (pMV));
-              ST64 (pCurDqLayer->pMv[LIST_1][iMbXy][iScan4Idx + 4], LD64 (pMV));
-              ST64 (pCurDqLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
-              ST64 (pCurDqLayer->pMvd[LIST_1][iMbXy][iScan4Idx + 4], 0);
-            } else { //SUB_4x4
-              iMvp[LIST_0][0] = (pSlice->iMvScale[LIST_0][iRef[LIST_0]] * mv[0] + 128) >> 8;
-              iMvp[LIST_0][1] = (pSlice->iMvScale[LIST_0][iRef[LIST_0]] * mv[1] + 128) >> 8;
-              ST32 (pCurDqLayer->pMv[LIST_0][iMbXy][iScan4Idx], LD32 (iMvp[LIST_0]));
-              ST32 (pCurDqLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
-              iMvp[LIST_1][0] -= iMvp[LIST_0][0] - mv[0];
-              iMvp[LIST_1][1] -= iMvp[LIST_0][0] - mv[1];
-              ST32 (pCurDqLayer->pMv[LIST_1][iMbXy][iScan4Idx], LD32 (iMvp[LIST_1]));
-              ST32 (pCurDqLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
-            }
-          }
-        }
       }
     }
     //ref no-direct
@@ -1083,18 +963,8 @@
         int8_t iref = REF_NOT_IN_LIST;
         if (IS_DIRECT (subMbType)) {
           if (pSliceHeader->iDirectSpatialMvPredFlag) {
-            iref = iRef[listIdx];
-          } else {
-            iref = 0;
-            if (listIdx == LIST_0) {
-              if (!pCurDqLayer->iColocIntra[g_kuiScan4[iIdx8]]) {
-                if (pCurDqLayer->iColocRefIndex[LIST_0][iIdx8] >= 0) {
-                  iref = pCurDqLayer->iColocRefIndex[LIST_0][iIdx8];
-                } else {
-                  iref = pCurDqLayer->iColocRefIndex[LIST_1][iIdx8];
-                }
-              }
-            }
+            Update8x8RefIdx (pCurDqLayer, iIdx8, listIdx, iRef[listIdx]);
+            ref_idx_list[listIdx][i] = iRef[listIdx];
           }
           UpdateP8x8DirectCabac (pCurDqLayer, iIdx8);
         } else {
@@ -1114,29 +984,30 @@
             pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (pCtx->sRefPic.pRefList[listIdx][iref]
                                     && pCtx->sRefPic.pRefList[listIdx][iref]->bIsComplete);
           }
+          Update8x8RefIdx (pCurDqLayer, iIdx8, listIdx, iref);
+          ref_idx_list[listIdx][i] = iref;
         }
-        UpdateP8x8RefIdxCabac (pCurDqLayer, pRefIndex, iIdx8, iref, listIdx);
-        ref_idx_list[listIdx][i] = iref;
       }
     }
     //mv
     for (int32_t listIdx = LIST_0; listIdx < LIST_A; ++listIdx) {
       for (int32_t i = 0; i < 4; i++) {
-        int8_t iPartCount = pSubPartCount[i];
-        int16_t iPartIdx, iBlockW = pPartW[i];
-        uint8_t iScan4Idx, iCacheIdx;
+        int16_t iIdx8 = i << 2;
 
-        iCacheIdx = g_kuiCache30ScanIdx[i << 2];
+        uint32_t subMbType = pCurDqLayer->pSubMbType[iMbXy][i];
+        if (IS_DIRECT (subMbType) && !pSliceHeader->iDirectSpatialMvPredFlag)
+          continue;
 
         int8_t iref = ref_idx_list[listIdx][i];
-        pRefIndex[listIdx][iCacheIdx] = pRefIndex[listIdx][iCacheIdx + 1]
-                                        = pRefIndex[listIdx][iCacheIdx + 6] = pRefIndex[listIdx][iCacheIdx + 7] = iref;
+        UpdateP8x8RefCacheIdxCabac (pRefIndex, iIdx8, listIdx, iref);
 
-        uint32_t subMbType = pCurDqLayer->pSubMbType[iMbXy][i];
-        if (IS_DIRECT (subMbType)) {
+        if (IS_DIRECT (subMbType))
           continue;
-        }
+
         bool is_dir = IS_DIR (subMbType, 0, listIdx) > 0;
+        int8_t iPartCount = pSubPartCount[i];
+        int16_t iBlockW = pPartW[i];
+        uint8_t iScan4Idx, iCacheIdx;
         for (int32_t j = 0; j < iPartCount; j++) {
           iPartIdx = (i << 2) + j * iBlockW;
           iScan4Idx = g_kuiScan4[iPartIdx];
@@ -1668,5 +1539,11 @@
   WELS_READ_VERIFY (InitReadBits (pBsAux, 1));
   WELS_READ_VERIFY (InitCabacDecEngineFromBS (pCabacDecEngine, pBsAux));
   return ERR_NONE;
+}
+void    UpdateP8x8RefCacheIdxCabac (int8_t pRefIndex[LIST_A][30], const int16_t& iPartIdx,
+                                    const int32_t& listIdx, const int8_t& iRef) {
+  const uint8_t uiCacheIdx = g_kuiCache30ScanIdx[iPartIdx]; // Purpose: store iRef in four entries of the 30-entry cache row pRefIndex[listIdx]; first slot comes from g_kuiCache30ScanIdx
+  pRefIndex[listIdx][uiCacheIdx] = pRefIndex[listIdx][uiCacheIdx + 1] = pRefIndex[listIdx][uiCacheIdx + 6] =
+                                     pRefIndex[listIdx][uiCacheIdx + 7] = iRef; // offsets 0, 1, 6, 7: presumably a 2x2 group in a cache laid out 6 entries per row - TODO confirm
 }
 }
--- a/codec/decoder/core/src/parse_mb_syn_cavlc.cpp
+++ b/codec/decoder/core/src/parse_mb_syn_cavlc.cpp
@@ -1349,9 +1349,9 @@
   if (IS_DIRECT (mbType)) {
 
     int16_t pMvDirect[LIST_A][2] = { { 0, 0 }, { 0, 0 } };
+    SubMbType subMbType;
     if (pSliceHeader->iDirectSpatialMvPredFlag) {
       //predict direct spatial mv
-      SubMbType subMbType;
       int32_t ret = PredMvBDirectSpatial (pCtx, pMvDirect, iRef, subMbType);
       if (ret != ERR_NONE) {
         return ret;
@@ -1358,8 +1358,7 @@
       }
     } else {
       //temporal direct 16x16 mode
-      ComputeColocated (pCtx);
-      int32_t ret = PredBDirectTemporal (pCtx, pMvDirect, iRef);
+      int32_t ret = PredBDirectTemporal (pCtx, pMvDirect, iRef, subMbType);
       if (ret != ERR_NONE) {
         return ret;
       }
@@ -1369,7 +1368,7 @@
       for (int32_t listIdx = LIST_0; listIdx < LIST_A; ++listIdx) {
         if (IS_DIR (mbType, 0, listIdx)) {
           WELS_READ_VERIFY (BsGetOneBit (pBs, &uiCode)); //motion_prediction_flag_l0/l1[ mbPartIdx ]
-          iMotionPredFlag[listIdx][0] = uiCode > 0;
+          iMotionPredFlag[listIdx][0] = uiCode;
         }
       }
     }
@@ -1417,7 +1416,7 @@
         for (int32_t i = 0; i < 2; ++i) {
           if (IS_DIR (mbType, i, listIdx)) {
             WELS_READ_VERIFY (BsGetOneBit (pBs, &uiCode)); //motion_prediction_flag_l0/l1[ mbPartIdx ]
-            iMotionPredFlag[listIdx][i] = uiCode > 0;
+            iMotionPredFlag[listIdx][i] = uiCode;
           }
         }
       }
@@ -1474,7 +1473,7 @@
         for (int32_t i = 0; i < 2; ++i) {
           if (IS_DIR (mbType, i, listIdx)) {
             WELS_READ_VERIFY (BsGetOneBit (pBs, &uiCode)); //motion_prediction_flag_l0/l1[ mbPartIdx ]
-            iMotionPredFlag[listIdx][i] = uiCode > 0;
+            iMotionPredFlag[listIdx][i] = uiCode;
           }
         }
       }
@@ -1529,6 +1528,8 @@
     uint32_t uiSubMbType;
     //sub_mb_type, partition
     int16_t pMvDirect[LIST_A][2] = { { 0, 0 }, { 0, 0 } };
+    bool bIsLongRef = pCtx->sRefPic.pRefList[LIST_1][0]->bIsLongRef;
+    const int32_t ref0Count = WELS_MIN (pSliceHeader->uiRefCount[LIST_0], pCtx->sRefPic.uiRefCount[LIST_0]);
     bool has_direct_called = false;
     SubMbType directSubMbType = 0;
 
@@ -1556,8 +1557,7 @@
 
           } else {
             //temporal direct mode
-            ComputeColocated (pCtx);
-            int32_t ret = PredBDirectTemporal (pCtx, pMvDirect, iRef);
+            int32_t ret = PredBDirectTemporal (pCtx, pMvDirect, iRef, directSubMbType);
             if (ret != ERR_NONE) {
               return ret;
             }
@@ -1579,7 +1579,7 @@
           bool is_dir = IS_DIR (pCurDqLayer->pSubMbType[iMbXy][i], 0, listIdx) > 0;
           if (is_dir) {
             WELS_READ_VERIFY (BsGetOneBit (pBs, &uiCode)); //motion_prediction_flag_l0[ mbPartIdx ]
-            iMotionPredFlag[listIdx][i] = uiCode > 0;
+            iMotionPredFlag[listIdx][i] = uiCode;
           }
         }
       }
@@ -1587,136 +1587,28 @@
     for (int32_t i = 0; i < 4; i++) { //Direct 8x8 Ref and mv
       int16_t iIdx8 = i << 2;
       if (IS_DIRECT (pCurDqLayer->pSubMbType[iMbXy][i])) {
-        int8_t iPartCount = pSubPartCount[i];
-        int16_t iPartIdx, iBlockW = pPartW[i];
-        uint8_t iScan4Idx, iCacheIdx, iColocIdx;
-        iCacheIdx = g_kuiCache30ScanIdx[iIdx8];
-
-        if (!pSliceHeader->iDirectSpatialMvPredFlag) {
+        if (pSliceHeader->iDirectSpatialMvPredFlag) {
+          FillSpatialDirect8x8Mv (pCurDqLayer, iIdx8, pSubPartCount[i], pPartW[i], directSubMbType, bIsLongRef, pMvDirect, iRef,
+                                  iMvArray, NULL);
+        } else {
+          int16_t (*mvColoc)[2] = pCurDqLayer->iColocMv[LIST_0];
           iRef[LIST_1] = 0;
-          if (pCurDqLayer->iColocIntra[g_kuiScan4[iIdx8]]) {
+          iRef[LIST_0] = 0;
+          const uint8_t uiColoc4Idx = g_kuiScan4[iIdx8];
+          if (!pCurDqLayer->iColocIntra[uiColoc4Idx]) {
             iRef[LIST_0] = 0;
-          } else {
-            if (pCurDqLayer->iColocRefIndex[LIST_0][iIdx8] >= 0) {
-              iRef[LIST_0] = pCurDqLayer->iColocRefIndex[LIST_0][iIdx8];
+            int8_t colocRefIndexL0 = pCurDqLayer->iColocRefIndex[LIST_0][uiColoc4Idx];
+            if (colocRefIndexL0 >= 0) {
+              iRef[LIST_0] = MapColToList0 (pCtx, colocRefIndexL0, ref0Count);
             } else {
-              iRef[LIST_0] = pCurDqLayer->iColocRefIndex[LIST_1][iIdx8];
+              mvColoc = pCurDqLayer->iColocMv[LIST_1];
             }
           }
+          Update8x8RefIdx (pCurDqLayer, iIdx8, LIST_0, iRef[LIST_0]);
+          Update8x8RefIdx (pCurDqLayer, iIdx8, LIST_1, iRef[LIST_1]);
+          FillTemporalDirect8x8Mv (pCurDqLayer, iIdx8, pSubPartCount[i], pPartW[i], directSubMbType, iRef, mvColoc, iMvArray,
+                                   NULL);
         }
-        for (int32_t j = 0; j < iPartCount; j++) {
-          iPartIdx = iIdx8 + j * iBlockW;
-          iColocIdx = g_kuiScan4[iPartIdx];
-          iScan4Idx = g_kuiScan4[iPartIdx];
-          iCacheIdx = g_kuiCache30ScanIdx[iPartIdx];
-
-          if (pSliceHeader->iDirectSpatialMvPredFlag) {
-            int16_t pMV[4] = { 0 };
-            if (IS_SUB_8x8 (pCurDqLayer->pSubMbType[iMbXy][i])) {
-              * (uint32_t*)pMV = * (uint32_t*)pMvDirect[LIST_0];
-              ST32 ((pMV + 2), LD32 (pMV));
-              ST64 (pCurDqLayer->pMv[LIST_0][iMbXy][iScan4Idx], LD64 (pMV));
-              ST64 (pCurDqLayer->pMv[LIST_0][iMbXy][iScan4Idx + 4], LD64 (pMV));
-              ST64 (pCurDqLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
-              ST64 (pCurDqLayer->pMvd[LIST_0][iMbXy][iScan4Idx + 4], 0);
-              ST64 (iMvArray[LIST_0][iCacheIdx], LD64 (pMV));
-              ST64 (iMvArray[LIST_0][iCacheIdx + 6], LD64 (pMV));
-              * (uint32_t*)pMV = * (uint32_t*)pMvDirect[LIST_1];
-              ST32 ((pMV + 2), LD32 (pMV));
-              ST64 (pCurDqLayer->pMv[LIST_1][iMbXy][iScan4Idx], LD64 (pMV));
-              ST64 (pCurDqLayer->pMv[LIST_1][iMbXy][iScan4Idx + 4], LD64 (pMV));
-              ST64 (pCurDqLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
-              ST64 (pCurDqLayer->pMvd[LIST_1][iMbXy][iScan4Idx + 4], 0);
-              ST64 (iMvArray[LIST_1][iCacheIdx], LD64 (pMV));
-              ST64 (iMvArray[LIST_1][iCacheIdx + 6], LD64 (pMV));
-            } else { //SUB_4x4
-              * (uint32_t*)pMV = * (uint32_t*)pMvDirect[LIST_0];
-              ST32 (pCurDqLayer->pMv[LIST_0][iMbXy][iScan4Idx], LD32 (pMV));
-              ST32 (pCurDqLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
-              ST32 (iMvArray[LIST_0][iCacheIdx], LD32 (pMV));
-              * (uint32_t*)pMV = * (uint32_t*)pMvDirect[LIST_1];
-              ST32 (pCurDqLayer->pMv[LIST_1][iMbXy][iScan4Idx], LD32 (pMV));
-              ST32 (pCurDqLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
-              ST32 (iMvArray[LIST_1][iCacheIdx], LD32 (pMV));
-            }
-
-            if ((* (int32_t*)pMvDirect[LIST_0] | * (int32_t*)pMvDirect[LIST_1])) {
-              bool bIsLongRef = pCtx->sRefPic.pRefList[LIST_1][0]->bIsLongRef;
-              uint32_t uiColZeroFlag = (0 == pCurDqLayer->iColocIntra[iColocIdx]) && !bIsLongRef &&
-                                       (pCurDqLayer->iColocRefIndex[LIST_0][iColocIdx] == 0 || (pCurDqLayer->iColocRefIndex[LIST_0][iColocIdx] < 0
-                                           && pCurDqLayer->iColocRefIndex[LIST_1][iColocIdx] == 0));
-              const int16_t (*mvColoc)[2] = pCurDqLayer->iColocRefIndex[LIST_0][iColocIdx] == 0 ? pCurDqLayer->iColocMv[LIST_0] :
-                                            pCurDqLayer->iColocMv[LIST_1];
-              const int16_t* mv = mvColoc[iColocIdx];
-              if (IS_SUB_8x8 (pCurDqLayer->pSubMbType[iMbXy][i])) {
-                if (uiColZeroFlag && ((unsigned) (mv[0] + 1) <= 2 && (unsigned) (mv[1] + 1) <= 2)) {
-                  if (iRef[LIST_0] == 0) {
-                    ST64 (pCurDqLayer->pMv[LIST_0][iMbXy][iScan4Idx], 0);
-                    ST64 (pCurDqLayer->pMv[LIST_0][iMbXy][iScan4Idx + 4], 0);
-                    ST64 (pCurDqLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
-                    ST64 (pCurDqLayer->pMvd[LIST_0][iMbXy][iScan4Idx + 4], 0);
-                    ST64 (iMvArray[LIST_0][iCacheIdx], 0);
-                    ST64 (iMvArray[LIST_0][iCacheIdx + 6], 0);
-                  }
-
-                  if (iRef[LIST_1] == 0) {
-                    ST64 (pCurDqLayer->pMv[LIST_1][iMbXy][iScan4Idx], 0);
-                    ST64 (pCurDqLayer->pMv[LIST_1][iMbXy][iScan4Idx + 4], 0);
-                    ST64 (pCurDqLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
-                    ST64 (pCurDqLayer->pMvd[LIST_1][iMbXy][iScan4Idx + 4], 0);
-                    ST64 (iMvArray[LIST_1][iCacheIdx], 0);
-                    ST64 (iMvArray[LIST_1][iCacheIdx + 6], 0);
-                  }
-                }
-              } else {
-                if (uiColZeroFlag && ((unsigned) (mv[0] + 1) <= 2 && (unsigned) (mv[1] + 1) <= 2)) {
-                  if (iRef[LIST_0] == 0) {
-                    ST32 (pCurDqLayer->pMv[LIST_0][iMbXy][iScan4Idx], 0);
-                    ST32 (pCurDqLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
-                    ST32 (iMvArray[LIST_0][iCacheIdx], 0);
-                  }
-                  if (iRef[LIST_1] == 0) {
-                    ST32 (pCurDqLayer->pMv[LIST_1][iMbXy][iScan4Idx], 0);
-                    ST32 (pCurDqLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
-                    ST32 (iMvArray[LIST_1][iCacheIdx], 0);
-                  }
-                }
-              }
-            }
-          } else {
-            int16_t (*mvColoc)[2] = pCurDqLayer->iColocMv[LIST_0];
-            int16_t* mv = mvColoc[iColocIdx];
-            int16_t pMV[4] = { 0 };
-            int16_t iMvp[LIST_A][2];
-            if (IS_SUB_8x8 (pCurDqLayer->pSubMbType[iMbXy][i])) {
-              iMvp[LIST_0][0] = (pSlice->iMvScale[LIST_0][iRef[LIST_0]] * mv[0] + 128) >> 8;
-              iMvp[LIST_0][1] = (pSlice->iMvScale[LIST_0][iRef[LIST_0]] * mv[1] + 128) >> 8;
-              ST32 (pMV, LD32 (iMvp[LIST_0]));
-              ST32 ((pMV + 2), LD32 (iMvp[LIST_0]));
-              ST64 (pCurDqLayer->pMv[LIST_0][iMbXy][iScan4Idx], LD64 (pMV));
-              ST64 (pCurDqLayer->pMv[LIST_0][iMbXy][iScan4Idx + 4], LD64 (pMV));
-              ST64 (pCurDqLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
-              ST64 (pCurDqLayer->pMvd[LIST_0][iMbXy][iScan4Idx + 4], 0);
-              iMvp[LIST_1][0] -= iMvp[LIST_0][0] - mv[0];
-              iMvp[LIST_1][1] -= iMvp[LIST_0][0] - mv[1];
-              ST32 (pMV, LD32 (iMvp[LIST_1]));
-              ST32 ((pMV + 2), LD32 (iMvp[LIST_1]));
-              ST64 (pCurDqLayer->pMv[LIST_1][iMbXy][iScan4Idx], LD64 (pMV));
-              ST64 (pCurDqLayer->pMv[LIST_1][iMbXy][iScan4Idx + 4], LD64 (pMV));
-              ST64 (pCurDqLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
-              ST64 (pCurDqLayer->pMvd[LIST_1][iMbXy][iScan4Idx + 4], 0);
-            } else { //SUB_4x4
-              iMvp[LIST_0][0] = (pSlice->iMvScale[LIST_0][iRef[LIST_0]] * mv[0] + 128) >> 8;
-              iMvp[LIST_0][1] = (pSlice->iMvScale[LIST_0][iRef[LIST_0]] * mv[1] + 128) >> 8;
-              ST32 (pCurDqLayer->pMv[LIST_0][iMbXy][iScan4Idx], LD32 (iMvp[LIST_0]));
-              ST32 (pCurDqLayer->pMvd[LIST_0][iMbXy][iScan4Idx], 0);
-              iMvp[LIST_1][0] -= iMvp[LIST_0][0] - mv[0];
-              iMvp[LIST_1][1] -= iMvp[LIST_0][0] - mv[1];
-              ST32 (pCurDqLayer->pMv[LIST_1][iMbXy][iScan4Idx], LD32 (iMvp[LIST_1]));
-              ST32 (pCurDqLayer->pMvd[LIST_1][iMbXy][iScan4Idx], 0);
-            }
-          }
-        }
       }
     }
     //ref no-direct
@@ -1723,23 +1615,12 @@
     for (int32_t listIdx = LIST_0; listIdx < LIST_A; ++listIdx) {
       for (int32_t i = 0; i < 4; i++) {
         int16_t iIdx8 = i << 2;
-        uint8_t uiScan4Idx = g_kuiScan4[iIdx8];
         int32_t subMbType = pCurDqLayer->pSubMbType[iMbXy][i];
         int8_t iref = REF_NOT_IN_LIST;
         if (IS_DIRECT (subMbType)) {
           if (pSliceHeader->iDirectSpatialMvPredFlag) {
-            iref = iRef[listIdx];
-          } else {
-            iref = 0;
-            if (listIdx == LIST_0) {
-              if (!pCurDqLayer->iColocIntra[g_kuiScan4[iIdx8]]) {
-                if (pCurDqLayer->iColocRefIndex[LIST_0][iIdx8] >= 0) {
-                  iref = pCurDqLayer->iColocRefIndex[LIST_0][iIdx8];
-                } else {
-                  iref = pCurDqLayer->iColocRefIndex[LIST_1][iIdx8];
-                }
-              }
-            }
+            Update8x8RefIdx (pCurDqLayer, iIdx8, listIdx, iRef[listIdx]);
+            ref_idx_list[listIdx][i] = iRef[listIdx];
           }
         } else {
           if (IS_DIR (subMbType, 0, listIdx)) {
@@ -1762,10 +1643,9 @@
               return GENERATE_ERROR_NO (ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
             }
           }
+          Update8x8RefIdx (pCurDqLayer, iIdx8, listIdx, iref);
+          ref_idx_list[listIdx][i] = iref;
         }
-        pCurDqLayer->pRefIndex[listIdx][iMbXy][uiScan4Idx] = pCurDqLayer->pRefIndex[listIdx][iMbXy][uiScan4Idx + 1] =
-              pCurDqLayer->pRefIndex[listIdx][iMbXy][uiScan4Idx + 4] = pCurDqLayer->pRefIndex[listIdx][iMbXy][uiScan4Idx + 5] = iref;
-        ref_idx_list[listIdx][i] = iref;
       }
     }
     //mv
--- a/codec/decoder/plus/src/welsDecoderExt.cpp
+++ b/codec/decoder/plus/src/welsDecoderExt.cpp
@@ -744,6 +744,13 @@
 
 DECODING_STATE CWelsDecoder::ReorderPicturesInDisplay (unsigned char** ppDst, SBufferInfo* pDstInfo) {
   if (pDstInfo->iBufferStatus == 1 && m_pDecContext->pSps->uiProfileIdc != 66) {
+    //A non-reference (disposable) B_FRAME must be released and must not be buffered, because its buffer could be overwritten by the next reference picture.
+    if (m_pDecContext->pSliceHeader->eSliceType == B_SLICE && !m_pDecContext->pSliceHeader->bIsRefPic) {
+      if (m_pDecContext->pSliceHeader->iPicOrderCntLsb - m_LastWrittenPOC <= 2) {
+        m_LastWrittenPOC = m_pDecContext->pSliceHeader->iPicOrderCntLsb;
+        return dsErrorFree;
+      }
+    }
     if (m_pDecContext->pSliceHeader->iPicOrderCntLsb == 0) {
       if (m_iNumOfPicts > 0) {
         m_iLastGOPRemainPicts = m_iNumOfPicts;