shithub: openh264

Download patch

ref: dac13639c9404333300f0962f2fe1dc53ae5dbc1
parent: 15b7bc22c32881d2a4fcc95a39aa8a4bc82ecf26
parent: e9916c7592efd5dde18895fd35b28016ea3d0127
author: huili2 <huili2@cisco.com>
date: Tue Apr 21 05:42:49 EDT 2015

Merge pull request #1908 from HaiboZhu/Add_Highprofile

Add high profile support

--- a/codec/common/inc/wels_common_defs.h
+++ b/codec/common/inc/wels_common_defs.h
@@ -65,9 +65,12 @@
 extern const uint8_t g_kuiCache30ScanIdx[16];
 extern const uint8_t g_kuiCache48CountScan4Idx[24];
 
+extern const uint8_t g_kuiMatrixV[6][8][8];
+
 extern const uint8_t g_kuiDequantScaling4x4Default[2][16];
 extern const uint8_t g_kuiDequantScaling8x8Default[2][64];
-extern const  ALIGNED_DECLARE (uint16_t, g_kuiDequantCoeff[52][8], 16);
+extern const ALIGNED_DECLARE (uint16_t, g_kuiDequantCoeff[52][8], 16);
+extern const ALIGNED_DECLARE (uint16_t, g_kuiDequantCoeff8x8[52][64], 16);
 extern const uint8_t g_kuiChromaQpTable[52];
 
 extern const uint8_t g_kuiCabacRangeLps[64][4];
@@ -279,6 +282,8 @@
 #define MB_TYPE_INTRA     (MB_TYPE_INTRA4x4 | MB_TYPE_INTRA16x16 | MB_TYPE_INTRA8x8 | MB_TYPE_INTRA_PCM)
 #define MB_TYPE_INTER     (MB_TYPE_16x16 | MB_TYPE_16x8 | MB_TYPE_8x16 | MB_TYPE_8x8 | MB_TYPE_8x8_REF0 | MB_TYPE_SKIP)
 #define IS_INTRA4x4(type) ( MB_TYPE_INTRA4x4 == (type) )
+#define IS_INTRA8x8(type) ( MB_TYPE_INTRA8x8 == (type) )
+#define IS_INTRANxN(type) ( MB_TYPE_INTRA4x4 == (type) || MB_TYPE_INTRA8x8 == (type) )
 #define IS_INTRA16x16(type) ( MB_TYPE_INTRA16x16 == (type) )
 #define IS_INTRA(type) ( (type)&MB_TYPE_INTRA )
 #define IS_INTER(type) ( (type)&MB_TYPE_INTER )
@@ -304,6 +309,7 @@
 #define I16_PRED_DC_128  6
 #define I16_PRED_DC_A  7
 //////////intra4x4   Luma
+// I8x8 also uses these definitions
 #define I4_PRED_INVALID    0
 #define I4_PRED_V        0
 #define I4_PRED_H        1
--- a/codec/common/src/common_tables.cpp
+++ b/codec/common/src/common_tables.cpp
@@ -37,12 +37,12 @@
 
 //////pNonZeroCount[16+8] mapping scan index
 const uint8_t g_kuiMbCountScan4Idx[24] = {
-  //  0   1 | 4  5      luma 8*8 block           pNonZeroCount[16+8]
+                   //  0   1 | 4  5      luma 8*8 block           pNonZeroCount[16+8]
   0,  1,  4,  5,   //  2   3 | 6  7        0  |  1                  0   1   2   3
   2,  3,  6,  7,   //---------------      ---------                 4   5   6   7
   8,  9, 12, 13,   //  8   9 | 12 13       2  |  3                  8   9  10  11
-  10, 11, 14, 15,   // 10  11 | 14 15-----------------------------> 12  13  14  15
-  16, 17, 20, 21,   //----------------    chroma 8*8 block          16  17  18  19
+  10, 11, 14, 15,  // 10  11 | 14 15-----------------------------> 12  13  14  15
+  16, 17, 20, 21,  //----------------    chroma 8*8 block          16  17  18  19
   18, 19, 22, 23   // 16  17 | 20 21        0    1                 20  21  22  23
 };
 
@@ -61,6 +61,68 @@
   46, 47,			// 6+5*8, 7+5*8,
 };
 
+const uint8_t g_kuiMatrixV[6][8][8] = { // generated from equation 8-317, 8-318
+  {
+    {20, 19, 25, 19, 20, 19, 25, 19},
+    {19, 18, 24, 18, 19, 18, 24, 18},
+    {25, 24, 32, 24, 25, 24, 32, 24},
+    {19, 18, 24, 18, 19, 18, 24, 18},
+    {20, 19, 25, 19, 20, 19, 25, 19},
+    {19, 18, 24, 18, 19, 18, 24, 18},
+    {25, 24, 32, 24, 25, 24, 32, 24},
+    {19, 18, 24, 18, 19, 18, 24, 18}
+  },
+  {
+    {22, 21, 28, 21, 22, 21, 28, 21},
+    {21, 19, 26, 19, 21, 19, 26, 19},
+    {28, 26, 35, 26, 28, 26, 35, 26},
+    {21, 19, 26, 19, 21, 19, 26, 19},
+    {22, 21, 28, 21, 22, 21, 28, 21},
+    {21, 19, 26, 19, 21, 19, 26, 19},
+    {28, 26, 35, 26, 28, 26, 35, 26},
+    {21, 19, 26, 19, 21, 19, 26, 19}
+  },
+  {
+    {26, 24, 33, 24, 26, 24, 33, 24},
+    {24, 23, 31, 23, 24, 23, 31, 23},
+    {33, 31, 42, 31, 33, 31, 42, 31},
+    {24, 23, 31, 23, 24, 23, 31, 23},
+    {26, 24, 33, 24, 26, 24, 33, 24},
+    {24, 23, 31, 23, 24, 23, 31, 23},
+    {33, 31, 42, 31, 33, 31, 42, 31},
+    {24, 23, 31, 23, 24, 23, 31, 23}
+  },
+  {
+    {28, 26, 35, 26, 28, 26, 35, 26},
+    {26, 25, 33, 25, 26, 25, 33, 25},
+    {35, 33, 45, 33, 35, 33, 45, 33},
+    {26, 25, 33, 25, 26, 25, 33, 25},
+    {28, 26, 35, 26, 28, 26, 35, 26},
+    {26, 25, 33, 25, 26, 25, 33, 25},
+    {35, 33, 45, 33, 35, 33, 45, 33},
+    {26, 25, 33, 25, 26, 25, 33, 25}
+  },
+  {
+    {32, 30, 40, 30, 32, 30, 40, 30},
+    {30, 28, 38, 28, 30, 28, 38, 28},
+    {40, 38, 51, 38, 40, 38, 51, 38},
+    {30, 28, 38, 28, 30, 28, 38, 28},
+    {32, 30, 40, 30, 32, 30, 40, 30},
+    {30, 28, 38, 28, 30, 28, 38, 28},
+    {40, 38, 51, 38, 40, 38, 51, 38},
+    {30, 28, 38, 28, 30, 28, 38, 28}
+  },
+  {
+    {36, 34, 46, 34, 36, 34, 46, 34},
+    {34, 32, 43, 32, 34, 32, 43, 32},
+    {46, 43, 58, 43, 46, 43, 58, 43},
+    {34, 32, 43, 32, 34, 32, 43, 32},
+    {36, 34, 46, 34, 36, 34, 46, 34},
+    {34, 32, 43, 32, 34, 32, 43, 32},
+    {46, 43, 58, 43, 46, 43, 58, 43},
+    {34, 32, 43, 32, 34, 32, 43, 32}
+  }
+};
 
 //cache element equal to 30
 const uint8_t g_kuiCache30ScanIdx[16] = { //mv or uiRefIndex cache scan index, 4*4 block as basic unit
@@ -170,6 +232,113 @@
   /*46*/{ 2048, 2560, 2048, 2560, 2560, 3200, 2560, 3200 },	/*47*/{ 2304, 2944, 2304, 2944, 2944, 3712, 2944, 3712 },
   /*48*/{ 2560, 3328, 2560, 3328, 3328, 4096, 3328, 4096 },	/*49*/{ 2816, 3584, 2816, 3584, 3584, 4608, 3584, 4608 },
   /*50*/{ 3328, 4096, 3328, 4096, 4096, 5120, 4096, 5120 },	/*51*/{ 3584, 4608, 3584, 4608, 4608, 5888, 4608, 5888 },
+};
+
+ALIGNED_DECLARE (const uint16_t, g_kuiDequantCoeff8x8[52][64], 16) = {
+/* QP ==  0 */
+{ 320,  304,  400,  304,  320,  304,  400,  304,  304,  288,  384,  288,  304,  288,  384,  288,  400,  384,  512,  384,  400,  384,  512,  384,  304,  288,  384,  288,  304,  288,  384,  288,  320,  304,  400,  304,  320,  304,  400,  304,  304,  288,  384,  288,  304,  288,  384,  288,  400,  384,  512,  384,  400,  384,  512,  384,  304,  288,  384,  288,  304,  288,  384,  288 },
+/* QP ==  1 */
+{ 352,  336,  448,  336,  352,  336,  448,  336,  336,  304,  416,  304,  336,  304,  416,  304,  448,  416,  560,  416,  448,  416,  560,  416,  336,  304,  416,  304,  336,  304,  416,  304,  352,  336,  448,  336,  352,  336,  448,  336,  336,  304,  416,  304,  336,  304,  416,  304,  448,  416,  560,  416,  448,  416,  560,  416,  336,  304,  416,  304,  336,  304,  416,  304 },
+/* QP ==  2 */
+{ 416,  384,  528,  384,  416,  384,  528,  384,  384,  368,  496,  368,  384,  368,  496,  368,  528,  496,  672,  496,  528,  496,  672,  496,  384,  368,  496,  368,  384,  368,  496,  368,  416,  384,  528,  384,  416,  384,  528,  384,  384,  368,  496,  368,  384,  368,  496,  368,  528,  496,  672,  496,  528,  496,  672,  496,  384,  368,  496,  368,  384,  368,  496,  368 },
+/* QP ==  3 */
+{ 448,  416,  560,  416,  448,  416,  560,  416,  416,  400,  528,  400,  416,  400,  528,  400,  560,  528,  720,  528,  560,  528,  720,  528,  416,  400,  528,  400,  416,  400,  528,  400,  448,  416,  560,  416,  448,  416,  560,  416,  416,  400,  528,  400,  416,  400,  528,  400,  560,  528,  720,  528,  560,  528,  720,  528,  416,  400,  528,  400,  416,  400,  528,  400 },
+/* QP ==  4 */
+{ 512,  480,  640,  480,  512,  480,  640,  480,  480,  448,  608,  448,  480,  448,  608,  448,  640,  608,  816,  608,  640,  608,  816,  608,  480,  448,  608,  448,  480,  448,  608,  448,  512,  480,  640,  480,  512,  480,  640,  480,  480,  448,  608,  448,  480,  448,  608,  448,  640,  608,  816,  608,  640,  608,  816,  608,  480,  448,  608,  448,  480,  448,  608,  448 },
+/* QP ==  5 */
+{ 576,  544,  736,  544,  576,  544,  736,  544,  544,  512,  688,  512,  544,  512,  688,  512,  736,  688,  928,  688,  736,  688,  928,  688,  544,  512,  688,  512,  544,  512,  688,  512,  576,  544,  736,  544,  576,  544,  736,  544,  544,  512,  688,  512,  544,  512,  688,  512,  736,  688,  928,  688,  736,  688,  928,  688,  544,  512,  688,  512,  544,  512,  688,  512 },
+/* QP ==  6 */
+{ 320,  304,  400,  304,  320,  304,  400,  304,  304,  288,  384,  288,  304,  288,  384,  288,  400,  384,  512,  384,  400,  384,  512,  384,  304,  288,  384,  288,  304,  288,  384,  288,  320,  304,  400,  304,  320,  304,  400,  304,  304,  288,  384,  288,  304,  288,  384,  288,  400,  384,  512,  384,  400,  384,  512,  384,  304,  288,  384,  288,  304,  288,  384,  288 },
+/* QP ==  7 */
+{ 352,  336,  448,  336,  352,  336,  448,  336,  336,  304,  416,  304,  336,  304,  416,  304,  448,  416,  560,  416,  448,  416,  560,  416,  336,  304,  416,  304,  336,  304,  416,  304,  352,  336,  448,  336,  352,  336,  448,  336,  336,  304,  416,  304,  336,  304,  416,  304,  448,  416,  560,  416,  448,  416,  560,  416,  336,  304,  416,  304,  336,  304,  416,  304 },
+/* QP ==  8 */
+{ 416,  384,  528,  384,  416,  384,  528,  384,  384,  368,  496,  368,  384,  368,  496,  368,  528,  496,  672,  496,  528,  496,  672,  496,  384,  368,  496,  368,  384,  368,  496,  368,  416,  384,  528,  384,  416,  384,  528,  384,  384,  368,  496,  368,  384,  368,  496,  368,  528,  496,  672,  496,  528,  496,  672,  496,  384,  368,  496,  368,  384,  368,  496,  368 },
+/* QP ==  9 */
+{ 448,  416,  560,  416,  448,  416,  560,  416,  416,  400,  528,  400,  416,  400,  528,  400,  560,  528,  720,  528,  560,  528,  720,  528,  416,  400,  528,  400,  416,  400,  528,  400,  448,  416,  560,  416,  448,  416,  560,  416,  416,  400,  528,  400,  416,  400,  528,  400,  560,  528,  720,  528,  560,  528,  720,  528,  416,  400,  528,  400,  416,  400,  528,  400 },
+/* QP == 10 */
+{ 512,  480,  640,  480,  512,  480,  640,  480,  480,  448,  608,  448,  480,  448,  608,  448,  640,  608,  816,  608,  640,  608,  816,  608,  480,  448,  608,  448,  480,  448,  608,  448,  512,  480,  640,  480,  512,  480,  640,  480,  480,  448,  608,  448,  480,  448,  608,  448,  640,  608,  816,  608,  640,  608,  816,  608,  480,  448,  608,  448,  480,  448,  608,  448 },
+/* QP == 11 */
+{ 576,  544,  736,  544,  576,  544,  736,  544,  544,  512,  688,  512,  544,  512,  688,  512,  736,  688,  928,  688,  736,  688,  928,  688,  544,  512,  688,  512,  544,  512,  688,  512,  576,  544,  736,  544,  576,  544,  736,  544,  544,  512,  688,  512,  544,  512,  688,  512,  736,  688,  928,  688,  736,  688,  928,  688,  544,  512,  688,  512,  544,  512,  688,  512 },
+/* QP == 12 */
+{ 320,  304,  400,  304,  320,  304,  400,  304,  304,  288,  384,  288,  304,  288,  384,  288,  400,  384,  512,  384,  400,  384,  512,  384,  304,  288,  384,  288,  304,  288,  384,  288,  320,  304,  400,  304,  320,  304,  400,  304,  304,  288,  384,  288,  304,  288,  384,  288,  400,  384,  512,  384,  400,  384,  512,  384,  304,  288,  384,  288,  304,  288,  384,  288 },
+/* QP == 13 */
+{ 352,  336,  448,  336,  352,  336,  448,  336,  336,  304,  416,  304,  336,  304,  416,  304,  448,  416,  560,  416,  448,  416,  560,  416,  336,  304,  416,  304,  336,  304,  416,  304,  352,  336,  448,  336,  352,  336,  448,  336,  336,  304,  416,  304,  336,  304,  416,  304,  448,  416,  560,  416,  448,  416,  560,  416,  336,  304,  416,  304,  336,  304,  416,  304 },
+/* QP == 14 */
+{ 416,  384,  528,  384,  416,  384,  528,  384,  384,  368,  496,  368,  384,  368,  496,  368,  528,  496,  672,  496,  528,  496,  672,  496,  384,  368,  496,  368,  384,  368,  496,  368,  416,  384,  528,  384,  416,  384,  528,  384,  384,  368,  496,  368,  384,  368,  496,  368,  528,  496,  672,  496,  528,  496,  672,  496,  384,  368,  496,  368,  384,  368,  496,  368 },
+/* QP == 15 */
+{ 448,  416,  560,  416,  448,  416,  560,  416,  416,  400,  528,  400,  416,  400,  528,  400,  560,  528,  720,  528,  560,  528,  720,  528,  416,  400,  528,  400,  416,  400,  528,  400,  448,  416,  560,  416,  448,  416,  560,  416,  416,  400,  528,  400,  416,  400,  528,  400,  560,  528,  720,  528,  560,  528,  720,  528,  416,  400,  528,  400,  416,  400,  528,  400 },
+/* QP == 16 */
+{ 512,  480,  640,  480,  512,  480,  640,  480,  480,  448,  608,  448,  480,  448,  608,  448,  640,  608,  816,  608,  640,  608,  816,  608,  480,  448,  608,  448,  480,  448,  608,  448,  512,  480,  640,  480,  512,  480,  640,  480,  480,  448,  608,  448,  480,  448,  608,  448,  640,  608,  816,  608,  640,  608,  816,  608,  480,  448,  608,  448,  480,  448,  608,  448 },
+/* QP == 17 */
+{ 576,  544,  736,  544,  576,  544,  736,  544,  544,  512,  688,  512,  544,  512,  688,  512,  736,  688,  928,  688,  736,  688,  928,  688,  544,  512,  688,  512,  544,  512,  688,  512,  576,  544,  736,  544,  576,  544,  736,  544,  544,  512,  688,  512,  544,  512,  688,  512,  736,  688,  928,  688,  736,  688,  928,  688,  544,  512,  688,  512,  544,  512,  688,  512 },
+/* QP == 18 */
+{ 320,  304,  400,  304,  320,  304,  400,  304,  304,  288,  384,  288,  304,  288,  384,  288,  400,  384,  512,  384,  400,  384,  512,  384,  304,  288,  384,  288,  304,  288,  384,  288,  320,  304,  400,  304,  320,  304,  400,  304,  304,  288,  384,  288,  304,  288,  384,  288,  400,  384,  512,  384,  400,  384,  512,  384,  304,  288,  384,  288,  304,  288,  384,  288 },
+/* QP == 19 */
+{ 352,  336,  448,  336,  352,  336,  448,  336,  336,  304,  416,  304,  336,  304,  416,  304,  448,  416,  560,  416,  448,  416,  560,  416,  336,  304,  416,  304,  336,  304,  416,  304,  352,  336,  448,  336,  352,  336,  448,  336,  336,  304,  416,  304,  336,  304,  416,  304,  448,  416,  560,  416,  448,  416,  560,  416,  336,  304,  416,  304,  336,  304,  416,  304 },
+/* QP == 20 */
+{ 416,  384,  528,  384,  416,  384,  528,  384,  384,  368,  496,  368,  384,  368,  496,  368,  528,  496,  672,  496,  528,  496,  672,  496,  384,  368,  496,  368,  384,  368,  496,  368,  416,  384,  528,  384,  416,  384,  528,  384,  384,  368,  496,  368,  384,  368,  496,  368,  528,  496,  672,  496,  528,  496,  672,  496,  384,  368,  496,  368,  384,  368,  496,  368 },
+/* QP == 21 */
+{ 448,  416,  560,  416,  448,  416,  560,  416,  416,  400,  528,  400,  416,  400,  528,  400,  560,  528,  720,  528,  560,  528,  720,  528,  416,  400,  528,  400,  416,  400,  528,  400,  448,  416,  560,  416,  448,  416,  560,  416,  416,  400,  528,  400,  416,  400,  528,  400,  560,  528,  720,  528,  560,  528,  720,  528,  416,  400,  528,  400,  416,  400,  528,  400 },
+/* QP == 22 */
+{ 512,  480,  640,  480,  512,  480,  640,  480,  480,  448,  608,  448,  480,  448,  608,  448,  640,  608,  816,  608,  640,  608,  816,  608,  480,  448,  608,  448,  480,  448,  608,  448,  512,  480,  640,  480,  512,  480,  640,  480,  480,  448,  608,  448,  480,  448,  608,  448,  640,  608,  816,  608,  640,  608,  816,  608,  480,  448,  608,  448,  480,  448,  608,  448 },
+/* QP == 23 */
+{ 576,  544,  736,  544,  576,  544,  736,  544,  544,  512,  688,  512,  544,  512,  688,  512,  736,  688,  928,  688,  736,  688,  928,  688,  544,  512,  688,  512,  544,  512,  688,  512,  576,  544,  736,  544,  576,  544,  736,  544,  544,  512,  688,  512,  544,  512,  688,  512,  736,  688,  928,  688,  736,  688,  928,  688,  544,  512,  688,  512,  544,  512,  688,  512 },
+/* QP == 24 */
+{ 320,  304,  400,  304,  320,  304,  400,  304,  304,  288,  384,  288,  304,  288,  384,  288,  400,  384,  512,  384,  400,  384,  512,  384,  304,  288,  384,  288,  304,  288,  384,  288,  320,  304,  400,  304,  320,  304,  400,  304,  304,  288,  384,  288,  304,  288,  384,  288,  400,  384,  512,  384,  400,  384,  512,  384,  304,  288,  384,  288,  304,  288,  384,  288 },
+/* QP == 25 */
+{ 352,  336,  448,  336,  352,  336,  448,  336,  336,  304,  416,  304,  336,  304,  416,  304,  448,  416,  560,  416,  448,  416,  560,  416,  336,  304,  416,  304,  336,  304,  416,  304,  352,  336,  448,  336,  352,  336,  448,  336,  336,  304,  416,  304,  336,  304,  416,  304,  448,  416,  560,  416,  448,  416,  560,  416,  336,  304,  416,  304,  336,  304,  416,  304 },
+/* QP == 26 */
+{ 416,  384,  528,  384,  416,  384,  528,  384,  384,  368,  496,  368,  384,  368,  496,  368,  528,  496,  672,  496,  528,  496,  672,  496,  384,  368,  496,  368,  384,  368,  496,  368,  416,  384,  528,  384,  416,  384,  528,  384,  384,  368,  496,  368,  384,  368,  496,  368,  528,  496,  672,  496,  528,  496,  672,  496,  384,  368,  496,  368,  384,  368,  496,  368 },
+/* QP == 27 */
+{ 448,  416,  560,  416,  448,  416,  560,  416,  416,  400,  528,  400,  416,  400,  528,  400,  560,  528,  720,  528,  560,  528,  720,  528,  416,  400,  528,  400,  416,  400,  528,  400,  448,  416,  560,  416,  448,  416,  560,  416,  416,  400,  528,  400,  416,  400,  528,  400,  560,  528,  720,  528,  560,  528,  720,  528,  416,  400,  528,  400,  416,  400,  528,  400 },
+/* QP == 28 */
+{ 512,  480,  640,  480,  512,  480,  640,  480,  480,  448,  608,  448,  480,  448,  608,  448,  640,  608,  816,  608,  640,  608,  816,  608,  480,  448,  608,  448,  480,  448,  608,  448,  512,  480,  640,  480,  512,  480,  640,  480,  480,  448,  608,  448,  480,  448,  608,  448,  640,  608,  816,  608,  640,  608,  816,  608,  480,  448,  608,  448,  480,  448,  608,  448 },
+/* QP == 29 */
+{ 576,  544,  736,  544,  576,  544,  736,  544,  544,  512,  688,  512,  544,  512,  688,  512,  736,  688,  928,  688,  736,  688,  928,  688,  544,  512,  688,  512,  544,  512,  688,  512,  576,  544,  736,  544,  576,  544,  736,  544,  544,  512,  688,  512,  544,  512,  688,  512,  736,  688,  928,  688,  736,  688,  928,  688,  544,  512,  688,  512,  544,  512,  688,  512 },
+/* QP == 30 */
+{ 320,  304,  400,  304,  320,  304,  400,  304,  304,  288,  384,  288,  304,  288,  384,  288,  400,  384,  512,  384,  400,  384,  512,  384,  304,  288,  384,  288,  304,  288,  384,  288,  320,  304,  400,  304,  320,  304,  400,  304,  304,  288,  384,  288,  304,  288,  384,  288,  400,  384,  512,  384,  400,  384,  512,  384,  304,  288,  384,  288,  304,  288,  384,  288 },
+/* QP == 31 */
+{ 352,  336,  448,  336,  352,  336,  448,  336,  336,  304,  416,  304,  336,  304,  416,  304,  448,  416,  560,  416,  448,  416,  560,  416,  336,  304,  416,  304,  336,  304,  416,  304,  352,  336,  448,  336,  352,  336,  448,  336,  336,  304,  416,  304,  336,  304,  416,  304,  448,  416,  560,  416,  448,  416,  560,  416,  336,  304,  416,  304,  336,  304,  416,  304 },
+/* QP == 32 */
+{ 416,  384,  528,  384,  416,  384,  528,  384,  384,  368,  496,  368,  384,  368,  496,  368,  528,  496,  672,  496,  528,  496,  672,  496,  384,  368,  496,  368,  384,  368,  496,  368,  416,  384,  528,  384,  416,  384,  528,  384,  384,  368,  496,  368,  384,  368,  496,  368,  528,  496,  672,  496,  528,  496,  672,  496,  384,  368,  496,  368,  384,  368,  496,  368 },
+/* QP == 33 */
+{ 448,  416,  560,  416,  448,  416,  560,  416,  416,  400,  528,  400,  416,  400,  528,  400,  560,  528,  720,  528,  560,  528,  720,  528,  416,  400,  528,  400,  416,  400,  528,  400,  448,  416,  560,  416,  448,  416,  560,  416,  416,  400,  528,  400,  416,  400,  528,  400,  560,  528,  720,  528,  560,  528,  720,  528,  416,  400,  528,  400,  416,  400,  528,  400 },
+/* QP == 34 */
+{ 512,  480,  640,  480,  512,  480,  640,  480,  480,  448,  608,  448,  480,  448,  608,  448,  640,  608,  816,  608,  640,  608,  816,  608,  480,  448,  608,  448,  480,  448,  608,  448,  512,  480,  640,  480,  512,  480,  640,  480,  480,  448,  608,  448,  480,  448,  608,  448,  640,  608,  816,  608,  640,  608,  816,  608,  480,  448,  608,  448,  480,  448,  608,  448 },
+/* QP == 35 */
+{ 576,  544,  736,  544,  576,  544,  736,  544,  544,  512,  688,  512,  544,  512,  688,  512,  736,  688,  928,  688,  736,  688,  928,  688,  544,  512,  688,  512,  544,  512,  688,  512,  576,  544,  736,  544,  576,  544,  736,  544,  544,  512,  688,  512,  544,  512,  688,  512,  736,  688,  928,  688,  736,  688,  928,  688,  544,  512,  688,  512,  544,  512,  688,  512 },
+/* QP == 36 */
+{ 320,  304,  400,  304,  320,  304,  400,  304,  304,  288,  384,  288,  304,  288,  384,  288,  400,  384,  512,  384,  400,  384,  512,  384,  304,  288,  384,  288,  304,  288,  384,  288,  320,  304,  400,  304,  320,  304,  400,  304,  304,  288,  384,  288,  304,  288,  384,  288,  400,  384,  512,  384,  400,  384,  512,  384,  304,  288,  384,  288,  304,  288,  384,  288 },
+/* QP == 37 */
+{ 352,  336,  448,  336,  352,  336,  448,  336,  336,  304,  416,  304,  336,  304,  416,  304,  448,  416,  560,  416,  448,  416,  560,  416,  336,  304,  416,  304,  336,  304,  416,  304,  352,  336,  448,  336,  352,  336,  448,  336,  336,  304,  416,  304,  336,  304,  416,  304,  448,  416,  560,  416,  448,  416,  560,  416,  336,  304,  416,  304,  336,  304,  416,  304 },
+/* QP == 38 */
+{ 416,  384,  528,  384,  416,  384,  528,  384,  384,  368,  496,  368,  384,  368,  496,  368,  528,  496,  672,  496,  528,  496,  672,  496,  384,  368,  496,  368,  384,  368,  496,  368,  416,  384,  528,  384,  416,  384,  528,  384,  384,  368,  496,  368,  384,  368,  496,  368,  528,  496,  672,  496,  528,  496,  672,  496,  384,  368,  496,  368,  384,  368,  496,  368 },
+/* QP == 39 */
+{ 448,  416,  560,  416,  448,  416,  560,  416,  416,  400,  528,  400,  416,  400,  528,  400,  560,  528,  720,  528,  560,  528,  720,  528,  416,  400,  528,  400,  416,  400,  528,  400,  448,  416,  560,  416,  448,  416,  560,  416,  416,  400,  528,  400,  416,  400,  528,  400,  560,  528,  720,  528,  560,  528,  720,  528,  416,  400,  528,  400,  416,  400,  528,  400 },
+/* QP == 40 */
+{ 512,  480,  640,  480,  512,  480,  640,  480,  480,  448,  608,  448,  480,  448,  608,  448,  640,  608,  816,  608,  640,  608,  816,  608,  480,  448,  608,  448,  480,  448,  608,  448,  512,  480,  640,  480,  512,  480,  640,  480,  480,  448,  608,  448,  480,  448,  608,  448,  640,  608,  816,  608,  640,  608,  816,  608,  480,  448,  608,  448,  480,  448,  608,  448 },
+/* QP == 41 */
+{ 576,  544,  736,  544,  576,  544,  736,  544,  544,  512,  688,  512,  544,  512,  688,  512,  736,  688,  928,  688,  736,  688,  928,  688,  544,  512,  688,  512,  544,  512,  688,  512,  576,  544,  736,  544,  576,  544,  736,  544,  544,  512,  688,  512,  544,  512,  688,  512,  736,  688,  928,  688,  736,  688,  928,  688,  544,  512,  688,  512,  544,  512,  688,  512 },
+/* QP == 42 */
+{ 320,  304,  400,  304,  320,  304,  400,  304,  304,  288,  384,  288,  304,  288,  384,  288,  400,  384,  512,  384,  400,  384,  512,  384,  304,  288,  384,  288,  304,  288,  384,  288,  320,  304,  400,  304,  320,  304,  400,  304,  304,  288,  384,  288,  304,  288,  384,  288,  400,  384,  512,  384,  400,  384,  512,  384,  304,  288,  384,  288,  304,  288,  384,  288 },
+/* QP == 43 */
+{ 352,  336,  448,  336,  352,  336,  448,  336,  336,  304,  416,  304,  336,  304,  416,  304,  448,  416,  560,  416,  448,  416,  560,  416,  336,  304,  416,  304,  336,  304,  416,  304,  352,  336,  448,  336,  352,  336,  448,  336,  336,  304,  416,  304,  336,  304,  416,  304,  448,  416,  560,  416,  448,  416,  560,  416,  336,  304,  416,  304,  336,  304,  416,  304 },
+/* QP == 44 */
+{ 416,  384,  528,  384,  416,  384,  528,  384,  384,  368,  496,  368,  384,  368,  496,  368,  528,  496,  672,  496,  528,  496,  672,  496,  384,  368,  496,  368,  384,  368,  496,  368,  416,  384,  528,  384,  416,  384,  528,  384,  384,  368,  496,  368,  384,  368,  496,  368,  528,  496,  672,  496,  528,  496,  672,  496,  384,  368,  496,  368,  384,  368,  496,  368 },
+/* QP == 45 */
+{ 448,  416,  560,  416,  448,  416,  560,  416,  416,  400,  528,  400,  416,  400,  528,  400,  560,  528,  720,  528,  560,  528,  720,  528,  416,  400,  528,  400,  416,  400,  528,  400,  448,  416,  560,  416,  448,  416,  560,  416,  416,  400,  528,  400,  416,  400,  528,  400,  560,  528,  720,  528,  560,  528,  720,  528,  416,  400,  528,  400,  416,  400,  528,  400 },
+/* QP == 46 */
+{ 512,  480,  640,  480,  512,  480,  640,  480,  480,  448,  608,  448,  480,  448,  608,  448,  640,  608,  816,  608,  640,  608,  816,  608,  480,  448,  608,  448,  480,  448,  608,  448,  512,  480,  640,  480,  512,  480,  640,  480,  480,  448,  608,  448,  480,  448,  608,  448,  640,  608,  816,  608,  640,  608,  816,  608,  480,  448,  608,  448,  480,  448,  608,  448 },
+/* QP == 47 */
+{ 576,  544,  736,  544,  576,  544,  736,  544,  544,  512,  688,  512,  544,  512,  688,  512,  736,  688,  928,  688,  736,  688,  928,  688,  544,  512,  688,  512,  544,  512,  688,  512,  576,  544,  736,  544,  576,  544,  736,  544,  544,  512,  688,  512,  544,  512,  688,  512,  736,  688,  928,  688,  736,  688,  928,  688,  544,  512,  688,  512,  544,  512,  688,  512 },
+/* QP == 48 */
+{ 320,  304,  400,  304,  320,  304,  400,  304,  304,  288,  384,  288,  304,  288,  384,  288,  400,  384,  512,  384,  400,  384,  512,  384,  304,  288,  384,  288,  304,  288,  384,  288,  320,  304,  400,  304,  320,  304,  400,  304,  304,  288,  384,  288,  304,  288,  384,  288,  400,  384,  512,  384,  400,  384,  512,  384,  304,  288,  384,  288,  304,  288,  384,  288 },
+/* QP == 49 */
+{ 352,  336,  448,  336,  352,  336,  448,  336,  336,  304,  416,  304,  336,  304,  416,  304,  448,  416,  560,  416,  448,  416,  560,  416,  336,  304,  416,  304,  336,  304,  416,  304,  352,  336,  448,  336,  352,  336,  448,  336,  336,  304,  416,  304,  336,  304,  416,  304,  448,  416,  560,  416,  448,  416,  560,  416,  336,  304,  416,  304,  336,  304,  416,  304 },
+/* QP == 50 */
+{ 416,  384,  528,  384,  416,  384,  528,  384,  384,  368,  496,  368,  384,  368,  496,  368,  528,  496,  672,  496,  528,  496,  672,  496,  384,  368,  496,  368,  384,  368,  496,  368,  416,  384,  528,  384,  416,  384,  528,  384,  384,  368,  496,  368,  384,  368,  496,  368,  528,  496,  672,  496,  528,  496,  672,  496,  384,  368,  496,  368,  384,  368,  496,  368 },
+/* QP == 51 */
+{ 448,  416,  560,  416,  448,  416,  560,  416,  416,  400,  528,  400,  416,  400,  528,  400,  560,  528,  720,  528,  560,  528,  720,  528,  416,  400,  528,  400,  416,  400,  528,  400,  448,  416,  560,  416,  448,  416,  560,  416,  416,  400,  528,  400,  416,  400,  528,  400,  560,  528,  720,  528,  560,  528,  720,  528,  416,  400,  528,  400,  416,  400,  528,  400 },
 };
 
 // table A-1 - Level limits
--- a/codec/decoder/core/inc/dec_frame.h
+++ b/codec/decoder/core/inc/dec_frame.h
@@ -68,10 +68,12 @@
   int16_t	(*pMv[LIST_A])[MB_BLOCK4x4_NUM][MV_A];
   int16_t	(*pMvd[LIST_A])[MB_BLOCK4x4_NUM][MV_A];
   int8_t	(*pRefIndex[LIST_A])[MB_BLOCK4x4_NUM];
+  bool*    pNoSubMbPartSizeLessThan8x8Flag;
+  bool*    pTransformSize8x8Flag;
   int8_t*  pLumaQp;
   int8_t  (*pChromaQp)[2];
   int8_t*  pCbp;
-  uint8_t *pCbfDc;
+  uint16_t *pCbfDc;
   int8_t (*pNzc)[24];
   int8_t (*pNzcRs)[24];
   int8_t*  pResidualPredFlag;
@@ -81,6 +83,7 @@
   int16_t (*pScaledTCoeff)[MB_COEFF_LIST_SIZE];
   int8_t (*pIntraPredMode)[8];  //0~3 top4x4 ; 4~6 left 4x4; 7 intra16x16
   int8_t (*pIntra4x4FinalMode)[MB_BLOCK4x4_NUM];
+  uint8_t  *pIntraNxNAvailFlag;
   int8_t*  pChromaPredMode;
   //uint8_t (*motion_pred_flag[LIST_A])[MB_PARTITION_SIZE]; // 8x8
   int8_t (*pSubMbType)[MB_SUB_PARTITION_SIZE];
@@ -132,7 +135,6 @@
   int8_t*					pCbp;
   int8_t	(*pNzc)[24];
   int8_t	(*pIntraPredMode)[8];     //0~3 top4x4 ; 4~6 left 4x4; 7 intra16x16
-
   int32_t					iMbX;
   int32_t					iMbY;
   int32_t					iMbXyIndex;
--- a/codec/decoder/core/inc/dec_golomb.h
+++ b/codec/decoder/core/inc/dec_golomb.h
@@ -232,6 +232,18 @@
 
   return 0;
 }
+
+/*
+ *      Check whether there is more rbsp data for processing
+ */
+static inline bool CheckMoreRBSPData(PBitStringAux pBsAux) {
+  // Yields true when iBits, less 8x (pCurBuf - pStartBuf - 2), less iLeftBits, exceeds 1.
+  // NOTE(review): presumably that quantity is the count of unread bits -- confirm.
+  return (pBsAux->iBits
+          - ((pBsAux->pCurBuf - pBsAux->pStartBuf - 2) << 3)
+          - pBsAux->iLeftBits) > 1;
+}
+
 //define macros to check syntax elements
 #define WELS_CHECK_SE_BOTH_ERROR(val, lower_bound, upper_bound, syntax_name, ret_code) do {\
 if ((val < lower_bound) || (val > upper_bound)) {\
--- a/codec/decoder/core/inc/decode_mb_aux.h
+++ b/codec/decoder/core/inc/decode_mb_aux.h
@@ -39,6 +39,7 @@
 namespace WelsDec {
 
 void IdctResAddPred_c (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
+void IdctResAddPred8x8_c (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
 
 #if defined(__cplusplus)
 extern "C" {
--- a/codec/decoder/core/inc/decoder_context.h
+++ b/codec/decoder/core/inc/decoder_context.h
@@ -91,8 +91,15 @@
 #define NEW_CTX_OFFSET_LAST 166
 #define NEW_CTX_OFFSET_ONE 227
 #define NEW_CTX_OFFSET_ABS 232
+#define NEW_CTX_OFFSET_TS_8x8_FLAG 399
 #define CTX_NUM_MVD 7
 #define CTX_NUM_CBP 4
+// See Table 9-34 on page 270 of the H.264 specification
+#define NEW_CTX_OFFSET_TRANSFORM_SIZE_8X8_FLAG  399
+#define NEW_CTX_OFFSET_MAP_8x8  402
+#define NEW_CTX_OFFSET_LAST_8x8 417
+#define NEW_CTX_OFFSET_ONE_8x8  426
+#define NEW_CTX_OFFSET_ABS_8x8  431 // TODO(review): the source of this offset is unclear -- locate its definition
 
 typedef struct TagDataBuffer {
   uint8_t* pHead;
@@ -131,6 +138,8 @@
 typedef void (*PExpandPictureFunc) (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicWidth,
                                     const int32_t kiPicHeight);
 
+typedef void (*PGetIntraPred8x8Func) (uint8_t* pPred, const int32_t kiLumaStride, bool bTLAvail, bool bTRAvail);
+
 /**/
 typedef struct TagRefPic {
   PPicture			pRefList[LIST_A][MAX_REF_PIC_COUNT];	// reference picture marking plus FIFO scheme
@@ -262,15 +271,18 @@
     int16_t*  pMbType[LAYER_NUM_EXCHANGEABLE];                      /* mb type */
     int16_t	(*pMv[LAYER_NUM_EXCHANGEABLE][LIST_A])[MB_BLOCK4x4_NUM][MV_A]; //[LAYER_NUM_EXCHANGEABLE   MB_BLOCK4x4_NUM*]
     int8_t	(*pRefIndex[LAYER_NUM_EXCHANGEABLE][LIST_A])[MB_BLOCK4x4_NUM];
+    bool*   pNoSubMbPartSizeLessThan8x8Flag[LAYER_NUM_EXCHANGEABLE];
+    bool*   pTransformSize8x8Flag[LAYER_NUM_EXCHANGEABLE];
     int8_t*	pLumaQp[LAYER_NUM_EXCHANGEABLE];	/*mb luma_qp*/
     int8_t	(*pChromaQp[LAYER_NUM_EXCHANGEABLE])[2];					/*mb chroma_qp*/
     int16_t	(*pMvd[LAYER_NUM_EXCHANGEABLE][LIST_A])[MB_BLOCK4x4_NUM][MV_A]; //[LAYER_NUM_EXCHANGEABLE   MB_BLOCK4x4_NUM*]
-    uint8_t* pCbfDc[LAYER_NUM_EXCHANGEABLE];
+    uint16_t* pCbfDc[LAYER_NUM_EXCHANGEABLE];
     int8_t	(*pNzc[LAYER_NUM_EXCHANGEABLE])[24];
     int8_t	(*pNzcRs[LAYER_NUM_EXCHANGEABLE])[24];
     int16_t (*pScaledTCoeff[LAYER_NUM_EXCHANGEABLE])[MB_COEFF_LIST_SIZE]; /*need be aligned*/
     int8_t	(*pIntraPredMode[LAYER_NUM_EXCHANGEABLE])[8]; //0~3 top4x4 ; 4~6 left 4x4; 7 intra16x16
     int8_t (*pIntra4x4FinalMode[LAYER_NUM_EXCHANGEABLE])[MB_BLOCK4x4_NUM];
+    uint8_t*  pIntraNxNAvailFlag[LAYER_NUM_EXCHANGEABLE];
     int8_t*  pChromaPredMode[LAYER_NUM_EXCHANGEABLE];
     int8_t*  pCbp[LAYER_NUM_EXCHANGEABLE];
     uint8_t (*pMotionPredFlag[LAYER_NUM_EXCHANGEABLE][LIST_A])[MB_PARTITION_SIZE]; // 8x8
@@ -284,7 +296,6 @@
     uint32_t iMbHeight;
   } sMb;
 
-
 // reconstruction picture
   PPicture			pDec;			//pointer to current picture being reconstructed
 
@@ -381,6 +392,9 @@
   PGetIntraPredFunc pGetIChromaPredFunc[7];		// h264_predict_8x8_t
   PIdctResAddPredFunc	pIdctResAddPredFunc;
   SMcFunc				sMcFunc;
+  //Transform8x8
+  PGetIntraPred8x8Func pGetI8x8LumaPredFunc[14];
+  PIdctResAddPredFunc	pIdctResAddPredFunc8x8;
 
 //For error concealment
   SCopyFunc sCopyFunc;
@@ -395,8 +409,8 @@
   int32_t iCurSeqIntervalMaxPicWidth;
   int32_t iCurSeqIntervalMaxPicHeight;
 
-  PWelsFillNeighborMbInfoIntra4x4Func  pFillInfoCacheIntra4x4Func;
-  PWelsMapNeighToSample pMap4x4NeighToSampleFunc;
+  PWelsFillNeighborMbInfoIntra4x4Func  pFillInfoCacheIntraNxNFunc;
+  PWelsMapNeighToSample pMapNxNNeighToSampleFunc;
   PWelsMap16NeighToSample pMap16x16NeighToSampleFunc;
 
 //feedback whether or not have VCL in current AU, and the temporal ID
--- a/codec/decoder/core/inc/get_intra_predictor.h
+++ b/codec/decoder/core/inc/get_intra_predictor.h
@@ -60,6 +60,21 @@
 void WelsI4x4LumaPredHU_c (uint8_t* pPred, const int32_t kiStride);
 void WelsI4x4LumaPredHD_c (uint8_t* pPred, const int32_t kiStride);
 
+void WelsI8x8LumaPredV_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail);
+void WelsI8x8LumaPredH_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail);
+void WelsI8x8LumaPredDc_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail);
+void WelsI8x8LumaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail);
+void WelsI8x8LumaPredDcTop_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail);
+void WelsI8x8LumaPredDcNA_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail);
+void WelsI8x8LumaPredDDL_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail);
+void WelsI8x8LumaPredDDLTop_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail);
+void WelsI8x8LumaPredDDR_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail);
+void WelsI8x8LumaPredVL_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail);
+void WelsI8x8LumaPredVLTop_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail);
+void WelsI8x8LumaPredVR_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail);
+void WelsI8x8LumaPredHU_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail);
+void WelsI8x8LumaPredHD_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail);
+
 void WelsIChromaPredV_c (uint8_t* pPred, const int32_t kiStride);
 void WelsIChromaPredH_c (uint8_t* pPred, const int32_t kiStride);
 void WelsIChromaPredPlane_c (uint8_t* pPred, const int32_t kiStride);
@@ -95,8 +110,6 @@
 void WelsDecoderIChromaPredV_mmx (uint8_t* pPred, const int32_t kiStride);
 void WelsDecoderIChromaPredDcLeft_mmx (uint8_t* pPred, const int32_t kiStride);
 void WelsDecoderIChromaPredDcNA_mmx (uint8_t* pPred, const int32_t kiStride);
-
-
 
 void WelsDecoderI4x4LumaPredH_sse2 (uint8_t* pPred, const int32_t kiStride);
 void WelsDecoderI4x4LumaPredDDR_mmx (uint8_t* pPred, const int32_t kiStride);
--- a/codec/decoder/core/inc/parameter_sets.h
+++ b/codec/decoder/core/inc/parameter_sets.h
@@ -167,12 +167,16 @@
   bool		bRedundantPicCntPresentFlag;
   bool		bWeightedPredFlag;
   uint8_t		uiWeightedBipredIdc;
-  bool bTransform_8x8_mode_flag;
+
+  bool    bTransform8x8ModeFlag;
   //Add for scalinglist support
   bool    bPicScalingMatrixPresentFlag;
   bool    bPicScalingListPresentFlag[12];
   uint8_t  iScalingList4x4[6][16];
   uint8_t  iScalingList8x8[6][64];
+
+  int32_t iSecondChromaQPIndexOffset; //second_chroma_qp_index_offset
+
 } SPps, *PPps;
 
 } // namespace WelsDec
--- a/codec/decoder/core/inc/parse_mb_syn_cabac.h
+++ b/codec/decoder/core/inc/parse_mb_syn_cabac.h
@@ -46,6 +46,7 @@
 int32_t ParseSkipFlagCabac (PWelsDecoderContext pCtx, PWelsNeighAvail pNeighAvail, uint32_t& uiSkip);
 int32_t ParseMBTypeISliceCabac (PWelsDecoderContext pCtx, PWelsNeighAvail pNeighAvail, uint32_t& uiBinVal);
 int32_t ParseMBTypePSliceCabac (PWelsDecoderContext pCtx, PWelsNeighAvail pNeighAvail, uint32_t& uiBinVal);
+int32_t ParseTransformSize8x8FlagCabac (PWelsDecoderContext pCtx, PWelsNeighAvail pNeighAvail, bool& bTransformSize8x8Flag);
 int32_t ParseSubMBTypeCabac (PWelsDecoderContext pCtx, PWelsNeighAvail pNeighAvail, uint32_t& uiSubMbType);
 int32_t ParseIntraPredModeLumaCabac (PWelsDecoderContext pCtx, int32_t& iBinVal);
 int32_t ParseIntraPredModeChromaCabac (PWelsDecoderContext pCtx, uint8_t uiNeighAvail, int32_t& iBinVal);
@@ -64,6 +65,9 @@
                                   uint32_t& uiBinVal);
 int32_t ParseSignificantCoeffCabac (int32_t* significant, int32_t iResProperty, PWelsDecoderContext pCtx);
 int32_t ParseResidualBlockCabac (PWelsNeighAvail pNeighAvail, uint8_t* pNonZeroCountCache, SBitStringAux* pBsAux,
+                                 int32_t index, int32_t iMaxNumCoeff, const uint8_t* pScanTable, int32_t iResProperty, int16_t* sTCoeff, uint8_t uiQp,
+                                 PWelsDecoderContext pCtx);
+int32_t ParseResidualBlockCabac8x8 (PWelsNeighAvail pNeighAvail, uint8_t* pNonZeroCountCache, SBitStringAux* pBsAux,
                                  int32_t index, int32_t iMaxNumCoeff, const uint8_t* pScanTable, int32_t iResProperty, int16_t* sTCoeff, uint8_t uiQp,
                                  PWelsDecoderContext pCtx);
 int32_t ParseIPCMInfoCabac (PWelsDecoderContext pCtx);
--- a/codec/decoder/core/inc/parse_mb_syn_cavlc.h
+++ b/codec/decoder/core/inc/parse_mb_syn_cavlc.h
@@ -53,9 +53,9 @@
 
 void GetNeighborAvailMbType (PWelsNeighAvail pNeighAvail, PDqLayer pCurLayer);
 void WelsFillCacheNonZeroCount (PWelsNeighAvail pNeighAvail, uint8_t* pNonZeroCount, PDqLayer pCurLayer);
-void WelsFillCacheConstrain0Intra4x4 (PWelsNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode,
+void WelsFillCacheConstrain0IntraNxN (PWelsNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode,
                                       PDqLayer pCurLayer);
-void WelsFillCacheConstrain1Intra4x4 (PWelsNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode,
+void WelsFillCacheConstrain1IntraNxN (PWelsNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode,
                                       PDqLayer pCurLayer);
 void WelsFillCacheInterCabac (PWelsNeighAvail pNeighAvail, uint8_t* pNonZeroCount,
                          int16_t iMvArray[LIST_A][30][MV_A], int16_t iMvdCache[LIST_A][30][MV_A], int8_t iRefIdxArray[LIST_A][30], PDqLayer pCurLayer);
@@ -64,29 +64,29 @@
 
 /*!
  * \brief   check iPredMode for intra16x16 eligible or not
- * \param 	input : current iPredMode
- * \param 	output: 0 indicating decoding correctly; -1 means error occurence
+ * \param   input : current iPredMode
+ * \param   output: 0 indicating decoding correctly; -1 means error occurrence
  */
 int32_t CheckIntra16x16PredMode (uint8_t uiSampleAvail, int8_t* pMode);
 
 /*!
- * \brief   check iPredMode for intra4x4 eligible or not
- * \param 	input : current iPredMode
- * \param 	output: 0 indicating decoding correctly; -1 means error occurence
+ * \brief   check iPredMode for intraNxN eligible or not
+ * \param   input : current iPredMode
+ * \param   output: 0 indicating decoding correctly; -1 means error occurrence
  */
-int32_t CheckIntra4x4PredMode (int32_t* pSampleAvail, int8_t* pMode, int32_t iIndex);
+int32_t CheckIntraNxNPredMode (int32_t* pSampleAvail, int8_t* pMode, int32_t iIndex, bool b8x8);
 
 /*!
  * \brief   check iPredMode for chroma eligible or not
- * \param 	input : current iPredMode
- * \param 	output: 0 indicating decoding correctly; -1 means error occurence
+ * \param   input : current iPredMode
+ * \param   output: 0 indicating decoding correctly; -1 means error occurrence
  */
 int32_t CheckIntraChromaPredMode (uint8_t uiSampleAvail, int8_t* pMode);
 
 /*!
  * \brief   predict the mode of intra4x4
- * \param 	input : current intra4x4 block index
- * \param 	output: mode index
+ * \param   input : current intra4x4 block index
+ * \param   output: mode index
  */
 int32_t PredIntra4x4Mode (int8_t* pIntraPredMode, int32_t iIdx4);
 
@@ -107,10 +107,25 @@
                                 uint8_t uiQp,
                                 PWelsDecoderContext pCtx);
 
+// Transform8x8
+int32_t WelsResidualBlockCavlc8x8 (SVlcTable* pVlcTable,
+                                uint8_t* pNonZeroCountCache,
+                                PBitStringAux pBs,
+                                /*int16_t* coeff_level,*/
+                                int32_t iIndex,
+                                int32_t iMaxNumCoeff,
+                                const uint8_t* kpZigzagTable,
+                                int32_t iResidualProperty,
+                                /*short *tCoeffLevel,*/
+                                int16_t* pTCoeff,
+                                int32_t  iIdx4x4,
+                                uint8_t uiQp,
+                                PWelsDecoderContext pCtx);
+
 /*!
  * \brief   parsing inter info (including ref_index and pMvd)
- * \param 	input : decoding context, current mb, bit-stream
- * \param 	output: 0 indicating decoding correctly; -1 means error
+ * \param   input : decoding context, current mb, bit-stream
+ * \param   output: 0 indicating decoding correctly; -1 means error
  */
 int32_t ParseInterInfo (PWelsDecoderContext pCtx, int16_t iMvArray[LIST_A][30][MV_A], int8_t iRefIdxArray[LIST_A][30],
                         PBitStringAux pBs);
--- a/codec/decoder/core/inc/rec_mb.h
+++ b/codec/decoder/core/inc/rec_mb.h
@@ -78,6 +78,10 @@
 
 int32_t RecI4x4Chroma (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer);
 
+int32_t RecI8x8Mb (int32_t iMbXy, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer);
+
+int32_t RecI8x8Luma (int32_t iMbXy, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer);
+
 int32_t RecI16x16Mb (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer);
 
 int32_t RecChroma (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer);
--- a/codec/decoder/core/inc/wels_common_basis.h
+++ b/codec/decoder/core/inc/wels_common_basis.h
@@ -72,16 +72,19 @@
 #define LUMA_DC_AC   3
 #define CHROMA_DC    4
 #define CHROMA_AC    5
-#define CHROMA_DC_U  6
-#define CHROMA_DC_V  7
-#define CHROMA_AC_U  8
-#define CHROMA_AC_V  9
-#define LUMA_DC_AC_INTRA 10
-#define LUMA_DC_AC_INTER 11
-#define CHROMA_DC_U_INTER  12
-#define CHROMA_DC_V_INTER  13
-#define CHROMA_AC_U_INTER  14
-#define CHROMA_AC_V_INTER  15
+#define LUMA_DC_AC_8  6
+#define CHROMA_DC_U  7
+#define CHROMA_DC_V  8
+#define CHROMA_AC_U  9
+#define CHROMA_AC_V  10
+#define LUMA_DC_AC_INTRA 11
+#define LUMA_DC_AC_INTER 12
+#define CHROMA_DC_U_INTER  13
+#define CHROMA_DC_V_INTER  14
+#define CHROMA_AC_U_INTER  15
+#define CHROMA_AC_V_INTER  16
+#define LUMA_DC_AC_INTRA_8  17
+#define LUMA_DC_AC_INTER_8  18
 
 #define SHIFT_BUFFER(pBitsCache)	{	pBitsCache->pBuf+=2; pBitsCache->uiRemainBits += 16; pBitsCache->uiCache32Bit |= (((pBitsCache->pBuf[2] << 8) | pBitsCache->pBuf[3]) << (32 - pBitsCache->uiRemainBits));	}
 #define POP_BUFFER(pBitsCache, iCount)	{ pBitsCache->uiCache32Bit <<= iCount;	pBitsCache->uiRemainBits -= iCount;	}
@@ -93,7 +96,39 @@
     7, 11, 14, 15,
 };
 
+static const uint8_t g_kuiZigzagScan8x8[64] = { //8x8 block residual zig-zag scan order
+    0,  1,  8,  16, 9,  2,  3,  10,
+    17, 24, 32, 25, 18, 11, 4,  5,
+    12, 19, 26, 33, 40, 48, 41, 34,
+    27, 20, 13, 6,  7,  14, 21, 28,
+    35, 42, 49, 56, 57, 50, 43, 36,
+    29, 22, 15, 23, 30, 37, 44, 51,
+    58, 59, 52, 45, 38, 31, 39, 46,
+    53, 60, 61, 54, 47, 55, 62, 63,
+};
 
+static const uint8_t g_kuiIdx2CtxSignificantCoeffFlag8x8[64] = {  // Table 9-43, Page 289
+    0,  1,  2,  3,  4,  5,  5,  4,
+    4,  3,  3,  4,  4,  4,  5,  5,
+    4,  4,  4,  4,  3,  3,  6,  7,
+    7,  7,  8,  9, 10,  9,  8,  7,
+    7,  6, 11, 12, 13, 11,  6,  7,
+    8,  9, 14, 10,  9,  8,  6, 11,
+    12, 13, 11, 6,  9, 14, 10,  9,
+    11, 12, 13, 11 ,14, 10, 12, 14,
+};
+
+static const uint8_t g_kuiIdx2CtxLastSignificantCoeffFlag8x8[64] = { // Table 9-43, Page 289
+    0,  1,  1,  1,  1,  1,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1,
+    2,  2,  2,  2,  2,  2,  2,  2,
+    2,  2,  2,  2,  2,  2,  2,  2,
+    3,  3,  3,  3,  3,  3,  3,  3,
+    4,  4,  4,  4,  4,  4,  4,  4,
+    5,  5,  5,  5,  6,  6,  6,  6,
+    7,  7,  7,  7,  8,  8,  8,  8,
+};
+
 static inline void GetMbResProperty(int32_t * pMBproperty,int32_t* pResidualProperty,bool bCavlc)
 {
  switch(*pResidualProperty)
@@ -142,8 +177,17 @@
 	  break;
  case CHROMA_AC_V_INTER:
 	  *pMBproperty = 5;
-	  *pResidualProperty =  bCavlc ?CHROMA_AC:CHROMA_AC_V;
+	  *pResidualProperty =  bCavlc ? CHROMA_AC : CHROMA_AC_V;
 	  break;
+    // Reference to Table 7-2
+ case LUMA_DC_AC_INTRA_8:
+    *pMBproperty = 6;
+    *pResidualProperty = LUMA_DC_AC_8;
+    break;
+ case LUMA_DC_AC_INTER_8:
+    *pMBproperty = 7;
+    *pResidualProperty = LUMA_DC_AC_8;
+    break;
  }
   }
 
--- a/codec/decoder/core/src/au_parser.cpp
+++ b/codec/decoder/core/src/au_parser.cpp
@@ -124,6 +124,7 @@
       uiBsZero = pSrcRbsp[iIndex];
       if (0 == uiBsZero) {
         --iNalSize;
+        ++ (*pConsumedBytes);
         --iIndex;
       } else {
         break;
@@ -991,16 +992,10 @@
     WELS_READ_VERIFY (BsGetOneBit (pBs, &uiCode)); //seq_scaling_matrix_present_flag
     pSps->bSeqScalingMatrixPresentFlag	= !!uiCode;
 
-    if (pSps->bSeqScalingMatrixPresentFlag)// For high profile, it is not used in current application. FIXME
-
+    if (pSps->bSeqScalingMatrixPresentFlag) {
       WELS_READ_VERIFY (ParseScalingList (pSps, pBs, 0, pSps->bSeqScalingListPresentFlag, pSps->iScalingList4x4,
                                           pSps->iScalingList8x8));
-    //if exist, to parse scalinglist matrix value
-
-    //  WelsLog (& (pCtx->sLogCtx), WELS_LOG_WARNING,
-    //         "ParseSps(): seq_scaling_matrix_present_flag (%d). Feature not supported.",
-    //       pSps->bSeqScalingMatrixPresentFlag);
-    //return GENERATE_ERROR_NO (ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_NON_BASELINE);
+    }
   }
   WELS_READ_VERIFY (BsGetUe (pBs, &uiCode)); //log2_max_frame_num_minus4
   WELS_CHECK_SE_UPPER_ERROR (uiCode, SPS_LOG2_MAX_FRAME_NUM_MINUS4_MAX, "log2_max_frame_num_minus4",
@@ -1379,28 +1374,27 @@
   pPps->bConstainedIntraPredFlag              = !!uiCode;
   WELS_READ_VERIFY (BsGetOneBit (pBsAux, &uiCode)); //redundant_pic_cnt_present_flag
   pPps->bRedundantPicCntPresentFlag           = !!uiCode;
-  /*TODO: to judge whether going on to parse*/
-//going on to parse high profile syntax, need fix me
-  if (0) {
-    WELS_READ_VERIFY (BsGetOneBit (pBsAux, &uiCode));
-    pPps->bTransform_8x8_mode_flag = !!uiCode;
-    WELS_READ_VERIFY (BsGetOneBit (pBsAux, &uiCode));
+
+  if (CheckMoreRBSPData (pBsAux)) {
+    WELS_READ_VERIFY (BsGetOneBit (pBsAux, &uiCode)); //transform_8x8_mode_flag
+    pPps->bTransform8x8ModeFlag = !!uiCode;
+    WELS_READ_VERIFY (BsGetOneBit (pBsAux, &uiCode)); //pic_scaling_matrix_present_flag
     pPps->bPicScalingMatrixPresentFlag = !!uiCode;
     if (pPps->bPicScalingMatrixPresentFlag) {
-      if (pCtx->bSpsAvailFlags[pPps->iSpsId])
+      if (pCtx->bSpsAvailFlags[pPps->iSpsId]) {
         WELS_READ_VERIFY (ParseScalingList (&pCtx->sSpsBuffer[pPps->iSpsId], pBsAux, 1, pPps->bPicScalingListPresentFlag,
                                             pPps->iScalingList4x4, pPps->iScalingList8x8));
-      else {
+      } else {
         pCtx->bSpsLatePps = true;
         WELS_READ_VERIFY (ParseScalingList (NULL, pBsAux, 1, pPps->bPicScalingListPresentFlag, pPps->iScalingList4x4,
                                             pPps->iScalingList8x8));
       }
     }
-    //add second chroma qp parsing process
-    WELS_READ_VERIFY (BsGetSe (pBsAux, &iCode)); //chroma_qp_index_offset,cr
-    pPps->iChromaQpIndexOffset[1]               = iCode;
-    WELS_CHECK_SE_BOTH_ERROR (pPps->iChromaQpIndexOffset[1], PPS_CHROMA_QP_INDEX_OFFSET_MIN, PPS_CHROMA_QP_INDEX_OFFSET_MAX,
-                              "second_chroma_qp_index_offset", GENERATE_ERROR_NO (ERR_LEVEL_PARAM_SETS, ERR_INFO_INVALID_CHROMA_QP_INDEX_OFFSET));
+    WELS_READ_VERIFY (BsGetSe (pBsAux, &iCode)); //second_chroma_qp_index_offset
+    pPps->iChromaQpIndexOffset[1] = iCode;
+    WELS_CHECK_SE_BOTH_ERROR (pPps->iChromaQpIndexOffset[1], PPS_CHROMA_QP_INDEX_OFFSET_MIN,
+                              PPS_CHROMA_QP_INDEX_OFFSET_MAX, "chroma_qp_index_offset", GENERATE_ERROR_NO (ERR_LEVEL_PARAM_SETS,
+                                  ERR_INFO_INVALID_CHROMA_QP_INDEX_OFFSET));
   }
 
   if (pCtx->pAccessUnitList->uiAvailUnitsNum > 0) {
@@ -1481,6 +1475,7 @@
   int iNextScale = 8;
   int iDeltaScale;
   int32_t iCode;
+  int32_t iIdx;
   for (int j = 0; j < iScalingListNum; j++) {
     if (iNextScale != 0) {
       WELS_READ_VERIFY (BsGetSe (pBsAux, &iCode));
@@ -1492,8 +1487,9 @@
       if (*bUseDefaultScalingMatrixFlag)
         break;
     }
-    pScalingList[g_kuiZigzagScan[j]] = (iNextScale == 0) ? iLastScale : iNextScale;
-    iLastScale = pScalingList[g_kuiZigzagScan[j]];
+    iIdx = iScalingListNum == 16 ? g_kuiZigzagScan[j] : g_kuiZigzagScan8x8[j];
+    pScalingList[iIdx] = (iNextScale == 0) ? iLastScale : iNextScale;
+    iLastScale = pScalingList[iIdx];
   }
 
 
--- a/codec/decoder/core/src/cabac_decoder.cpp
+++ b/codec/decoder/core/src/cabac_decoder.cpp
@@ -74,7 +74,7 @@
   uint8_t* pCurr;
 
   pCurr = pBsAux->pCurBuf - iRemainingBytes;
-  if(pCurr >= (pBsAux->pEndBuf - 1)) {
+  if (pCurr >= (pBsAux->pEndBuf - 1)) {
     return ERR_INFO_INVALID_ACCESS;
   }
   pDecEngine->uiOffset = ((pCurr[0] << 16) | (pCurr[1] << 8) | pCurr[2]);
--- a/codec/decoder/core/src/deblocking.cpp
+++ b/codec/decoder/core/src/deblocking.cpp
@@ -136,6 +136,19 @@
   },
 };
 
+static const uint8_t g_kuiTableB8x8Idx[2][16] = {
+  {
+    0,  1,  4,  5,  8,  9,  12, 13,   // 0   1 |  2  3
+    2,  3,  6,  7, 10, 11,  14, 15    // 4   5 |  6  7
+  },                                  // ------------
+  // 8   9 | 10 11
+  {
+    // 12 13 | 14 15
+    0,  1,  4,  5,  2,  3,  6,  7,
+    8,  9,  12, 13, 10, 11, 14, 15
+  },
+};
+
 #define TC0_TBL_LOOKUP(tc, iIndexA, pBS, bChroma) \
 {\
 	tc[0] = g_kiTc0Table(iIndexA)[pBS[0]] + bChroma;\
@@ -170,7 +183,22 @@
   nBS[0][2][3] = (pNnzTab[13] | pNnzTab[14]) << iLShiftFactor;
   nBS[0][3][3] = (pNnzTab[14] | pNnzTab[15]) << iLShiftFactor;
   * (uint32_t*)nBS[1][3] = (uiNnz32b2 | uiNnz32b3) << iLShiftFactor;
+}
 
+void inline DeblockingBSInsideMBAvsbase8x8 (int8_t* pNnzTab, uint8_t nBS[2][4][4], int32_t iLShiftFactor) {
+  int8_t i8x8NnzTab[4];
+  for (int32_t i = 0; i < 4; i++) {
+    int32_t iBlkIdx = i << 2;
+    i8x8NnzTab[i] = (pNnzTab[g_kuiMbCountScan4Idx[iBlkIdx]] | pNnzTab[g_kuiMbCountScan4Idx[iBlkIdx + 1]] |
+                     pNnzTab[g_kuiMbCountScan4Idx[iBlkIdx + 2]] | pNnzTab[g_kuiMbCountScan4Idx[iBlkIdx + 3]]);
+  }
+
+  //vertical
+  nBS[0][2][0] = nBS[0][2][1] = (i8x8NnzTab[0] | i8x8NnzTab[1]) << iLShiftFactor;
+  nBS[0][2][2] = nBS[0][2][3] = (i8x8NnzTab[2] | i8x8NnzTab[3]) << iLShiftFactor;
+  //horizontal
+  nBS[1][2][0] = nBS[1][2][1] = (i8x8NnzTab[0] | i8x8NnzTab[2]) << iLShiftFactor;
+  nBS[1][2][2] = nBS[1][2][3] = (i8x8NnzTab[1] | i8x8NnzTab[3]) << iLShiftFactor;
 }
 
 void static inline DeblockingBSInsideMBNormal (PDqLayer pCurDqLayer, uint8_t nBS[2][4][4], int8_t* pNnzTab,
@@ -179,73 +207,148 @@
   int8_t* iRefIndex = pCurDqLayer->pRefIndex[LIST_0][iMbXy];
   ENFORCE_STACK_ALIGN_1D (uint8_t, uiBsx4, 4, 4);
 
-  uiNnz32b0 = * (uint32_t*) (pNnzTab + 0);
-  uiNnz32b1 = * (uint32_t*) (pNnzTab + 4);
-  uiNnz32b2 = * (uint32_t*) (pNnzTab + 8);
-  uiNnz32b3 = * (uint32_t*) (pNnzTab + 12);
+  int8_t i8x8NnzTab[4];
 
-  for (int i = 0; i < 3; i++)
-    uiBsx4[i] = pNnzTab[i] | pNnzTab[i + 1];
-  nBS[0][1][0] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 1, 0);
-  nBS[0][2][0] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 2, 1);
-  nBS[0][3][0] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 3, 2);
+  if (pCurDqLayer->pTransformSize8x8Flag[iMbXy]) {
+    for (int32_t i = 0; i < 4; i++) {
+      int32_t iBlkIdx = i << 2;
+      i8x8NnzTab[i] = (pNnzTab[g_kuiMbCountScan4Idx[iBlkIdx]] | pNnzTab[g_kuiMbCountScan4Idx[iBlkIdx + 1]] |
+                       pNnzTab[g_kuiMbCountScan4Idx[iBlkIdx + 2]] | pNnzTab[g_kuiMbCountScan4Idx[iBlkIdx + 3]]);
+    }
+    //vertical
+    nBS[0][2][0] = nBS[0][2][1] = BS_EDGE ((i8x8NnzTab[0] | i8x8NnzTab[1]), iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy],
+                                           g_kuiMbCountScan4Idx[1 << 2], g_kuiMbCountScan4Idx[0]);
+    nBS[0][2][2] = nBS[0][2][3] = BS_EDGE ((i8x8NnzTab[2] | i8x8NnzTab[3]), iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy],
+                                           g_kuiMbCountScan4Idx[3 << 2], g_kuiMbCountScan4Idx[2 << 2]);
 
-  for (int i = 0; i < 3; i++)
-    uiBsx4[i] = pNnzTab[4 + i] | pNnzTab[4 + i + 1];
-  nBS[0][1][1] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 5, 4);
-  nBS[0][2][1] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 6, 5);
-  nBS[0][3][1] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 7, 6);
+    //horizontal
+    nBS[1][2][0] = nBS[1][2][1] = BS_EDGE ((i8x8NnzTab[0] | i8x8NnzTab[2]), iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy],
+                                           g_kuiMbCountScan4Idx[2 << 2], g_kuiMbCountScan4Idx[0]);
+    nBS[1][2][2] = nBS[1][2][3] = BS_EDGE ((i8x8NnzTab[1] | i8x8NnzTab[3]), iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy],
+                                           g_kuiMbCountScan4Idx[3 << 2], g_kuiMbCountScan4Idx[1 << 2]);
+  } else {
+    uiNnz32b0 = * (uint32_t*) (pNnzTab + 0);
+    uiNnz32b1 = * (uint32_t*) (pNnzTab + 4);
+    uiNnz32b2 = * (uint32_t*) (pNnzTab + 8);
+    uiNnz32b3 = * (uint32_t*) (pNnzTab + 12);
 
-  for (int i = 0; i < 3; i++)
-    uiBsx4[i] = pNnzTab[8 + i] | pNnzTab[8 + i + 1];
-  nBS[0][1][2] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 9, 8);
-  nBS[0][2][2] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 10, 9);
-  nBS[0][3][2] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 11, 10);
+    for (int i = 0; i < 3; i++)
+      uiBsx4[i] = pNnzTab[i] | pNnzTab[i + 1];
+    nBS[0][1][0] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 1, 0);
+    nBS[0][2][0] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 2, 1);
+    nBS[0][3][0] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 3, 2);
 
-  for (int i = 0; i < 3; i++)
-    uiBsx4[i] = pNnzTab[12 + i] | pNnzTab[12 + i + 1];
-  nBS[0][1][3] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 13, 12);
-  nBS[0][2][3] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 14, 13);
-  nBS[0][3][3] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 15, 14);
+    for (int i = 0; i < 3; i++)
+      uiBsx4[i] = pNnzTab[4 + i] | pNnzTab[4 + i + 1];
+    nBS[0][1][1] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 5, 4);
+    nBS[0][2][1] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 6, 5);
+    nBS[0][3][1] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 7, 6);
 
-  // horizontal
-  * (uint32_t*)uiBsx4 = (uiNnz32b0 | uiNnz32b1);
-  nBS[1][1][0] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 4, 0);
-  nBS[1][1][1] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 5, 1);
-  nBS[1][1][2] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 6, 2);
-  nBS[1][1][3] = BS_EDGE (uiBsx4[3], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 7, 3);
+    for (int i = 0; i < 3; i++)
+      uiBsx4[i] = pNnzTab[8 + i] | pNnzTab[8 + i + 1];
+    nBS[0][1][2] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 9, 8);
+    nBS[0][2][2] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 10, 9);
+    nBS[0][3][2] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 11, 10);
 
-  * (uint32_t*)uiBsx4 = (uiNnz32b1 | uiNnz32b2);
-  nBS[1][2][0] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 8, 4);
-  nBS[1][2][1] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 9, 5);
-  nBS[1][2][2] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 10, 6);
-  nBS[1][2][3] = BS_EDGE (uiBsx4[3], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 11, 7);
+    for (int i = 0; i < 3; i++)
+      uiBsx4[i] = pNnzTab[12 + i] | pNnzTab[12 + i + 1];
+    nBS[0][1][3] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 13, 12);
+    nBS[0][2][3] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 14, 13);
+    nBS[0][3][3] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 15, 14);
 
-  * (uint32_t*)uiBsx4 = (uiNnz32b2 | uiNnz32b3);
-  nBS[1][3][0] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 12, 8);
-  nBS[1][3][1] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 13, 9);
-  nBS[1][3][2] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 14, 10);
-  nBS[1][3][3] = BS_EDGE (uiBsx4[3], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 15, 11);
+    // horizontal
+    * (uint32_t*)uiBsx4 = (uiNnz32b0 | uiNnz32b1);
+    nBS[1][1][0] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 4, 0);
+    nBS[1][1][1] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 5, 1);
+    nBS[1][1][2] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 6, 2);
+    nBS[1][1][3] = BS_EDGE (uiBsx4[3], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 7, 3);
+
+    * (uint32_t*)uiBsx4 = (uiNnz32b1 | uiNnz32b2);
+    nBS[1][2][0] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 8, 4);
+    nBS[1][2][1] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 9, 5);
+    nBS[1][2][2] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 10, 6);
+    nBS[1][2][3] = BS_EDGE (uiBsx4[3], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 11, 7);
+
+    * (uint32_t*)uiBsx4 = (uiNnz32b2 | uiNnz32b3);
+    nBS[1][3][0] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 12, 8);
+    nBS[1][3][1] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 13, 9);
+    nBS[1][3][2] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 14, 10);
+    nBS[1][3][3] = BS_EDGE (uiBsx4[3], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 15, 11);
+  }
 }
 
 uint32_t DeblockingBsMarginalMBAvcbase (PDqLayer pCurDqLayer, int32_t iEdge, int32_t iNeighMb, int32_t iMbXy) {
-  int32_t i;
+  int32_t i, j;
   uint32_t uiBSx4;
-  //uint8_t* bS = static_cast<uint8_t*>(&uiBSx4);
   uint8_t* pBS = (uint8_t*) (&uiBSx4);
-  const uint8_t* pBIdx  = &g_kuiTableBIdx[iEdge][0];
-  const uint8_t* pBnIdx = &g_kuiTableBIdx[iEdge][4];
+  const uint8_t* pBIdx      = &g_kuiTableBIdx[iEdge][0];
+  const uint8_t* pBnIdx     = &g_kuiTableBIdx[iEdge][4];
+  const uint8_t* pB8x8Idx   = &g_kuiTableB8x8Idx[iEdge][0];
+  const uint8_t* pBn8x8Idx  = &g_kuiTableB8x8Idx[iEdge][8];
 
-  for (i = 0; i < 4; i++) {
-    if (pCurDqLayer->pNzc[iMbXy][*pBIdx] | pCurDqLayer->pNzc[iNeighMb][*pBnIdx]) {
-      pBS[i] = 2;
-    } else {
-      pBS[i] = MB_BS_MV (pCurDqLayer->pRefIndex[LIST_0], pCurDqLayer->pMv[LIST_0], iMbXy, iNeighMb, *pBIdx,
-                         *pBnIdx);
+  if (pCurDqLayer->pTransformSize8x8Flag[iMbXy] && pCurDqLayer->pTransformSize8x8Flag[iNeighMb]) {
+    for (i = 0; i < 2; i++) {
+      uint8_t uiNzc = 0;
+      for (j = 0; uiNzc == 0 && j < 4; j++) {
+        uiNzc |= (pCurDqLayer->pNzc[iMbXy][* (pB8x8Idx + j)] | pCurDqLayer->pNzc[iNeighMb][* (pBn8x8Idx + j)]);
+      }
+      if (uiNzc) {
+        pBS[i << 1] = pBS[1 + (i << 1)] = 2;
+      } else {
+        pBS[i << 1] = pBS[1 + (i << 1)] = MB_BS_MV (pCurDqLayer->pRefIndex[LIST_0], pCurDqLayer->pMv[LIST_0], iMbXy, iNeighMb,
+                                          *pB8x8Idx, *pBn8x8Idx);
+      }
+      pB8x8Idx += 4;
+      pBn8x8Idx += 4;
     }
-    pBIdx++;
-    pBnIdx++;
+  } else if (pCurDqLayer->pTransformSize8x8Flag[iMbXy]) {
+    for (i = 0; i < 2; i++) {
+      uint8_t uiNzc = 0;
+      for (j = 0; uiNzc == 0 && j < 4; j++) {
+        uiNzc |= pCurDqLayer->pNzc[iMbXy][* (pB8x8Idx + j)];
+      }
+      for (j = 0; j < 2; j++) {
+        if (uiNzc | pCurDqLayer->pNzc[iNeighMb][*pBnIdx]) {
+          pBS[j + (i << 1)] = 2;
+        } else {
+          pBS[j + (i << 1)] = MB_BS_MV (pCurDqLayer->pRefIndex[LIST_0], pCurDqLayer->pMv[LIST_0], iMbXy, iNeighMb, *pB8x8Idx,
+                                        *pBnIdx);
+        }
+        pBnIdx++;
+      }
+      pB8x8Idx += 4;
+    }
+  } else if (pCurDqLayer->pTransformSize8x8Flag[iNeighMb]) {
+    for (i = 0; i < 2; i++) {
+      uint8_t uiNzc = 0;
+      for (j = 0; uiNzc == 0 && j < 4; j++) {
+        uiNzc |= pCurDqLayer->pNzc[iNeighMb][* (pBn8x8Idx + j)];
+      }
+      for (j = 0; j < 2; j++) {
+        if (uiNzc | pCurDqLayer->pNzc[iMbXy][*pBIdx]) {
+          pBS[j + (i << 1)] = 2;
+        } else {
+          pBS[j + (i << 1)] = MB_BS_MV (pCurDqLayer->pRefIndex[LIST_0], pCurDqLayer->pMv[LIST_0], iMbXy, iNeighMb, *pBIdx,
+                                        *pBn8x8Idx);
+        }
+        pBIdx++;
+      }
+      pBn8x8Idx += 4;
+    }
+  } else {
+    // only 4x4 transform
+    for (i = 0; i < 4; i++) {
+      if (pCurDqLayer->pNzc[iMbXy][*pBIdx] | pCurDqLayer->pNzc[iNeighMb][*pBnIdx]) {
+        pBS[i] = 2;
+      } else {
+        pBS[i] = MB_BS_MV (pCurDqLayer->pRefIndex[LIST_0], pCurDqLayer->pMv[LIST_0], iMbXy, iNeighMb, *pBIdx,
+                           *pBnIdx);
+      }
+      pBIdx++;
+      pBnIdx++;
+    }
   }
+
   return uiBSx4;
 }
 int32_t DeblockingAvailableNoInterlayer (PDqLayer pCurDqLayer, int32_t iFilterIdc) {
@@ -501,7 +604,7 @@
   pFilter->iChromaQP[0] = pCurChromaQp[0];
   pFilter->iChromaQP[1] = pCurChromaQp[1];
 
-  if (* (uint32_t*)nBS[0][1] != 0) {
+  if (* (uint32_t*)nBS[0][1] != 0 && !pCurDqLayer->pTransformSize8x8Flag[iMbXyIndex]) {
     FilteringEdgeLumaV (pFilter, &pDestY[1 << 2], iLineSize, nBS[0][1]);
   }
 
@@ -510,7 +613,7 @@
     FilteringEdgeChromaV (pFilter, &pDestCb[2 << 1], &pDestCr[2 << 1], iLineSizeUV, nBS[0][2]);
   }
 
-  if (* (uint32_t*)nBS[0][3] != 0) {
+  if (* (uint32_t*)nBS[0][3] != 0  && !pCurDqLayer->pTransformSize8x8Flag[iMbXyIndex]) {
     FilteringEdgeLumaV (pFilter, &pDestY[3 << 2], iLineSize, nBS[0][3]);
   }
 
@@ -536,7 +639,7 @@
   pFilter->iChromaQP[0] = pCurChromaQp[0];
   pFilter->iChromaQP[1] = pCurChromaQp[1];
 
-  if (* (uint32_t*)nBS[1][1] != 0) {
+  if (* (uint32_t*)nBS[1][1] != 0  && !pCurDqLayer->pTransformSize8x8Flag[iMbXyIndex]) {
     FilteringEdgeLumaH (pFilter, &pDestY[ (1 << 2)*iLineSize], iLineSize, nBS[1][1]);
   }
 
@@ -546,7 +649,7 @@
                           nBS[1][2]);
   }
 
-  if (* (uint32_t*)nBS[1][3] != 0) {
+  if (* (uint32_t*)nBS[1][3] != 0  && !pCurDqLayer->pTransformSize8x8Flag[iMbXyIndex]) {
     FilteringEdgeLumaH (pFilter, &pDestY[ (3 << 2)*iLineSize], iLineSize, nBS[1][3]);
   }
 }
@@ -581,9 +684,16 @@
                           iBeta);
   if (iAlpha | iBeta) {
     TC0_TBL_LOOKUP (iTc, iIndexA, uiBSx4, 0);
-    pFilter->pLoopf->pfLumaDeblockingLT4Hor (&pDestY[1 << 2], iLineSize, iAlpha, iBeta, iTc);
+
+    if (!pCurDqLayer->pTransformSize8x8Flag[iMbXyIndex]) {
+      pFilter->pLoopf->pfLumaDeblockingLT4Hor (&pDestY[1 << 2], iLineSize, iAlpha, iBeta, iTc);
+    }
+
     pFilter->pLoopf->pfLumaDeblockingLT4Hor (&pDestY[2 << 2], iLineSize, iAlpha, iBeta, iTc);
-    pFilter->pLoopf->pfLumaDeblockingLT4Hor (&pDestY[3 << 2], iLineSize, iAlpha, iBeta, iTc);
+
+    if (!pCurDqLayer->pTransformSize8x8Flag[iMbXyIndex]) {
+      pFilter->pLoopf->pfLumaDeblockingLT4Hor (&pDestY[3 << 2], iLineSize, iAlpha, iBeta, iTc);
+    }
   }
 
   // luma h
@@ -594,9 +704,15 @@
 
   pFilter->iLumaQP   = iCurQp;
   if (iAlpha | iBeta) {
-    pFilter->pLoopf->pfLumaDeblockingLT4Ver (&pDestY[ (1 << 2)*iLineSize], iLineSize, iAlpha, iBeta, iTc);
+    if (!pCurDqLayer->pTransformSize8x8Flag[iMbXyIndex]) {
+      pFilter->pLoopf->pfLumaDeblockingLT4Ver (&pDestY[ (1 << 2)*iLineSize], iLineSize, iAlpha, iBeta, iTc);
+    }
+
     pFilter->pLoopf->pfLumaDeblockingLT4Ver (&pDestY[ (2 << 2)*iLineSize], iLineSize, iAlpha, iBeta, iTc);
-    pFilter->pLoopf->pfLumaDeblockingLT4Ver (&pDestY[ (3 << 2)*iLineSize], iLineSize, iAlpha, iBeta, iTc);
+
+    if (!pCurDqLayer->pTransformSize8x8Flag[iMbXyIndex]) {
+      pFilter->pLoopf->pfLumaDeblockingLT4Ver (&pDestY[ (3 << 2)*iLineSize], iLineSize, iAlpha, iBeta, iTc);
+    }
   }
 }
 void FilteringEdgeChromaHV (PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, int32_t iBoundryFlag) {
@@ -705,6 +821,7 @@
 
   switch (iCurMbType) {
   case MB_TYPE_INTRA4x4:
+  case MB_TYPE_INTRA8x8:
   case MB_TYPE_INTRA16x16:
   case MB_TYPE_INTRA_PCM:
     DeblockingIntraMb (pCurDqLayer, pFilter, iBoundryFlag);
@@ -728,7 +845,11 @@
     //SKIP MB_16x16 or others
     if (iCurMbType != MB_TYPE_SKIP) {
       if (iCurMbType == MB_TYPE_16x16) {
-        DeblockingBSInsideMBAvsbase (pCurDqLayer->pNzc[iMbXyIndex], nBS, 1);
+        if (!pCurDqLayer->pTransformSize8x8Flag[pCurDqLayer->iMbXyIndex]) {
+          DeblockingBSInsideMBAvsbase (pCurDqLayer->pNzc[iMbXyIndex], nBS, 1);
+        } else {
+          DeblockingBSInsideMBAvsbase8x8 (pCurDqLayer->pNzc[iMbXyIndex], nBS, 1);
+        }
       } else {
         DeblockingBSInsideMBNormal (pCurDqLayer, nBS, pCurDqLayer->pNzc[iMbXyIndex], iMbXyIndex);
       }
@@ -839,8 +960,8 @@
   if (iCpu & WELS_CPU_SSSE3) {
     pFunc->pfLumaDeblockingLT4Ver	= DeblockLumaLt4V_ssse3;
     pFunc->pfLumaDeblockingEQ4Ver	= DeblockLumaEq4V_ssse3;
-    pFunc->pfLumaDeblockingLT4Hor   = DeblockLumaLt4H_ssse3;
-    pFunc->pfLumaDeblockingEQ4Hor   = DeblockLumaEq4H_ssse3;
+    pFunc->pfLumaDeblockingLT4Hor       = DeblockLumaLt4H_ssse3;
+    pFunc->pfLumaDeblockingEQ4Hor       = DeblockLumaEq4H_ssse3;
     pFunc->pfChromaDeblockingLT4Ver	= DeblockChromaLt4V_ssse3;
     pFunc->pfChromaDeblockingEQ4Ver	= DeblockChromaEq4V_ssse3;
     pFunc->pfChromaDeblockingLT4Hor	= DeblockChromaLt4H_ssse3;
--- a/codec/decoder/core/src/decode_mb_aux.cpp
+++ b/codec/decoder/core/src/decode_mb_aux.cpp
@@ -76,6 +76,96 @@
   }
 }
 
+void IdctResAddPred8x8_c (uint8_t* pPred, const int32_t kiStride, int16_t* pRs) {
+  // Runs two 1-D butterfly passes (rows, then columns) over the 64 coefficients in pRs and adds the rounded result to the 8x8 samples at pPred (row stride kiStride), in place.
+  // TODO: debug-friendly C only; fuse the horizontal and vertical passes into one function (as was done for HEVC) so the ASM version is easy to write, then optimize.
+  int16_t p[8], b[8]; // p: the 8 inputs of the current 1-D pass, b: butterfly outputs
+  int16_t a[4]; // scratch terms, reused for the even-indexed and then the odd-indexed inputs
+
+  int16_t iTmp[64]; // output of the horizontal pass, stored row-major (8 values per row)
+  int16_t iRes[64]; // output of the vertical pass, before rounding
+
+  // Horizontal pass: row i of pRs is transformed into row i of iTmp
+  for (int i = 0; i < 8; i++) {
+    for (int j = 0; j < 8; j++) {
+      p[j] = pRs[j + (i << 3)];
+    }
+    a[0] = p[0] + p[4]; // terms built from the even-indexed inputs p[0], p[2], p[4], p[6]
+    a[1] = p[0] - p[4];
+    a[2] = p[6] - (p[2] >> 1);
+    a[3] = p[2] + (p[6] >> 1);
+
+    b[0] =  a[0] + a[3];
+    b[2] =  a[1] - a[2];
+    b[4] =  a[1] + a[2];
+    b[6] =  a[0] - a[3];
+
+    a[0] = -p[3] + p[5] - p[7] - (p[7] >> 1); // terms built from the odd-indexed inputs p[1], p[3], p[5], p[7]
+    a[1] =  p[1] + p[7] - p[3] - (p[3] >> 1);
+    a[2] = -p[1] + p[7] + p[5] + (p[5] >> 1);
+    a[3] =  p[3] + p[5] + p[1] + (p[1] >> 1);
+
+    b[1] =  a[0] + (a[3] >> 2);
+    b[3] =  a[1] + (a[2] >> 2);
+    b[5] =  a[2] - (a[1] >> 2);
+    b[7] =  a[3] - (a[0] >> 2);
+
+    iTmp[0 + (i << 3)] = b[0] + b[7]; // outputs k and 7 - k are the sum and difference of the same pair of b terms
+    iTmp[1 + (i << 3)] = b[2] - b[5];
+    iTmp[2 + (i << 3)] = b[4] + b[3];
+    iTmp[3 + (i << 3)] = b[6] + b[1];
+    iTmp[4 + (i << 3)] = b[6] - b[1];
+    iTmp[5 + (i << 3)] = b[4] - b[3];
+    iTmp[6 + (i << 3)] = b[2] + b[5];
+    iTmp[7 + (i << 3)] = b[0] - b[7];
+  }
+
+  // Vertical pass: the same butterfly applied to column i of iTmp, written to column i of iRes
+  for (int i = 0; i < 8; i++) {
+    for (int j = 0; j < 8; j++) {
+      p[j] = iTmp[i + (j << 3)];
+    }
+
+    a[0] =  p[0] + p[4];
+    a[1] =  p[0] - p[4];
+    a[2] =  p[6] - (p[2] >> 1);
+    a[3] =  p[2] + (p[6] >> 1);
+
+    b[0] = a[0] + a[3];
+    b[2] = a[1] - a[2];
+    b[4] = a[1] + a[2];
+    b[6] = a[0] - a[3];
+
+    a[0] = -p[3] + p[5] - p[7] - (p[7] >> 1);
+    a[1] =  p[1] + p[7] - p[3] - (p[3] >> 1);
+    a[2] = -p[1] + p[7] + p[5] + (p[5] >> 1);
+    a[3] =  p[3] + p[5] + p[1] + (p[1] >> 1);
+
+
+    b[1] =  a[0] + (a[3] >> 2);
+    b[7] =  a[3] - (a[0] >> 2);
+    b[3] =  a[1] + (a[2] >> 2);
+    b[5] =  a[2] - (a[1] >> 2);
+
+    iRes[ (0 << 3) + i] = b[0] + b[7];
+    iRes[ (1 << 3) + i] = b[2] - b[5];
+    iRes[ (2 << 3) + i] = b[4] + b[3];
+    iRes[ (3 << 3) + i] = b[6] + b[1];
+    iRes[ (4 << 3) + i] = b[6] - b[1];
+    iRes[ (5 << 3) + i] = b[4] - b[3];
+    iRes[ (6 << 3) + i] = b[2] + b[5];
+    iRes[ (7 << 3) + i] = b[0] - b[7];
+  }
+  // Round each value with (x + 32) >> 6, add it to the sample already at pPred, and store the WelsClip1 result back (presumably a clamp to the 8-bit range - confirm)
+  uint8_t* pDst			= pPred;
+  for (int i = 0; i < 8; i++) {
+    for (int j = 0; j < 8; j++) {
+      pDst[i * kiStride + j] = WelsClip1 (((32 + iRes[ (i << 3) + j]) >> 6) + pDst[i * kiStride + j]);
+    }
+  }
+
+}
+
 void GetI4LumaIChromaAddrTable (int32_t* pBlockOffset, const int32_t kiYStride, const int32_t kiUVStride) {
   int32_t* pOffset	   = pBlockOffset;
   int32_t i;
--- a/codec/decoder/core/src/decode_slice.cpp
+++ b/codec/decoder/core/src/decode_slice.cpp
@@ -144,7 +144,6 @@
     return 0;//NO_SUPPORTED_FILTER_IDX
   } else {
     WelsDeblockingFilterSlice (pCtx, pDeblockMb);
-
   }
   // any other filter_idc not supported here, 7/22/2010
 
@@ -159,12 +158,23 @@
   WelsChromaDcIdct (pCurLayer->pScaledTCoeff[iMbXy] + 256);	// 256 = 16*16
   WelsChromaDcIdct (pCurLayer->pScaledTCoeff[iMbXy] + 320);	// 320 = 16*16 + 16*4
 
-  for (i = 0; i < 16; i++) { //luma
-    iIndex = g_kuiMbCountScan4Idx[i];
-    if (pCurLayer->pNzc[iMbXy][iIndex]) {
-      iOffset = ((iIndex >> 2) << 2) * iStrideL + ((iIndex % 4) << 2);
-      pCtx->pIdctResAddPredFunc (pDstY + iOffset, iStrideL, pCurLayer->pScaledTCoeff[iMbXy] + (i << 4));
+  if (pCurLayer->pTransformSize8x8Flag[iMbXy]) {
+    for (i = 0; i < 4; i++) {
+      iIndex = g_kuiMbCountScan4Idx[i << 2];
+      if (pCurLayer->pNzc[iMbXy][iIndex] || pCurLayer->pNzc[iMbXy][iIndex + 1] || pCurLayer->pNzc[iMbXy][iIndex + 4]
+          || pCurLayer->pNzc[iMbXy][iIndex + 5]) {
+        iOffset = ((iIndex >> 2) << 2) * iStrideL + ((iIndex % 4) << 2);
+        pCtx->pIdctResAddPredFunc8x8 (pDstY + iOffset, iStrideL, pCurLayer->pScaledTCoeff[iMbXy] + (i << 6));
+      }
     }
+  } else {
+    for (i = 0; i < 16; i++) { //luma
+      iIndex = g_kuiMbCountScan4Idx[i];
+      if (pCurLayer->pNzc[iMbXy][iIndex]) {
+        iOffset = ((iIndex >> 2) << 2) * iStrideL + ((iIndex % 4) << 2);
+        pCtx->pIdctResAddPredFunc (pDstY + iOffset, iStrideL, pCurLayer->pScaledTCoeff[iMbXy] + (i << 4));
+      }
+    }
   }
 
   for (i = 0; i < 4; i++) { //chroma
@@ -258,6 +268,10 @@
     return 0;
   }
 
+  if (IS_INTRA8x8 (pCurLayer->pMbType[iMbXy])) {
+    RecI8x8Mb (iMbXy, pCtx, pCurLayer->pScaledTCoeff[iMbXy], pCurLayer);
+  }
+
   if (IS_INTRA4x4 (pCurLayer->pMbType[iMbXy]))
     RecI4x4Mb (iMbXy, pCtx, pCurLayer->pScaledTCoeff[iMbXy], pCurLayer);
 
@@ -326,7 +340,7 @@
   pBlk[iStride1] = (iE - iB) >> 1;
 }
 
-void WelsMap4x4NeighToSampleNormal (PWelsNeighAvail pNeighAvail, int32_t* pSampleAvail) {
+void WelsMapNxNNeighToSampleNormal (PWelsNeighAvail pNeighAvail, int32_t* pSampleAvail) {
   if (pNeighAvail->iLeftAvail) {  //left
     pSampleAvail[ 6] =
       pSampleAvail[12] =
@@ -347,7 +361,7 @@
   }
 }
 
-void WelsMap4x4NeighToSampleConstrain1 (PWelsNeighAvail pNeighAvail, int32_t* pSampleAvail) {
+void WelsMapNxNNeighToSampleConstrain1 (PWelsNeighAvail pNeighAvail, int32_t* pSampleAvail) {
   if (pNeighAvail->iLeftAvail && IS_INTRA (pNeighAvail->iLeftType)) {   //left
     pSampleAvail[ 6] =
       pSampleAvail[12] =
@@ -401,7 +415,7 @@
   uint8_t uiNeighAvail = 0;
   uint32_t uiCode;
   int32_t iCode;
-  pCtx->pMap4x4NeighToSampleFunc (pNeighAvail, iSampleAvail);
+  pCtx->pMapNxNNeighToSampleFunc (pNeighAvail, iSampleAvail);
   uiNeighAvail = (iSampleAvail[6] << 2) | (iSampleAvail[0] << 1) | (iSampleAvail[1]);
   for (i = 0; i < 16; i++) {
     int32_t iPrevIntra4x4PredMode = 0;
@@ -429,7 +443,7 @@
       }
     }
 
-    iFinalMode = CheckIntra4x4PredMode (&iSampleAvail[0], &iBestMode, i);
+    iFinalMode = CheckIntraNxNPredMode (&iSampleAvail[0], &iBestMode, i, false);
     if (iFinalMode  == ERR_INVALID_INTRA4X4_MODE) {
       return ERR_INFO_INVALID_I4x4_PRED_MODE;
     }
@@ -469,6 +483,87 @@
   return ERR_NONE;
 }
 
+int32_t ParseIntra8x8Mode (PWelsDecoderContext pCtx, PWelsNeighAvail pNeighAvail, int8_t* pIntraPredMode,
+                           PBitStringAux pBs,
+                           PDqLayer pCurDqLayer) {
+  // Parses the four 8x8 luma intra prediction modes and the chroma prediction mode of the current MB; returns ERR_NONE or an ERR_INFO_* code. Similar to the Intra_4x4 parser, so the two can be merged when needed.
+  int32_t iSampleAvail[5 * 6] = { 0 }; //initialize as 0
+  int32_t iMbXy = pCurDqLayer->iMbXyIndex;
+  int32_t iFinalMode, i;
+
+  uint8_t uiNeighAvail = 0;
+  uint32_t uiCode; // receives values read through the BsGet* bitstream calls
+  int32_t iCode; // receives values read through the *Cabac parsing calls
+  pCtx->pMapNxNNeighToSampleFunc (pNeighAvail, iSampleAvail);
+  // Bit layout, MSB to LSB: Top-Right : Left : Top-Left : Top
+  uiNeighAvail = (iSampleAvail[5] << 3) | (iSampleAvail[6] << 2) | (iSampleAvail[0] << 1) | (iSampleAvail[1]);
+
+  pCurDqLayer->pIntraNxNAvailFlag[iMbXy] = uiNeighAvail;
+
+  for (i = 0; i < 4; i++) { // one iteration per 8x8 luma block
+    int32_t iPrevIntra4x4PredMode = 0;
+    if (pCurDqLayer->sLayerInfo.pPps->bEntropyCodingModeFlag) {
+      WELS_READ_VERIFY (ParseIntraPredModeLumaCabac (pCtx, iCode));
+      iPrevIntra4x4PredMode = iCode;
+    } else {
+      WELS_READ_VERIFY (BsGetOneBit (pBs, &uiCode));
+      iPrevIntra4x4PredMode = uiCode;
+    }
+    const int32_t kiPredMode = PredIntra4x4Mode (pIntraPredMode, i << 2); // 8x8 block i is addressed by its first 4x4 index
+
+    int8_t iBestMode;
+    if (pCurDqLayer->sLayerInfo.pPps->bEntropyCodingModeFlag) { // entropy-coded path: -1 selects the predicted mode
+      if (iPrevIntra4x4PredMode == -1)
+        iBestMode = kiPredMode;
+      else
+        iBestMode = iPrevIntra4x4PredMode + (iPrevIntra4x4PredMode >= kiPredMode);
+    } else { // bit-read path: a set flag selects the predicted mode, otherwise 3 explicit bits follow
+      if (iPrevIntra4x4PredMode) {
+        iBestMode = kiPredMode;
+      } else {
+        WELS_READ_VERIFY (BsGetBits (pBs, 3, &uiCode));
+        iBestMode = uiCode + ((int32_t) uiCode >= kiPredMode); // codes at or above the predicted mode are shifted up by one
+      }
+    }
+
+    iFinalMode = CheckIntraNxNPredMode (&iSampleAvail[0], &iBestMode, i << 2, true); // NOTE(review): the last argument presumably selects the 8x8 variant - confirm
+
+    if (iFinalMode  == ERR_INVALID_INTRA4X4_MODE) {
+      return ERR_INFO_INVALID_I4x4_PRED_MODE;
+    }
+    // Replicate the result over the four 4x4 indices (i << 2) + j and mark those cache positions as available
+    for (int j = 0; j < 4; j++) {
+      pCurDqLayer->pIntra4x4FinalMode[iMbXy][g_kuiScan4[ (i << 2) + j]] = iFinalMode;
+      pIntraPredMode[g_kuiScan8[ (i << 2) + j]] = iBestMode;
+      iSampleAvail[g_kuiCache30ScanIdx[ (i << 2) + j]] = 1;
+    }
+  }
+  ST32 (&pCurDqLayer->pIntraPredMode[iMbXy][0], LD32 (&pIntraPredMode[1 + 8 * 4])); // per-MB slots 0..3 take the cache entries starting at 1 + 8 * 4
+  pCurDqLayer->pIntraPredMode[iMbXy][4] = pIntraPredMode[4 + 8 * 1]; // slots 4..6 take cache entries 4 + 8 * {1, 2, 3}; NOTE(review): presumably kept for neighbouring MBs - confirm
+  pCurDqLayer->pIntraPredMode[iMbXy][5] = pIntraPredMode[4 + 8 * 2];
+  pCurDqLayer->pIntraPredMode[iMbXy][6] = pIntraPredMode[4 + 8 * 3];
+  if (pCurDqLayer->sLayerInfo.pPps->bEntropyCodingModeFlag) { // chroma prediction mode, read according to the same flag
+    WELS_READ_VERIFY (ParseIntraPredModeChromaCabac (pCtx, uiNeighAvail, iCode));
+    if (iCode > MAX_PRED_MODE_ID_CHROMA) {
+      return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
+    }
+    pCurDqLayer->pChromaPredMode[iMbXy] = iCode;
+  } else {
+    WELS_READ_VERIFY (BsGetUe (pBs, &uiCode)); //intra_chroma_pred_mode
+    if (uiCode > MAX_PRED_MODE_ID_CHROMA) {
+      return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
+    }
+    pCurDqLayer->pChromaPredMode[iMbXy] = uiCode;
+  }
+
+  if (-1 == pCurDqLayer->pChromaPredMode[iMbXy]
+      || CheckIntraChromaPredMode (uiNeighAvail, &pCurDqLayer->pChromaPredMode[iMbXy])) { // a non-zero result is treated as an invalid chroma mode
+    return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
+  }
+
+  return ERR_NONE;
+}
+
 int32_t ParseIntra16x16Mode (PWelsDecoderContext pCtx, PWelsNeighAvail pNeighAvail, PBitStringAux pBs,
                              PDqLayer pCurDqLayer) {
   int32_t iMbXy = pCurDqLayer->iMbXyIndex;
@@ -519,6 +614,9 @@
 
   ENFORCE_STACK_ALIGN_1D (uint8_t, pNonZeroCount, 48, 16);
 
+  pCurLayer->pNoSubMbPartSizeLessThan8x8Flag[iMbXy] = true;
+  pCurLayer->pTransformSize8x8Flag[iMbXy] = false;
+
   pCurLayer->pInterPredictionDoneFlag[iMbXy] = 0;
   pCurLayer->pResidualPredFlag[iMbXy] = pSlice->sSliceHeaderExt.bDefaultResidualPredFlag;
   GetNeighborAvailMbType (&sNeighAvail, pCurLayer);
@@ -539,8 +637,18 @@
   } else if (0 == uiMbType) { //I4x4
     ENFORCE_STACK_ALIGN_1D (int8_t, pIntraPredMode, 48, 16);
     pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA4x4;
-    pCtx->pFillInfoCacheIntra4x4Func (&sNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer);
-    WELS_READ_VERIFY (ParseIntra4x4Mode (pCtx, &sNeighAvail, pIntraPredMode, pBsAux, pCurLayer));
+    if (pCtx->pPps->bTransform8x8ModeFlag) {
+      // Transform 8x8 cabac will be added soon
+      WELS_READ_VERIFY (ParseTransformSize8x8FlagCabac (pCtx, &sNeighAvail, pCtx->pCurDqLayer->pTransformSize8x8Flag[iMbXy]));
+    }
+    if (pCtx->pCurDqLayer->pTransformSize8x8Flag[iMbXy]) {
+      uiMbType = pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA8x8;
+      pCtx->pFillInfoCacheIntraNxNFunc (&sNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer);
+      WELS_READ_VERIFY (ParseIntra8x8Mode (pCtx, &sNeighAvail, pIntraPredMode, pBsAux, pCurLayer));
+    } else {
+      pCtx->pFillInfoCacheIntraNxNFunc (&sNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer);
+      WELS_READ_VERIFY (ParseIntra4x4Mode (pCtx, &sNeighAvail, pIntraPredMode, pBsAux, pCurLayer));
+    }
     //get uiCbp for I4x4
     WELS_READ_VERIFY (ParseCbpInfoCabac (pCtx, &sNeighAvail, uiCbp));
     pCurLayer->pCbp[iMbXy] = uiCbp;
@@ -549,6 +657,8 @@
     uiCbpLuma = uiCbp & 15;
   } else { //I16x16;
     pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA16x16;
+    pCurLayer->pTransformSize8x8Flag[iMbXy] = false;
+    pCurLayer->pNoSubMbPartSizeLessThan8x8Flag[iMbXy] = true;
     pCurLayer->pIntraPredMode[iMbXy][7] = (uiMbType - 1) & 3;
     pCurLayer->pCbp[iMbXy] = g_kuiI16CbpTable[ (uiMbType - 1) >> 2];
     uiCbpChroma = pCtx->pSps->uiChromaFormatIdc ? pCurLayer->pCbp[iMbXy] >> 4 : 0 ;
@@ -565,7 +675,7 @@
   ST32 (&pCurLayer->pNzc[iMbXy][20], 0);
   pCurLayer->pCbfDc[iMbXy] = 0;
 
-  if (pCurLayer->pCbp[iMbXy] == 0 && IS_INTRA4x4 (pCurLayer->pMbType[iMbXy])) {
+  if (pCurLayer->pCbp[iMbXy] == 0 && IS_INTRANxN (pCurLayer->pMbType[iMbXy])) {
     pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp;
     for (i = 0; i < 2; i++) {
       pCurLayer->pChromaQp[iMbXy][i] = g_kuiChromaQpTable[WELS_CLIP3 ((pCurLayer->pLumaQp[iMbXy] +
@@ -608,26 +718,43 @@
         ST32 (&pCurLayer->pNzc[iMbXy][12], 0);
       }
     } else { //non-MB_TYPE_INTRA16x16
-      for (iId8x8 = 0; iId8x8 < 4; iId8x8++) {
-        if (uiCbpLuma & (1 << iId8x8)) {
-          int32_t iIdx = (iId8x8 << 2);
-          for (iId4x4 = 0; iId4x4 < 4; iId4x4++) {
-            //Luma (DC and AC decoding together)
-            WELS_READ_VERIFY (ParseResidualBlockCabac (&sNeighAvail, pNonZeroCount, pBsAux, iIdx, iScanIdxEnd - iScanIdxStart + 1,
-                              g_kuiZigzagScan + iScanIdxStart, LUMA_DC_AC_INTRA, pCurLayer->pScaledTCoeff[iMbXy] + (iIdx << 4),
-                              pCurLayer->pLumaQp[iMbXy],
-                              pCtx));
-            iIdx++;
+      if (pCurLayer->pTransformSize8x8Flag[iMbXy]) {
+        // Transform 8x8 support for CABAC
+        for (iId8x8 = 0; iId8x8 < 4; iId8x8++) {
+          if (uiCbpLuma & (1 << iId8x8)) {
+            WELS_READ_VERIFY (ParseResidualBlockCabac8x8 (&sNeighAvail, pNonZeroCount, pBsAux, (iId8x8 << 2),
+                              iScanIdxEnd - iScanIdxStart + 1, g_kuiZigzagScan8x8 + iScanIdxStart, LUMA_DC_AC_INTRA_8,
+                              pCurLayer->pScaledTCoeff[iMbXy] + (iId8x8 << 6), pCurLayer->pLumaQp[iMbXy], pCtx));
+          } else {
+            ST16 (&pNonZeroCount[g_kCacheNzcScanIdx[ (iId8x8 << 2)]], 0);
+            ST16 (&pNonZeroCount[g_kCacheNzcScanIdx[ (iId8x8 << 2) + 2]], 0);
           }
-        } else {
-          ST16 (&pNonZeroCount[g_kCacheNzcScanIdx[ (iId8x8 << 2)]], 0);
-          ST16 (&pNonZeroCount[g_kCacheNzcScanIdx[ (iId8x8 << 2) + 2]], 0);
         }
+        ST32 (&pCurLayer->pNzc[iMbXy][0], LD32 (&pNonZeroCount[1 + 8 * 1]));
+        ST32 (&pCurLayer->pNzc[iMbXy][4], LD32 (&pNonZeroCount[1 + 8 * 2]));
+        ST32 (&pCurLayer->pNzc[iMbXy][8], LD32 (&pNonZeroCount[1 + 8 * 3]));
+        ST32 (&pCurLayer->pNzc[iMbXy][12], LD32 (&pNonZeroCount[1 + 8 * 4]));
+      } else {
+        for (iId8x8 = 0; iId8x8 < 4; iId8x8++) {
+          if (uiCbpLuma & (1 << iId8x8)) {
+            int32_t iIdx = (iId8x8 << 2);
+            for (iId4x4 = 0; iId4x4 < 4; iId4x4++) {
+              //Luma (DC and AC decoding together)
+              WELS_READ_VERIFY (ParseResidualBlockCabac (&sNeighAvail, pNonZeroCount, pBsAux, iIdx, iScanIdxEnd - iScanIdxStart + 1,
+                                g_kuiZigzagScan + iScanIdxStart, LUMA_DC_AC_INTRA, pCurLayer->pScaledTCoeff[iMbXy] + (iIdx << 4),
+                                pCurLayer->pLumaQp[iMbXy], pCtx));
+              iIdx++;
+            }
+          } else {
+            ST16 (&pNonZeroCount[g_kCacheNzcScanIdx[ (iId8x8 << 2)]], 0);
+            ST16 (&pNonZeroCount[g_kCacheNzcScanIdx[ (iId8x8 << 2) + 2]], 0);
+          }
+        }
+        ST32 (&pCurLayer->pNzc[iMbXy][0], LD32 (&pNonZeroCount[1 + 8 * 1]));
+        ST32 (&pCurLayer->pNzc[iMbXy][4], LD32 (&pNonZeroCount[1 + 8 * 2]));
+        ST32 (&pCurLayer->pNzc[iMbXy][8], LD32 (&pNonZeroCount[1 + 8 * 3]));
+        ST32 (&pCurLayer->pNzc[iMbXy][12], LD32 (&pNonZeroCount[1 + 8 * 4]));
       }
-      ST32 (&pCurLayer->pNzc[iMbXy][0], LD32 (&pNonZeroCount[1 + 8 * 1]));
-      ST32 (&pCurLayer->pNzc[iMbXy][4], LD32 (&pNonZeroCount[1 + 8 * 2]));
-      ST32 (&pCurLayer->pNzc[iMbXy][8], LD32 (&pNonZeroCount[1 + 8 * 3]));
-      ST32 (&pCurLayer->pNzc[iMbXy][12], LD32 (&pNonZeroCount[1 + 8 * 4]));
     }
     int32_t iMbResProperty;
     //chroma
@@ -730,10 +857,21 @@
       if (0 == uiMbType) { //Intra4x4
         ENFORCE_STACK_ALIGN_1D (int8_t, pIntraPredMode, 48, 16);
         pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA4x4;
-        pCtx->pFillInfoCacheIntra4x4Func (pNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer);
-        WELS_READ_VERIFY (ParseIntra4x4Mode (pCtx, pNeighAvail, pIntraPredMode, pBsAux, pCurLayer));
+        if (pCtx->pPps->bTransform8x8ModeFlag) {
+          WELS_READ_VERIFY (ParseTransformSize8x8FlagCabac (pCtx, pNeighAvail, pCtx->pCurDqLayer->pTransformSize8x8Flag[iMbXy]));
+        }
+        if (pCtx->pCurDqLayer->pTransformSize8x8Flag[iMbXy]) {
+          uiMbType = pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA8x8;
+          pCtx->pFillInfoCacheIntraNxNFunc (pNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer);
+          WELS_READ_VERIFY (ParseIntra8x8Mode (pCtx, pNeighAvail, pIntraPredMode, pBsAux, pCurLayer));
+        } else {
+          pCtx->pFillInfoCacheIntraNxNFunc (pNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer);
+          WELS_READ_VERIFY (ParseIntra4x4Mode (pCtx, pNeighAvail, pIntraPredMode, pBsAux, pCurLayer));
+        }
       } else { //Intra16x16
         pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA16x16;
+        pCurLayer->pTransformSize8x8Flag[iMbXy] = false;
+        pCurLayer->pNoSubMbPartSizeLessThan8x8Flag[iMbXy] = true;
         pCurLayer->pIntraPredMode[iMbXy][7] = (uiMbType - 1) & 3;
         pCurLayer->pCbp[iMbXy] = g_kuiI16CbpTable[ (uiMbType - 1) >> 2];
         uiCbpChroma = pCtx->pSps->uiChromaFormatIdc ? pCurLayer->pCbp[iMbXy] >> 4 : 0;
@@ -761,6 +899,23 @@
   }
 
   if (pCurLayer->pCbp[iMbXy] || MB_TYPE_INTRA16x16 == pCurLayer->pMbType[iMbXy]) {
+
+    if (MB_TYPE_INTRA16x16 != pCurLayer->pMbType[iMbXy]) {
+      // Needs modification when B-picture support is added
+      bool bNeedParseTransformSize8x8Flag =
+        (((pCurLayer->pMbType[iMbXy] >= MB_TYPE_16x16 && pCurLayer->pMbType[iMbXy] <= MB_TYPE_8x16)
+          || pCurLayer->pNoSubMbPartSizeLessThan8x8Flag[iMbXy])
+         && (pCurLayer->pMbType[iMbXy] != MB_TYPE_INTRA8x8)
+         && (pCurLayer->pMbType[iMbXy] != MB_TYPE_INTRA4x4)
+         && ((pCurLayer->pCbp[iMbXy] & 0x0F) > 0)
+         && (pCtx->pPps->bTransform8x8ModeFlag));
+
+      if (bNeedParseTransformSize8x8Flag) {
+        WELS_READ_VERIFY (ParseTransformSize8x8FlagCabac (pCtx, pNeighAvail,
+                          pCtx->pCurDqLayer->pTransformSize8x8Flag[iMbXy])); //transform_size_8x8_flag
+      }
+    }
+
     memset (pCurLayer->pScaledTCoeff[iMbXy], 0, 384 * sizeof (pCurLayer->pScaledTCoeff[iMbXy][0]));
 
     int32_t iQpDelta, iId8x8, iId4x4;
@@ -798,27 +953,46 @@
         ST32 (&pCurLayer->pNzc[iMbXy][12], 0);
       }
     } else { //non-MB_TYPE_INTRA16x16
-      iMbResProperty = (IS_INTRA (pCurLayer->pMbType[iMbXy])) ? LUMA_DC_AC_INTRA : LUMA_DC_AC_INTER;
-      for (iId8x8 = 0; iId8x8 < 4; iId8x8++) {
-        if (uiCbpLuma & (1 << iId8x8)) {
-          int32_t iIdx = (iId8x8 << 2);
-          for (iId4x4 = 0; iId4x4 < 4; iId4x4++) {
-            //Luma (DC and AC decoding together)
-            WELS_READ_VERIFY (ParseResidualBlockCabac (pNeighAvail, pNonZeroCount, pBsAux, iIdx, iScanIdxEnd - iScanIdxStart + 1,
-                              g_kuiZigzagScan + iScanIdxStart, iMbResProperty, pCurLayer->pScaledTCoeff[iMbXy] + (iIdx << 4),
-                              pCurLayer->pLumaQp[iMbXy],
-                              pCtx));
-            iIdx++;
+      if (pCtx->pCurDqLayer->pTransformSize8x8Flag[iMbXy]) {
+        // Transform 8x8 support for CABAC
+        for (iId8x8 = 0; iId8x8 < 4; iId8x8++) {
+          if (uiCbpLuma & (1 << iId8x8)) {
+            WELS_READ_VERIFY (ParseResidualBlockCabac8x8 (pNeighAvail, pNonZeroCount, pBsAux, (iId8x8 << 2),
+                              iScanIdxEnd - iScanIdxStart + 1, g_kuiZigzagScan8x8 + iScanIdxStart,
+                              IS_INTRA (pCurLayer->pMbType[iMbXy]) ? LUMA_DC_AC_INTRA_8 : LUMA_DC_AC_INTER_8,
+                              pCurLayer->pScaledTCoeff[iMbXy] + (iId8x8 << 6), pCurLayer->pLumaQp[iMbXy], pCtx));
+          } else {
+            ST16 (&pNonZeroCount[g_kCacheNzcScanIdx[ (iId8x8 << 2)]], 0);
+            ST16 (&pNonZeroCount[g_kCacheNzcScanIdx[ (iId8x8 << 2) + 2]], 0);
           }
-        } else {
-          ST16 (&pNonZeroCount[g_kCacheNzcScanIdx[iId8x8 << 2]], 0);
-          ST16 (&pNonZeroCount[g_kCacheNzcScanIdx[ (iId8x8 << 2) + 2]], 0);
         }
+        ST32 (&pCurLayer->pNzc[iMbXy][0], LD32 (&pNonZeroCount[1 + 8 * 1]));
+        ST32 (&pCurLayer->pNzc[iMbXy][4], LD32 (&pNonZeroCount[1 + 8 * 2]));
+        ST32 (&pCurLayer->pNzc[iMbXy][8], LD32 (&pNonZeroCount[1 + 8 * 3]));
+        ST32 (&pCurLayer->pNzc[iMbXy][12], LD32 (&pNonZeroCount[1 + 8 * 4]));
+      } else {
+        iMbResProperty = (IS_INTRA (pCurLayer->pMbType[iMbXy])) ? LUMA_DC_AC_INTRA : LUMA_DC_AC_INTER;
+        for (iId8x8 = 0; iId8x8 < 4; iId8x8++) {
+          if (uiCbpLuma & (1 << iId8x8)) {
+            int32_t iIdx = (iId8x8 << 2);
+            for (iId4x4 = 0; iId4x4 < 4; iId4x4++) {
+              //Luma (DC and AC decoding together)
+              WELS_READ_VERIFY (ParseResidualBlockCabac (pNeighAvail, pNonZeroCount, pBsAux, iIdx, iScanIdxEnd - iScanIdxStart + 1,
+                                g_kuiZigzagScan + iScanIdxStart, iMbResProperty, pCurLayer->pScaledTCoeff[iMbXy] + (iIdx << 4),
+                                pCurLayer->pLumaQp[iMbXy],
+                                pCtx));
+              iIdx++;
+            }
+          } else {
+            ST16 (&pNonZeroCount[g_kCacheNzcScanIdx[iId8x8 << 2]], 0);
+            ST16 (&pNonZeroCount[g_kCacheNzcScanIdx[ (iId8x8 << 2) + 2]], 0);
+          }
+        }
+        ST32 (&pCurLayer->pNzc[iMbXy][0], LD32 (&pNonZeroCount[1 + 8 * 1]));
+        ST32 (&pCurLayer->pNzc[iMbXy][4], LD32 (&pNonZeroCount[1 + 8 * 2]));
+        ST32 (&pCurLayer->pNzc[iMbXy][8], LD32 (&pNonZeroCount[1 + 8 * 3]));
+        ST32 (&pCurLayer->pNzc[iMbXy][12], LD32 (&pNonZeroCount[1 + 8 * 4]));
       }
-      ST32 (&pCurLayer->pNzc[iMbXy][0], LD32 (&pNonZeroCount[1 + 8 * 1]));
-      ST32 (&pCurLayer->pNzc[iMbXy][4], LD32 (&pNonZeroCount[1 + 8 * 2]));
-      ST32 (&pCurLayer->pNzc[iMbXy][8], LD32 (&pNonZeroCount[1 + 8 * 3]));
-      ST32 (&pCurLayer->pNzc[iMbXy][12], LD32 (&pNonZeroCount[1 + 8 * 4]));
     }
 
     //chroma
@@ -886,8 +1060,12 @@
   pCurLayer->pCbfDc[iMbXy] = 0;
   pCurLayer->pChromaPredMode[iMbXy] = C_PRED_DC;
 
+  pCurLayer->pNoSubMbPartSizeLessThan8x8Flag[iMbXy] = true;
+  pCurLayer->pTransformSize8x8Flag[iMbXy] = false;
+
   GetNeighborAvailMbType (&uiNeighAvail, pCurLayer);
   WELS_READ_VERIFY (ParseSkipFlagCabac (pCtx, &uiNeighAvail, uiCode));
+
   if (uiCode) {
     int16_t pMv[2] = {0};
     pCurLayer->pMbType[iMbXy] = MB_TYPE_SKIP;
@@ -943,30 +1121,23 @@
           //if (!pCtx->pSps->bSeqScalingListPresentFlag[i]) {
           if (!pCtx->pPps->bPicScalingListPresentFlag[i]) {
             if (i < 6) {
-
-
               if (i == 0 || i == 3)
                 memcpy (pCtx->pPps->iScalingList4x4[i], pCtx->pSps->iScalingList4x4[i], 16 * sizeof (uint8_t));
               else
                 memcpy (pCtx->pPps->iScalingList4x4[i], pCtx->pPps->iScalingList4x4[i - 1], 16 * sizeof (uint8_t));
             } else {
-
               if (i == 6 || i == 7)
                 memcpy (pCtx->pPps->iScalingList8x8[ i - 6 ], pCtx->pSps->iScalingList8x8[ i - 6 ], 64 * sizeof (uint8_t));
               else
                 memcpy (pCtx->pPps->iScalingList8x8[ i - 6 ], pCtx->pPps->iScalingList8x8[i - 8], 64 * sizeof (uint8_t));
-
-
             }
-
           }
         }
-
-
       }
       //Init dequant coeff value for different QP
       for (i = 0; i < 6; i++) {
         pCtx->pDequant_coeff4x4[i] = pCtx->pDequant_coeff_buffer4x4[i];
+        pCtx->pDequant_coeff8x8[i] = pCtx->pDequant_coeff_buffer8x8[i];
         for (q = 0; q < 51; q++) {
           for (x = 0; x < 16; x++) {
             pCtx->pDequant_coeff4x4[i][q][x] = pCtx->pPps->bPicScalingMatrixPresentFlag ? pCtx->pPps->iScalingList4x4[i][x] *
@@ -974,13 +1145,10 @@
           }
           for (y = 0; y < 64; y++) {
             pCtx->pDequant_coeff8x8[i][q][y] = pCtx->pPps->bPicScalingMatrixPresentFlag ? pCtx->pPps->iScalingList8x8[i][y] *
-                                               g_kuiDequantCoeff[q][x & 0x07] : pCtx->pSps->iScalingList8x8[i][y] * g_kuiDequantCoeff[q][x &
-                                                   0x07];//pseudo-code ,holding for 8x8transform into
+                                               g_kuiMatrixV[q % 6][y / 8][y % 8] : pCtx->pSps->iScalingList8x8[i][y] * g_kuiMatrixV[q % 6][y / 8][y % 8];
           }
         }
       }
-
-
       pCtx->bDequantCoeff4x4Init = true;
       pCtx->iDequantCoeffPpsid = pCtx->pPps->iPpsId;
     }
@@ -1027,12 +1195,12 @@
   }
 
   if (pSliceHeader->pPps->bConstainedIntraPredFlag) {
-    pCtx->pFillInfoCacheIntra4x4Func = WelsFillCacheConstrain1Intra4x4;
-    pCtx->pMap4x4NeighToSampleFunc    = WelsMap4x4NeighToSampleConstrain1;
+    pCtx->pFillInfoCacheIntraNxNFunc = WelsFillCacheConstrain1IntraNxN;
+    pCtx->pMapNxNNeighToSampleFunc    = WelsMapNxNNeighToSampleConstrain1;
     pCtx->pMap16x16NeighToSampleFunc  = WelsMap16x16NeighToSampleConstrain1;
   } else {
-    pCtx->pFillInfoCacheIntra4x4Func = WelsFillCacheConstrain0Intra4x4;
-    pCtx->pMap4x4NeighToSampleFunc    = WelsMap4x4NeighToSampleNormal;
+    pCtx->pFillInfoCacheIntraNxNFunc = WelsFillCacheConstrain0IntraNxN;
+    pCtx->pMapNxNNeighToSampleFunc    = WelsMapNxNNeighToSampleNormal;
     pCtx->pMap16x16NeighToSampleFunc  = WelsMap16x16NeighToSampleNormal;
   }
 
@@ -1117,6 +1285,9 @@
   pCurLayer->pInterPredictionDoneFlag[iMbXy] = 0;
   pCurLayer->pResidualPredFlag[iMbXy] = pSlice->sSliceHeaderExt.bDefaultResidualPredFlag;
 
+  pCurLayer->pNoSubMbPartSizeLessThan8x8Flag[iMbXy] = true;
+  pCurLayer->pTransformSize8x8Flag[iMbXy] = false;
+
   WELS_READ_VERIFY (BsGetUe (pBs, &uiCode)); //uiMbType
   uiMbType = uiCode;
   if (uiMbType > 25)
@@ -1178,8 +1349,20 @@
   } else if (0 == uiMbType) { //reference to JM
     ENFORCE_STACK_ALIGN_1D (int8_t, pIntraPredMode, 48, 16);
     pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA4x4;
-    pCtx->pFillInfoCacheIntra4x4Func (&sNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer);
-    WELS_READ_VERIFY (ParseIntra4x4Mode (pCtx, &sNeighAvail, pIntraPredMode, pBs, pCurLayer));
+    if (pCtx->pPps->bTransform8x8ModeFlag) {
+      WELS_READ_VERIFY (BsGetOneBit (pBs, &uiCode)); //transform_size_8x8_flag
+      pCurLayer->pTransformSize8x8Flag[iMbXy] = !!uiCode;
+      if (pCurLayer->pTransformSize8x8Flag[iMbXy]) {
+        uiMbType = pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA8x8;
+      }
+    }
+    if (!pCurLayer->pTransformSize8x8Flag[iMbXy]) {
+      pCtx->pFillInfoCacheIntraNxNFunc (&sNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer);
+      WELS_READ_VERIFY (ParseIntra4x4Mode (pCtx, &sNeighAvail, pIntraPredMode, pBs, pCurLayer));
+    } else {
+      pCtx->pFillInfoCacheIntraNxNFunc (&sNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer);
+      WELS_READ_VERIFY (ParseIntra8x8Mode (pCtx, &sNeighAvail, pIntraPredMode, pBs, pCurLayer));
+    }
 
     //uiCbp
     WELS_READ_VERIFY (BsGetUe (pBs, &uiCode)); //coded_block_pattern
@@ -1199,6 +1382,8 @@
     uiCbpL = uiCbp & 15;
   } else { //I_PCM exclude, we can ignore it
     pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA16x16;
+    pCurLayer->pTransformSize8x8Flag[iMbXy] = false;
+    pCurLayer->pNoSubMbPartSizeLessThan8x8Flag[iMbXy] = true;
     pCurLayer->pIntraPredMode[iMbXy][7] = (uiMbType - 1) & 3;
     pCurLayer->pCbp[iMbXy] = g_kuiI16CbpTable[ (uiMbType - 1) >> 2];
     uiCbpC = pCtx->pSps->uiChromaFormatIdc ? pCurLayer->pCbp[iMbXy] >> 4 : 0;
@@ -1266,27 +1451,51 @@
         ST32A4 (&pNzc[12], LD32 (&pNonZeroCount[1 + 8 * 4]));
       }
     } else { //non-MB_TYPE_INTRA16x16
-      for (iId8x8 = 0; iId8x8 < 4; iId8x8++) {
-        if (uiCbpL & (1 << iId8x8)) {
-          int32_t iIndex = (iId8x8 << 2);
-          for (iId4x4 = 0; iId4x4 < 4; iId4x4++) {
-            //Luma (DC and AC decoding together)
-            if (WelsResidualBlockCavlc (pVlcTable, pNonZeroCount, pBs, iIndex,
-                                        iScanIdxEnd - iScanIdxStart + 1, g_kuiZigzagScan + iScanIdxStart,
-                                        LUMA_DC_AC_INTRA, pCurLayer->pScaledTCoeff[iMbXy] + (iIndex << 4), pCurLayer->pLumaQp[iMbXy], pCtx)) {
-              return -1;//abnormal
+      if (pCurLayer->pTransformSize8x8Flag[iMbXy]) {
+        for (iId8x8 = 0; iId8x8 < 4; iId8x8++) {
+          iMbResProperty = (IS_INTRA (pCurLayer->pMbType[iMbXy])) ? LUMA_DC_AC_INTRA : LUMA_DC_AC_INTER;
+          if (uiCbpL & (1 << iId8x8)) {
+            int32_t iIndex = (iId8x8 << 2);
+            for (iId4x4 = 0; iId4x4 < 4; iId4x4++) {
+              if (WelsResidualBlockCavlc8x8 (pVlcTable, pNonZeroCount, pBs, iIndex,
+                                             iScanIdxEnd - iScanIdxStart + 1, g_kuiZigzagScan8x8 + iScanIdxStart, iMbResProperty,
+                                             pCurLayer->pScaledTCoeff[iMbXy] + (iId8x8 << 6), iId4x4, pCurLayer->pLumaQp[iMbXy], pCtx)) {
+                return -1;
+              }
+              iIndex++;
             }
-            iIndex++;
+          } else {
+            ST16 (&pNonZeroCount[g_kuiCache48CountScan4Idx[iId8x8 << 2]], 0);
+            ST16 (&pNonZeroCount[g_kuiCache48CountScan4Idx[ (iId8x8 << 2) + 2]], 0);
           }
-        } else {
-          ST16 (&pNonZeroCount[g_kuiCache48CountScan4Idx[ (iId8x8 << 2)]], 0);
-          ST16 (&pNonZeroCount[g_kuiCache48CountScan4Idx[ (iId8x8 << 2) + 2]], 0);
         }
+        ST32A4 (&pNzc[0], LD32 (&pNonZeroCount[1 + 8 * 1]));
+        ST32A4 (&pNzc[4], LD32 (&pNonZeroCount[1 + 8 * 2]));
+        ST32A4 (&pNzc[8], LD32 (&pNonZeroCount[1 + 8 * 3]));
+        ST32A4 (&pNzc[12], LD32 (&pNonZeroCount[1 + 8 * 4]));
+      } else {
+        for (iId8x8 = 0; iId8x8 < 4; iId8x8++) {
+          if (uiCbpL & (1 << iId8x8)) {
+            int32_t iIndex = (iId8x8 << 2);
+            for (iId4x4 = 0; iId4x4 < 4; iId4x4++) {
+              //Luma (DC and AC decoding together)
+              if (WelsResidualBlockCavlc (pVlcTable, pNonZeroCount, pBs, iIndex,
+                                          iScanIdxEnd - iScanIdxStart + 1, g_kuiZigzagScan + iScanIdxStart,
+                                          LUMA_DC_AC_INTRA, pCurLayer->pScaledTCoeff[iMbXy] + (iIndex << 4), pCurLayer->pLumaQp[iMbXy], pCtx)) {
+                return -1;//abnormal
+              }
+              iIndex++;
+            }
+          } else {
+            ST16 (&pNonZeroCount[g_kuiCache48CountScan4Idx[ (iId8x8 << 2)]], 0);
+            ST16 (&pNonZeroCount[g_kuiCache48CountScan4Idx[ (iId8x8 << 2) + 2]], 0);
+          }
+        }
+        ST32A4 (&pNzc[0], LD32 (&pNonZeroCount[1 + 8 * 1]));
+        ST32A4 (&pNzc[4], LD32 (&pNonZeroCount[1 + 8 * 2]));
+        ST32A4 (&pNzc[8], LD32 (&pNonZeroCount[1 + 8 * 3]));
+        ST32A4 (&pNzc[12], LD32 (&pNonZeroCount[1 + 8 * 4]));
       }
-      ST32A4 (&pNzc[0], LD32 (&pNonZeroCount[1 + 8 * 1]));
-      ST32A4 (&pNzc[4], LD32 (&pNonZeroCount[1 + 8 * 2]));
-      ST32A4 (&pNzc[8], LD32 (&pNonZeroCount[1 + 8 * 3]));
-      ST32A4 (&pNzc[12], LD32 (&pNonZeroCount[1 + 8 * 4]));
     }
 
     //chroma
@@ -1399,6 +1608,7 @@
     int8_t	iRefIndex[LIST_A][30];
     pCurLayer->pMbType[iMbXy] = g_ksInterMbTypeInfo[uiMbType].iType;
     WelsFillCacheInter (&sNeighAvail, pNonZeroCount, iMotionVector, iRefIndex, pCurLayer);
+
     if (ParseInterInfo (pCtx, iMotionVector, iRefIndex, pBs)) {
       return -1;//abnormal
     }
@@ -1484,12 +1694,24 @@
       if (0 == uiMbType) {
         ENFORCE_STACK_ALIGN_1D (int8_t, pIntraPredMode, 48, 16);
         pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA4x4;
-        pCtx->pFillInfoCacheIntra4x4Func (&sNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer);
-        if (ParseIntra4x4Mode (pCtx, &sNeighAvail, pIntraPredMode, pBs, pCurLayer)) {
-          return -1;
+        if (pCtx->pPps->bTransform8x8ModeFlag) {
+          WELS_READ_VERIFY (BsGetOneBit (pBs, &uiCode)); //transform_size_8x8_flag
+          pCurLayer->pTransformSize8x8Flag[iMbXy] = !!uiCode;
+          if (pCurLayer->pTransformSize8x8Flag[iMbXy]) {
+            uiMbType = pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA8x8;
+          }
         }
+        if (!pCurLayer->pTransformSize8x8Flag[iMbXy]) {
+          pCtx->pFillInfoCacheIntraNxNFunc (&sNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer);
+          WELS_READ_VERIFY (ParseIntra4x4Mode (pCtx, &sNeighAvail, pIntraPredMode, pBs, pCurLayer));
+        } else {
+          pCtx->pFillInfoCacheIntraNxNFunc (&sNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer);
+          WELS_READ_VERIFY (ParseIntra8x8Mode (pCtx, &sNeighAvail, pIntraPredMode, pBs, pCurLayer));
+        }
       } else { //I_PCM exclude, we can ignore it
         pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA16x16;
+        pCurLayer->pTransformSize8x8Flag[iMbXy] = false;
+        pCurLayer->pNoSubMbPartSizeLessThan8x8Flag[iMbXy] = true;
         pCurLayer->pIntraPredMode[iMbXy][7] = (uiMbType - 1) & 3;
         pCurLayer->pCbp[iMbXy] = g_kuiI16CbpTable[ (uiMbType - 1) >> 2];
         uiCbpC = pCtx->pSps->uiChromaFormatIdc ? pCurLayer->pCbp[iMbXy] >> 4 : 0;
@@ -1510,7 +1732,7 @@
         return ERR_INFO_INVALID_CBP;
       if (!pCtx->pSps->uiChromaFormatIdc && (uiCbp > 15))
         return ERR_INFO_INVALID_CBP;
-      if (MB_TYPE_INTRA4x4 == pCurLayer->pMbType[iMbXy]) {
+      if (MB_TYPE_INTRA4x4 == pCurLayer->pMbType[iMbXy] || MB_TYPE_INTRA8x8 == pCurLayer->pMbType[iMbXy]) {
 
         uiCbp = pCtx->pSps->uiChromaFormatIdc ? g_kuiIntra4x4CbpTable[uiCbp] : g_kuiIntra4x4CbpTable400[uiCbp];
       } else //inter
@@ -1520,6 +1742,20 @@
     pCurLayer->pCbp[iMbXy] = uiCbp;
     uiCbpC = pCurLayer->pCbp[iMbXy] >> 4;
     uiCbpL = pCurLayer->pCbp[iMbXy] & 15;
+
+    // Needs modification when B pictures are added
+    bool bNeedParseTransformSize8x8Flag =
+      (((pCurLayer->pMbType[iMbXy] >= MB_TYPE_16x16 && pCurLayer->pMbType[iMbXy] <= MB_TYPE_8x16)
+        || pCurLayer->pNoSubMbPartSizeLessThan8x8Flag[iMbXy])
+       && (pCurLayer->pMbType[iMbXy] != MB_TYPE_INTRA8x8)
+       && (pCurLayer->pMbType[iMbXy] != MB_TYPE_INTRA4x4)
+       && (uiCbpL > 0)
+       && (pCtx->pPps->bTransform8x8ModeFlag));
+
+    if (bNeedParseTransformSize8x8Flag) {
+      WELS_READ_VERIFY (BsGetOneBit (pBs, &uiCode)); //transform_size_8x8_flag
+      pCurLayer->pTransformSize8x8Flag[iMbXy] = !!uiCode;
+    }
   }
 
   ST32A4 (&pNzc[0], 0);
@@ -1577,28 +1813,52 @@
         ST32A4 (&pNzc[12], LD32 (&pNonZeroCount[1 + 8 * 4]));
       }
     } else { //non-MB_TYPE_INTRA16x16
-      for (iId8x8 = 0; iId8x8 < 4; iId8x8++) {
-        iMbResProperty = (IS_INTRA (pCurLayer->pMbType[iMbXy])) ? LUMA_DC_AC_INTRA : LUMA_DC_AC_INTER;
-        if (uiCbpL & (1 << iId8x8)) {
-          int32_t iIndex = (iId8x8 << 2);
-          for (iId4x4 = 0; iId4x4 < 4; iId4x4++) {
-            //Luma (DC and AC decoding together)
-            if (WelsResidualBlockCavlc (pVlcTable, pNonZeroCount, pBs, iIndex,
-                                        iScanIdxEnd - iScanIdxStart + 1, g_kuiZigzagScan + iScanIdxStart, iMbResProperty,
-                                        pCurLayer->pScaledTCoeff[iMbXy] + (iIndex << 4), pCurLayer->pLumaQp[iMbXy], pCtx)) {
-              return -1;//abnormal
+      if (pCurLayer->pTransformSize8x8Flag[iMbXy]) {
+        for (iId8x8 = 0; iId8x8 < 4; iId8x8++) {
+          iMbResProperty = (IS_INTRA (pCurLayer->pMbType[iMbXy])) ? LUMA_DC_AC_INTRA : LUMA_DC_AC_INTER;
+          if (uiCbpL & (1 << iId8x8)) {
+            int32_t iIndex = (iId8x8 << 2);
+            for (iId4x4 = 0; iId4x4 < 4; iId4x4++) {
+              if (WelsResidualBlockCavlc8x8 (pVlcTable, pNonZeroCount, pBs, iIndex,
+                                             iScanIdxEnd - iScanIdxStart + 1, g_kuiZigzagScan8x8 + iScanIdxStart, iMbResProperty,
+                                             pCurLayer->pScaledTCoeff[iMbXy] + (iId8x8 << 6), iId4x4, pCurLayer->pLumaQp[iMbXy], pCtx)) {
+                return -1;
+              }
+              iIndex++;
             }
-            iIndex++;
+          } else {
+            ST16 (&pNonZeroCount[g_kuiCache48CountScan4Idx[iId8x8 << 2]], 0);
+            ST16 (&pNonZeroCount[g_kuiCache48CountScan4Idx[ (iId8x8 << 2) + 2]], 0);
           }
-        } else {
-          ST16 (&pNonZeroCount[g_kuiCache48CountScan4Idx[iId8x8 << 2]], 0);
-          ST16 (&pNonZeroCount[g_kuiCache48CountScan4Idx[ (iId8x8 << 2) + 2]], 0);
         }
+        ST32A4 (&pNzc[0], LD32 (&pNonZeroCount[1 + 8 * 1]));
+        ST32A4 (&pNzc[4], LD32 (&pNonZeroCount[1 + 8 * 2]));
+        ST32A4 (&pNzc[8], LD32 (&pNonZeroCount[1 + 8 * 3]));
+        ST32A4 (&pNzc[12], LD32 (&pNonZeroCount[1 + 8 * 4]));
+      } else { // Normal T4x4
+        for (iId8x8 = 0; iId8x8 < 4; iId8x8++) {
+          iMbResProperty = (IS_INTRA (pCurLayer->pMbType[iMbXy])) ? LUMA_DC_AC_INTRA : LUMA_DC_AC_INTER;
+          if (uiCbpL & (1 << iId8x8)) {
+            int32_t iIndex = (iId8x8 << 2);
+            for (iId4x4 = 0; iId4x4 < 4; iId4x4++) {
+              //Luma (DC and AC decoding together)
+              if (WelsResidualBlockCavlc (pVlcTable, pNonZeroCount, pBs, iIndex,
+                                          iScanIdxEnd - iScanIdxStart + 1, g_kuiZigzagScan + iScanIdxStart, iMbResProperty,
+                                          pCurLayer->pScaledTCoeff[iMbXy] + (iIndex << 4), pCurLayer->pLumaQp[iMbXy], pCtx)) {
+                return -1;//abnormal
+              }
+              iIndex++;
+            }
+          } else {
+            ST16 (&pNonZeroCount[g_kuiCache48CountScan4Idx[iId8x8 << 2]], 0);
+            ST16 (&pNonZeroCount[g_kuiCache48CountScan4Idx[ (iId8x8 << 2) + 2]], 0);
+          }
+        }
+        ST32A4 (&pNzc[0], LD32 (&pNonZeroCount[1 + 8 * 1]));
+        ST32A4 (&pNzc[4], LD32 (&pNonZeroCount[1 + 8 * 2]));
+        ST32A4 (&pNzc[8], LD32 (&pNonZeroCount[1 + 8 * 3]));
+        ST32A4 (&pNzc[12], LD32 (&pNonZeroCount[1 + 8 * 4]));
       }
-      ST32A4 (&pNzc[0], LD32 (&pNonZeroCount[1 + 8 * 1]));
-      ST32A4 (&pNzc[4], LD32 (&pNonZeroCount[1 + 8 * 2]));
-      ST32A4 (&pNzc[8], LD32 (&pNonZeroCount[1 + 8 * 3]));
-      ST32A4 (&pNzc[12], LD32 (&pNonZeroCount[1 + 8 * 4]));
     }
 
 
@@ -1660,6 +1920,9 @@
   int32_t iBaseModeFlag, i;
   int32_t iRet = 0; //should have the return value to indicate decoding error or not, It's NECESSARY--2010.4.15
   uint32_t uiCode;
+
+  pCurLayer->pNoSubMbPartSizeLessThan8x8Flag[iMbXy] = true;
+  pCurLayer->pTransformSize8x8Flag[iMbXy] = false;
 
   if (-1 == pSlice->iMbSkipRun) {
     WELS_READ_VERIFY (BsGetUe (pBs, &uiCode)); //mb_skip_run
--- a/codec/decoder/core/src/decoder.cpp
+++ b/codec/decoder/core/src/decoder.cpp
@@ -862,6 +862,21 @@
   pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU    ] = WelsI4x4LumaPredHU_c;
   pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD    ] = WelsI4x4LumaPredHD_c;
 
+  pCtx->pGetI8x8LumaPredFunc[I4_PRED_V     ] = WelsI8x8LumaPredV_c;
+  pCtx->pGetI8x8LumaPredFunc[I4_PRED_H     ] = WelsI8x8LumaPredH_c;
+  pCtx->pGetI8x8LumaPredFunc[I4_PRED_DC    ] = WelsI8x8LumaPredDc_c;
+  pCtx->pGetI8x8LumaPredFunc[I4_PRED_DC_L  ] = WelsI8x8LumaPredDcLeft_c;
+  pCtx->pGetI8x8LumaPredFunc[I4_PRED_DC_T  ] = WelsI8x8LumaPredDcTop_c;
+  pCtx->pGetI8x8LumaPredFunc[I4_PRED_DC_128] = WelsI8x8LumaPredDcNA_c;
+  pCtx->pGetI8x8LumaPredFunc[I4_PRED_DDL    ] = WelsI8x8LumaPredDDL_c;
+  pCtx->pGetI8x8LumaPredFunc[I4_PRED_DDL_TOP] = WelsI8x8LumaPredDDLTop_c;
+  pCtx->pGetI8x8LumaPredFunc[I4_PRED_DDR    ] = WelsI8x8LumaPredDDR_c;
+  pCtx->pGetI8x8LumaPredFunc[I4_PRED_VL    ] = WelsI8x8LumaPredVL_c;
+  pCtx->pGetI8x8LumaPredFunc[I4_PRED_VL_TOP] = WelsI8x8LumaPredVLTop_c;
+  pCtx->pGetI8x8LumaPredFunc[I4_PRED_VR    ] = WelsI8x8LumaPredVR_c;
+  pCtx->pGetI8x8LumaPredFunc[I4_PRED_HU    ] = WelsI8x8LumaPredHU_c;
+  pCtx->pGetI8x8LumaPredFunc[I4_PRED_HD    ] = WelsI8x8LumaPredHD_c;
+
   pCtx->pGetIChromaPredFunc[C_PRED_DC    ] = WelsIChromaPredDc_c;
   pCtx->pGetIChromaPredFunc[C_PRED_H     ] = WelsIChromaPredH_c;
   pCtx->pGetIChromaPredFunc[C_PRED_V     ] = WelsIChromaPredV_c;
@@ -872,6 +887,8 @@
 
   pCtx->pIdctResAddPredFunc	= IdctResAddPred_c;
 
+  pCtx->pIdctResAddPredFunc8x8 = IdctResAddPred8x8_c;
+
 #if defined(HAVE_NEON)
   if (pCtx->uiCpuFlag & WELS_CPU_NEON) {
     pCtx->pIdctResAddPredFunc	= IdctResAddPred_neon;
@@ -931,7 +948,7 @@
   if (pCtx->uiCpuFlag & WELS_CPU_MMXEXT) {
     pCtx->pIdctResAddPredFunc	= IdctResAddPred_mmx;
 
-    /////////mmx code opt---
+    ///////mmx code opt---
     pCtx->pGetIChromaPredFunc[C_PRED_H]      = WelsDecoderIChromaPredH_mmx;
     pCtx->pGetIChromaPredFunc[C_PRED_V]      = WelsDecoderIChromaPredV_mmx;
     pCtx->pGetIChromaPredFunc[C_PRED_DC_L  ] = WelsDecoderIChromaPredDcLeft_mmx;
--- a/codec/decoder/core/src/decoder_core.cpp
+++ b/codec/decoder/core/src/decoder_core.cpp
@@ -1238,11 +1238,16 @@
                                   int8_t) * MB_BLOCK4x4_NUM, "pCtx->sMb.pRefIndex[][]");
     pCtx->sMb.pLumaQp[i] = (int8_t*)WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
                            "pCtx->sMb.pLumaQp[]");
+    pCtx->sMb.pNoSubMbPartSizeLessThan8x8Flag[i] = (bool*)WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+          bool),
+        "pCtx->sMb.pNoSubMbPartSizeLessThan8x8Flag[]");
+    pCtx->sMb.pTransformSize8x8Flag[i] = (bool*)WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (bool),
+                                         "pCtx->sMb.pTransformSize8x8Flag[]");
     pCtx->sMb.pChromaQp[i] = (int8_t (*)[2])WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t) * 2,
                              "pCtx->sMb.pChromaQp[]");
     pCtx->sMb.pMvd[i][0] = (int16_t (*)[16][2])WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
                              int16_t) * MV_A * MB_BLOCK4x4_NUM, "pCtx->sMb.pMvd[][]");
-    pCtx->sMb.pCbfDc[i] = (uint8_t*)WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (uint8_t),
+    pCtx->sMb.pCbfDc[i] = (uint16_t*)WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (uint16_t),
                           "pCtx->sMb.pCbfDc[]");
     pCtx->sMb.pNzc[i] = (int8_t (*)[24])WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t) * 24,
                         "pCtx->sMb.pNzc[]");
@@ -1255,6 +1260,8 @@
                                   "pCtx->sMb.pIntraPredMode[]");
     pCtx->sMb.pIntra4x4FinalMode[i] = (int8_t (*)[MB_BLOCK4x4_NUM])WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight *
                                       sizeof (int8_t) * MB_BLOCK4x4_NUM, "pCtx->sMb.pIntra4x4FinalMode[]");
+    pCtx->sMb.pIntraNxNAvailFlag[i] = (uint8_t (*))WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
+                                      "pCtx->sMb.pIntraNxNAvailFlag");
     pCtx->sMb.pChromaPredMode[i] = (int8_t*)WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
                                    "pCtx->sMb.pChromaPredMode[]");
     pCtx->sMb.pCbp[i] = (int8_t*)WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
@@ -1341,6 +1348,18 @@
       pCtx->sMb.pRefIndex[i][0] = NULL;
     }
 
+    if (pCtx->sMb.pNoSubMbPartSizeLessThan8x8Flag[i]) {
+      WelsFree (pCtx->sMb.pNoSubMbPartSizeLessThan8x8Flag[i], "pCtx->sMb.pNoSubMbPartSizeLessThan8x8Flag[]");
+
+      pCtx->sMb.pNoSubMbPartSizeLessThan8x8Flag[i] = NULL;
+    }
+
+    if (pCtx->sMb.pTransformSize8x8Flag[i]) {
+      WelsFree (pCtx->sMb.pTransformSize8x8Flag[i], "pCtx->sMb.pTransformSize8x8Flag[]");
+
+      pCtx->sMb.pTransformSize8x8Flag[i] = NULL;
+    }
+
     if (pCtx->sMb.pLumaQp[i]) {
       WelsFree (pCtx->sMb.pLumaQp[i], "pCtx->sMb.pLumaQp[]");
 
@@ -1393,6 +1412,12 @@
       pCtx->sMb.pIntra4x4FinalMode[i] = NULL;
     }
 
+    if (pCtx->sMb.pIntraNxNAvailFlag[i]) {
+      WelsFree (pCtx->sMb.pIntraNxNAvailFlag[i], "pCtx->sMb.pIntraNxNAvailFlag");
+
+      pCtx->sMb.pIntraNxNAvailFlag[i] = NULL;
+    }
+
     if (pCtx->sMb.pChromaPredMode[i]) {
       WelsFree (pCtx->sMb.pChromaPredMode[i], "pCtx->sMb.pChromaPredMode[]");
 
@@ -1989,7 +2014,7 @@
   if (kuiQualityId == BASE_QUALITY_ID) {
     pDqLayer->pRefPicListReordering		= &pSh->pRefPicListReordering;
     pDqLayer->pRefPicMarking		= &pSh->sRefMarking;
-    
+
     if (pSh->pPps->bWeightedPredFlag) {
       pDqLayer->bUseWeightPredictionFlag = true;
       pDqLayer->pPredWeightTable    = &pSh->sPredWeightTable;
@@ -2029,6 +2054,8 @@
     pCurDq->pSliceIdc		= pCtx->sMb.pSliceIdc[0];
     pCurDq->pMv[0]			= pCtx->sMb.pMv[0][0];
     pCurDq->pRefIndex[0]    = pCtx->sMb.pRefIndex[0][0];
+    pCurDq->pNoSubMbPartSizeLessThan8x8Flag = pCtx->sMb.pNoSubMbPartSizeLessThan8x8Flag[0];
+    pCurDq->pTransformSize8x8Flag = pCtx->sMb.pTransformSize8x8Flag[0];
     pCurDq->pLumaQp         = pCtx->sMb.pLumaQp[0];
     pCurDq->pChromaQp       = pCtx->sMb.pChromaQp[0];
     pCurDq->pMvd[0]       = pCtx->sMb.pMvd[0][0];
@@ -2038,6 +2065,7 @@
     pCurDq->pScaledTCoeff   = pCtx->sMb.pScaledTCoeff[0];
     pCurDq->pIntraPredMode  = pCtx->sMb.pIntraPredMode[0];
     pCurDq->pIntra4x4FinalMode = pCtx->sMb.pIntra4x4FinalMode[0];
+    pCurDq->pIntraNxNAvailFlag = pCtx->sMb.pIntraNxNAvailFlag[0];
     pCurDq->pChromaPredMode = pCtx->sMb.pChromaPredMode[0];
     pCurDq->pCbp            = pCtx->sMb.pCbp[0];
     pCurDq->pSubMbType      = pCtx->sMb.pSubMbType[0];
--- a/codec/decoder/core/src/get_intra_predictor.cpp
+++ b/codec/decoder/core/src/get_intra_predictor.cpp
@@ -380,6 +380,507 @@
   ST32A4 (pPred + kiStride3, LD32 (kuiList));
 }
 
+void WelsI8x8LumaPredV_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) { // 8x8 luma vertical prediction: all 8 rows get the filtered samples read at pPred[x - kiStride]
+  uint64_t uiTop = 0; // the 8 filtered top samples packed into one 64-bit word
+  int32_t iStride[8]; // NOTE(review): filled below but never read here; the stores use kiStride * i instead
+  uint8_t uiPixelFilterT[8]; // (1,2,1)/4 rounded average of the samples at pPred[x - kiStride], x = 0..7
+  int32_t i;
+
+  for (iStride[0] = 0, i = 1; i < 8; i++) {
+    iStride[i] = iStride[i - 1] + kiStride;
+  }
+
+  uiPixelFilterT[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-kiStride] << 1) + pPred[1 - kiStride] + 2) >> 2) : ((
+                        pPred[-kiStride] * 3 + pPred[1 - kiStride] + 2) >> 2); // when !bTLAvail the top-left sample is not read; sample 0 is weighted 3x
+  for (i = 1; i < 7; i++) {
+    uiPixelFilterT[i] = ((pPred[i - 1 - kiStride] + (pPred[i - kiStride] << 1) + pPred[i + 1 - kiStride] + 2) >> 2);
+  }
+  uiPixelFilterT[7] = bTRAvail ? ((pPred[6 - kiStride] + (pPred[7 - kiStride] << 1) + pPred[8 - kiStride] + 2) >> 2) : ((
+                        pPred[6 - kiStride] + pPred[7 - kiStride] * 3 + 2) >> 2); // when !bTRAvail pPred[8 - kiStride] is not read; sample 7 is weighted 3x
+
+  // 8-89: pack so that uiPixelFilterT[0] ends up in the least-significant byte
+  for (i = 7; i >= 0; i--) {
+    uiTop = ((uiTop << 8) | uiPixelFilterT[i]);
+  }
+
+  for (i = 0; i < 8; i++) {
+    ST64A8 (pPred + kiStride * i, uiTop); // NOTE(review): sample 0 lands at pPred[0] only if ST64A8 stores little-endian - confirm for big-endian targets
+  }
+}
+
+void WelsI8x8LumaPredH_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) { // 8x8 luma horizontal prediction: row i is filled with its filtered left sample; bTRAvail is not read
+  uint64_t uiLeft; // one filtered left sample replicated into all 8 bytes
+  int32_t iStride[8]; // iStride[i] == i * kiStride, the offset of row i
+  uint8_t uiPixelFilterL[8]; // (1,2,1)/4 rounded average of the samples at pPred[-1 + iStride[y]], y = 0..7
+  int32_t i;
+
+  for (iStride[0] = 0, i = 1; i < 8; i++) {
+    iStride[i] = iStride[i - 1] + kiStride;
+  }
+
+  uiPixelFilterL[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-1] << 1) + pPred[-1 + iStride[1]] + 2) >> 2) : ((
+                        pPred[-1] * 3 + pPred[-1 + iStride[1]] + 2) >> 2); // when !bTLAvail the top-left sample is not read; pPred[-1] is weighted 3x
+  for (i = 1; i < 7; i++) {
+    uiPixelFilterL[i] = ((pPred[-1 + iStride[i - 1]] + (pPred[-1 + iStride[i]] << 1) + pPred[-1 + iStride[i + 1]] + 2) >>
+                         2);
+  }
+  uiPixelFilterL[7] = ((pPred[-1 + iStride[6]] + pPred[-1 + iStride[7]] * 3 + 2) >> 2); // only rows 6 and 7 are read; row 7 is weighted 3x
+
+  // 8-90: one store per row
+  for (i = 0; i < 8; i++) {
+    uiLeft = 0x0101010101010101U * uiPixelFilterL[i]; // the multiply copies the byte into every byte of the word
+    ST64A8 (pPred + iStride[i], uiLeft);
+  }
+}
+
+void WelsI8x8LumaPredDc_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) { // 8x8 luma DC prediction: fills the block with the rounded mean of the 8 filtered left and 8 filtered top samples
+  int32_t iStride[8]; // iStride[i] == i * kiStride, the offset of row i
+  uint8_t uiPixelFilterL[8]; // (1,2,1)/4 rounded average of the samples at pPred[-1 + iStride[y]]
+  uint8_t uiPixelFilterT[8]; // (1,2,1)/4 rounded average of the samples at pPred[x - kiStride]
+  uint16_t uiTotal = 0; // sum of 16 byte values, at most 16 * 255, so 16 bits suffice
+  int32_t i;
+
+  for (iStride[0] = 0, i = 1; i < 8; i++) {
+    iStride[i] = iStride[i - 1] + kiStride;
+  }
+
+  uiPixelFilterL[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-1] << 1) + pPred[-1 + iStride[1]] + 2) >> 2) : ((
+                        pPred[-1] * 3 + pPred[-1 + iStride[1]] + 2) >> 2); // when !bTLAvail the top-left sample is not read
+  uiPixelFilterT[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-kiStride] << 1) + pPred[1 - kiStride] + 2) >> 2) : ((
+                        pPred[-kiStride] * 3 + pPred[1 - kiStride] + 2) >> 2); // when !bTLAvail the top-left sample is not read
+  for (i = 1; i < 7; i++) {
+    uiPixelFilterL[i] = ((pPred[-1 + iStride[i - 1]] + (pPred[-1 + iStride[i]] << 1) + pPred[-1 + iStride[i + 1]] + 2) >>
+                         2);
+    uiPixelFilterT[i] = ((pPred[i - 1 - kiStride] + (pPred[i - kiStride] << 1) + pPred[i + 1 - kiStride] + 2) >> 2);
+  }
+  uiPixelFilterL[7] = ((pPred[-1 + iStride[6]] + pPred[-1 + iStride[7]] * 3 + 2) >> 2); // only rows 6 and 7 are read; row 7 is weighted 3x
+  uiPixelFilterT[7] = bTRAvail ? ((pPred[6 - kiStride] + (pPred[7 - kiStride] << 1) + pPred[8 - kiStride] + 2) >> 2) : ((
+                        pPred[6 - kiStride] + pPred[7 - kiStride] * 3 + 2) >> 2); // when !bTRAvail pPred[8 - kiStride] is not read
+
+  // 8-91: sum all 16 filtered samples
+  for (i = 0; i < 8; i++) {
+    uiTotal += uiPixelFilterL[i];
+    uiTotal += uiPixelFilterT[i];
+  }
+
+  const uint8_t kuiMean = ((uiTotal + 8) >> 4); // divide by 16 with rounding
+  const uint64_t kuiMean64 = 0x0101010101010101U * kuiMean; // mean replicated into all 8 bytes
+
+  for (i = 0; i < 8; i++) {
+    ST64A8 (pPred + iStride[i], kuiMean64);
+  }
+}
+
+void WelsI8x8LumaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) { // 8x8 luma DC prediction from the left column only; no sample at pPred[x - kiStride] with x >= 0 is read, bTRAvail is not read
+  int32_t iStride[8]; // iStride[i] == i * kiStride, the offset of row i
+  uint8_t uiPixelFilterL[8]; // (1,2,1)/4 rounded average of the samples at pPred[-1 + iStride[y]]
+  uint16_t uiTotal = 0; // sum of the 8 filtered left samples
+  int32_t i;
+
+  for (iStride[0] = 0, i = 1; i < 8; i++) {
+    iStride[i] = iStride[i - 1] + kiStride;
+  }
+
+  uiPixelFilterL[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-1] << 1) + pPred[-1 + iStride[1]] + 2) >> 2) : ((
+                        pPred[-1] * 3 + pPred[-1 + iStride[1]] + 2) >> 2); // when !bTLAvail the top-left sample is not read
+  for (i = 1; i < 7; i++) {
+    uiPixelFilterL[i] = ((pPred[-1 + iStride[i - 1]] + (pPred[-1 + iStride[i]] << 1) + pPred[-1 + iStride[i + 1]] + 2) >>
+                         2);
+  }
+  uiPixelFilterL[7] = ((pPred[-1 + iStride[6]] + pPred[-1 + iStride[7]] * 3 + 2) >> 2); // only rows 6 and 7 are read; row 7 is weighted 3x
+
+  // 8-92: sum the 8 filtered left samples
+  for (i = 0; i < 8; i++) {
+    uiTotal += uiPixelFilterL[i];
+  }
+
+  const uint8_t kuiMean = ((uiTotal + 4) >> 3); // divide by 8 with rounding
+  const uint64_t kuiMean64 = 0x0101010101010101U * kuiMean; // mean replicated into all 8 bytes
+
+  for (i = 0; i < 8; i++) {
+    ST64A8 (pPred + iStride[i], kuiMean64);
+  }
+}
+
+void WelsI8x8LumaPredDcTop_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) { // 8x8 luma DC prediction from the top row only; no sample in the pPred[-1 + row] column is read
+  int32_t iStride[8]; // iStride[i] == i * kiStride, the offset of row i
+  uint8_t uiPixelFilterT[8]; // (1,2,1)/4 rounded average of the samples at pPred[x - kiStride]
+  uint16_t uiTotal = 0; // sum of the 8 filtered top samples
+  int32_t i;
+
+  for (iStride[0] = 0, i = 1; i < 8; i++) {
+    iStride[i] = iStride[i - 1] + kiStride;
+  }
+
+  uiPixelFilterT[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-kiStride] << 1) + pPred[1 - kiStride] + 2) >> 2) : ((
+                        pPred[-kiStride] * 3 + pPred[1 - kiStride] + 2) >> 2); // when !bTLAvail the top-left sample is not read
+  for (i = 1; i < 7; i++) {
+    uiPixelFilterT[i] = ((pPred[i - 1 - kiStride] + (pPred[i - kiStride] << 1) + pPred[i + 1 - kiStride] + 2) >> 2);
+  }
+  uiPixelFilterT[7] = bTRAvail ? ((pPred[6 - kiStride] + (pPred[7 - kiStride] << 1) + pPred[8 - kiStride] + 2) >> 2) : ((
+                        pPred[6 - kiStride] + pPred[7 - kiStride] * 3 + 2) >> 2); // when !bTRAvail pPred[8 - kiStride] is not read
+
+  // 8-93: sum the 8 filtered top samples
+  for (i = 0; i < 8; i++) {
+    uiTotal += uiPixelFilterT[i];
+  }
+
+  const uint8_t kuiMean = ((uiTotal + 4) >> 3); // divide by 8 with rounding
+  const uint64_t kuiMean64 = 0x0101010101010101U * kuiMean; // mean replicated into all 8 bytes
+
+  for (i = 0; i < 8; i++) {
+    ST64A8 (pPred + iStride[i], kuiMean64);
+  }
+}
+
+void WelsI8x8LumaPredDcNA_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) { // 8x8 luma DC prediction that reads no neighbour at all: every byte of the block becomes 0x80; bTLAvail/bTRAvail are not read
+  // for normal 8 bit depth, 8-94
+  const uint64_t kuiDC64		= 0x8080808080808080U;
+
+  int32_t iStride[8]; // iStride[i] == i * kiStride, the offset of row i
+  int32_t i;
+  ST64A8 (pPred, kuiDC64); // row 0 is stored here, rows 1..7 inside the loop
+  for (iStride[0] = 0, i = 1; i < 8; i++) {
+    iStride[i] = iStride[i - 1] + kiStride;
+    ST64A8 (pPred + iStride[i], kuiDC64);
+  }
+}
+
+/*down left*/
+void WelsI8x8LumaPredDDL_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) { // 8x8 luma diagonal-down-left prediction; reads 16 samples pPred[0..15 - kiStride] unconditionally, bTRAvail is not read
+  // Top and Top-right available
+  int32_t iStride[8]; // iStride[i] == i * kiStride, the offset of row i
+  uint8_t uiPixelFilterT[16]; // (1,2,1)/4 rounded average of the 16 samples at pPred[x - kiStride]
+  int32_t i, j;
+
+  for (iStride[0] = 0, i = 1; i < 8; i++) {
+    iStride[i] = iStride[i - 1] + kiStride;
+  }
+
+  uiPixelFilterT[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-kiStride] << 1) + pPred[1 - kiStride] + 2) >> 2) : ((
+                        pPred[-kiStride] * 3 + pPred[1 - kiStride] + 2) >> 2); // when !bTLAvail the top-left sample is not read
+  for (i = 1; i < 15; i++) {
+    uiPixelFilterT[i] = ((pPred[i - 1 - kiStride] + (pPred[i - kiStride] << 1) + pPred[i + 1 - kiStride] + 2) >> 2);
+  }
+  uiPixelFilterT[15] = ((pPred[14 - kiStride] + pPred[15 - kiStride] * 3 + 2) >> 2); // nothing beyond x = 15 is read; sample 15 is weighted 3x
+
+  for (i = 0; i < 8; i++) { // y
+    for (j = 0; j < 8; j++) { // x
+      if (i == 7 && j == 7) { // 8-95: bottom-right sample, the general form would index uiPixelFilterT[16]
+        pPred[j + iStride[i]] = (uiPixelFilterT[14] + 3 * uiPixelFilterT[15] + 2) >> 2;
+      } else { // 8-96: (1,2,1)/4 over filtered samples x+y .. x+y+2
+        pPred[j + iStride[i]] = (uiPixelFilterT[i + j] + (uiPixelFilterT[i + j + 1] << 1) + uiPixelFilterT[i + j + 2] + 2) >> 2;
+      }
+    }
+  }
+}
+
+/*down left*/
+void WelsI8x8LumaPredDDLTop_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) { // 8x8 luma diagonal-down-left prediction that reads only pPred[0..7 - kiStride]; bTRAvail is not read
+  // Top available and Top-right unavailable
+  int32_t iStride[8]; // iStride[i] == i * kiStride, the offset of row i
+  uint8_t uiPixelFilterT[16]; // entries 0..7 are filtered top samples, entries 8..15 are copies of pPred[7 - kiStride]
+  int32_t i, j;
+
+  for (iStride[0] = 0, i = 1; i < 8; i++) {
+    iStride[i] = iStride[i - 1] + kiStride;
+  }
+
+  uiPixelFilterT[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-kiStride] << 1) + pPred[1 - kiStride] + 2) >> 2) : ((
+                        pPred[-kiStride] * 3 + pPred[1 - kiStride] + 2) >> 2); // when !bTLAvail the top-left sample is not read
+  for (i = 1; i < 7; i++) {
+    uiPixelFilterT[i] = ((pPred[i - 1 - kiStride] + (pPred[i - kiStride] << 1) + pPred[i + 1 - kiStride] + 2) >> 2);
+  }
+  // p[x, -1] x=8...15 are replaced with p[7, -1]
+  uiPixelFilterT[7] = ((pPred[6 - kiStride] + pPred[7 - kiStride] * 3 + 2) >> 2); // same as the (1,2,1) filter with sample 8 replaced by sample 7
+  for (i = 8; i < 16; i++) {
+    uiPixelFilterT[i] = pPred[7 - kiStride]; // unfiltered copy; filtering a run of equal samples would give the same value
+  }
+
+  for (i = 0; i < 8; i++) { // y
+    for (j = 0; j < 8; j++) { // x
+      if (i == 7 && j == 7) { // 8-95: bottom-right sample, the general form would index uiPixelFilterT[16]
+        pPred[j + iStride[i]] = (uiPixelFilterT[14] + 3 * uiPixelFilterT[15] + 2) >> 2;
+      } else { // 8-96: (1,2,1)/4 over entries x+y .. x+y+2
+        pPred[j + iStride[i]] = (uiPixelFilterT[i + j] + (uiPixelFilterT[i + j + 1] << 1) + uiPixelFilterT[i + j + 2] + 2) >> 2;
+      }
+    }
+  }
+}
+
+/*down right*/
+void WelsI8x8LumaPredDDR_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) { // 8x8 luma diagonal-down-right prediction; reads the top-left, top and left samples unconditionally, bTLAvail is not read
+  // The TopLeft, Top, Left are all available under this mode
+  int32_t iStride[8]; // iStride[i] == i * kiStride, the offset of row i
+  uint8_t uiPixelFilterTL; // filtered corner sample pPred[-1 - kiStride]
+  uint8_t uiPixelFilterL[8]; // (1,2,1)/4 rounded average of the samples at pPred[-1 + iStride[y]]
+  uint8_t uiPixelFilterT[8]; // (1,2,1)/4 rounded average of the samples at pPred[x - kiStride]
+  int32_t i, j;
+
+  for (iStride[0] = 0, i = 1; i < 8; i++) {
+    iStride[i] = iStride[i - 1] + kiStride;
+  }
+
+  uiPixelFilterTL = (pPred[-1] + (pPred[-1 - kiStride] << 1) + pPred[-kiStride] + 2) >> 2; // corner weighted 2x between its left-column and top-row neighbours
+
+  uiPixelFilterL[0] = ((pPred[-1 - kiStride] + (pPred[-1] << 1) + pPred[-1 + iStride[1]] + 2) >> 2);
+  uiPixelFilterT[0] = ((pPred[-1 - kiStride] + (pPred[-kiStride] << 1) + pPred[1 - kiStride] + 2) >> 2);
+  for (i = 1; i < 7; i++) {
+    uiPixelFilterL[i] = ((pPred[-1 + iStride[i - 1]] + (pPred[-1 + iStride[i]] << 1) + pPred[-1 + iStride[i + 1]] + 2) >>
+                         2);
+    uiPixelFilterT[i] = ((pPred[i - 1 - kiStride] + (pPred[i - kiStride] << 1) + pPred[i + 1 - kiStride] + 2) >> 2);
+  }
+  uiPixelFilterL[7] = ((pPred[-1 + iStride[6]] + pPred[-1 + iStride[7]] * 3 + 2) >> 2); // only rows 6 and 7 are read; row 7 is weighted 3x
+  uiPixelFilterT[7] = bTRAvail ? ((pPred[6 - kiStride] + (pPred[7 - kiStride] << 1) + pPred[8 - kiStride] + 2) >> 2) : ((
+                        pPred[6 - kiStride] + pPred[7 - kiStride] * 3 + 2) >> 2); // when !bTRAvail pPred[8 - kiStride] is not read
+
+  for (i = 0; i < 8; i++) { // y
+    // 8-98, x < y-1
+    for (j = 0; j < (i - 1); j++) {
+      pPred[j + iStride[i]] = (uiPixelFilterL[i - j - 2] + (uiPixelFilterL[i - j - 1] << 1) + uiPixelFilterL[i - j] + 2) >> 2;
+    }
+    // 8-98, special case, x == y-1
+    if (i >= 1) {
+      j = i - 1;
+      pPred[j + iStride[i]] = (uiPixelFilterTL + (uiPixelFilterL[0] << 1) + uiPixelFilterL[1] + 2) >> 2; // index y-x-2 would be -1, so the corner stands in for it
+    }
+    // 8-99, x==y
+    j = i;
+    pPred[j + iStride[i]] = (uiPixelFilterT[0] + (uiPixelFilterTL << 1) + uiPixelFilterL[0] + 2) >> 2;
+    // 8-97, special case, x == y+1
+    if (i < 7) {
+      j = i + 1;
+      pPred[j + iStride[i]] = (uiPixelFilterTL + (uiPixelFilterT[0] << 1) + uiPixelFilterT[1] + 2) >> 2; // index x-y-2 would be -1, so the corner stands in for it
+    }
+    for (j = i + 2; j < 8; j++) { // 8-97, x > y+1
+      pPred[j + iStride[i]] = (uiPixelFilterT[j - i - 2] + (uiPixelFilterT[j - i - 1] << 1) + uiPixelFilterT[j - i] + 2) >> 2;
+    }
+  }
+}
+
+/*vertical left*/
+void WelsI8x8LumaPredVL_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) { // 8x8 luma vertical-left prediction; reads 16 samples pPred[0..15 - kiStride] unconditionally, bTRAvail is not read
+  // Top and Top-right available
+  int32_t iStride[8]; // iStride[i] == i * kiStride, the offset of row i
+  uint8_t uiPixelFilterT[16]; // (1,2,1)/4 rounded average of the 16 samples at pPred[x - kiStride]
+  int32_t i, j;
+
+  for (iStride[0] = 0, i = 1; i < 8; i++) {
+    iStride[i] = iStride[i - 1] + kiStride;
+  }
+
+  uiPixelFilterT[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-kiStride] << 1) + pPred[1 - kiStride] + 2) >> 2) : ((
+                        pPred[-kiStride] * 3 + pPred[1 - kiStride] + 2) >> 2); // when !bTLAvail the top-left sample is not read
+  for (i = 1; i < 15; i++) {
+    uiPixelFilterT[i] = ((pPred[i - 1 - kiStride] + (pPred[i - kiStride] << 1) + pPred[i + 1 - kiStride] + 2) >> 2);
+  }
+  uiPixelFilterT[15] = ((pPred[14 - kiStride] + pPred[15 - kiStride] * 3 + 2) >> 2); // nothing beyond x = 15 is read; sample 15 is weighted 3x
+
+  for (i = 0; i < 8; i++) { // y
+    if ((i & 0x01) == 0) { // 8-108: even rows use a 2-tap rounded average
+      for (j = 0; j < 8; j++) { // x
+        pPred[j + iStride[i]] = (uiPixelFilterT[j + (i >> 1)] + uiPixelFilterT[j + (i >> 1) + 1] + 1) >> 1;
+      }
+    } else {  // 8-109: odd rows use the 3-tap (1,2,1)/4 average
+      for (j = 0; j < 8; j++) { // x
+        pPred[j + iStride[i]] = (uiPixelFilterT[j + (i >> 1)] + (uiPixelFilterT[j + (i >> 1) + 1] << 1) + uiPixelFilterT[j +
+                                 (i >> 1) + 2] + 2) >> 2;
+      }
+    }
+  }
+}
+
+/*vertical left*/
+void WelsI8x8LumaPredVLTop_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) { // 8x8 luma vertical-left prediction that reads only pPred[0..7 - kiStride]; bTRAvail is not read
+  // Top available and Top-right unavailable
+  int32_t iStride[8]; // iStride[i] == i * kiStride, the offset of row i
+  uint8_t uiPixelFilterT[16]; // entries 0..7 are filtered top samples, entries 8..15 are copies of pPred[7 - kiStride]
+  int32_t i, j;
+
+  for (iStride[0] = 0, i = 1; i < 8; i++) {
+    iStride[i] = iStride[i - 1] + kiStride;
+  }
+
+  uiPixelFilterT[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-kiStride] << 1) + pPred[1 - kiStride] + 2) >> 2) : ((
+                        pPred[-kiStride] * 3 + pPred[1 - kiStride] + 2) >> 2); // when !bTLAvail the top-left sample is not read
+  for (i = 1; i < 7; i++) {
+    uiPixelFilterT[i] = ((pPred[i - 1 - kiStride] + (pPred[i - kiStride] << 1) + pPred[i + 1 - kiStride] + 2) >> 2);
+  }
+  // p[x, -1] x=8...15 are replaced with p[7, -1]
+  uiPixelFilterT[7] = ((pPred[6 - kiStride] + pPred[7 - kiStride] * 3 + 2) >> 2); // same as the (1,2,1) filter with sample 8 replaced by sample 7
+  for (i = 8; i < 16; i++) {
+    uiPixelFilterT[i] = pPred[7 - kiStride]; // unfiltered copy; filtering a run of equal samples would give the same value
+  }
+
+  for (i = 0; i < 8; i++) { // y
+    if ((i & 0x01) == 0) { // 8-108: even rows use a 2-tap rounded average
+      for (j = 0; j < 8; j++) { // x
+        pPred[j + iStride[i]] = (uiPixelFilterT[j + (i >> 1)] + uiPixelFilterT[j + (i >> 1) + 1] + 1) >> 1;
+      }
+    } else {  // 8-109: odd rows use the 3-tap (1,2,1)/4 average
+      for (j = 0; j < 8; j++) { // x
+        pPred[j + iStride[i]] = (uiPixelFilterT[j + (i >> 1)] + (uiPixelFilterT[j + (i >> 1) + 1] << 1) + uiPixelFilterT[j +
+                                 (i >> 1) + 2] + 2) >> 2;
+      }
+    }
+  }
+}
+
+/*vertical right*/
+void WelsI8x8LumaPredVR_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) { // 8x8 luma vertical-right prediction; reads the top-left, top and left samples unconditionally, bTLAvail is not read
+  // The TopLeft, Top, Left are always available under this mode
+  int32_t iStride[8]; // iStride[i] == i * kiStride, the offset of row i
+  uint8_t uiPixelFilterTL; // filtered corner sample pPred[-1 - kiStride]
+  uint8_t uiPixelFilterL[8]; // (1,2,1)/4 rounded average of the samples at pPred[-1 + iStride[y]]
+  uint8_t uiPixelFilterT[8]; // (1,2,1)/4 rounded average of the samples at pPred[x - kiStride]
+  int32_t i, j;
+  int32_t izVR, izVRDiv; // izVR = 2x - y selects the formula, izVRDiv = x - (y >> 1) is the top-row index
+
+  for (iStride[0] = 0, i = 1; i < 8; i++) {
+    iStride[i] = iStride[i - 1] + kiStride;
+  }
+
+  uiPixelFilterTL = (pPred[-1] + (pPred[-1 - kiStride] << 1) + pPred[-kiStride] + 2) >> 2; // corner weighted 2x between its left-column and top-row neighbours
+
+  uiPixelFilterL[0] = ((pPred[-1 - kiStride] + (pPred[-1] << 1) + pPred[-1 + iStride[1]] + 2) >> 2);
+  uiPixelFilterT[0] = ((pPred[-1 - kiStride] + (pPred[-kiStride] << 1) + pPred[1 - kiStride] + 2) >> 2);
+  for (i = 1; i < 7; i++) {
+    uiPixelFilterL[i] = ((pPred[-1 + iStride[i - 1]] + (pPred[-1 + iStride[i]] << 1) + pPred[-1 + iStride[i + 1]] + 2) >>
+                         2);
+    uiPixelFilterT[i] = ((pPred[i - 1 - kiStride] + (pPred[i - kiStride] << 1) + pPred[i + 1 - kiStride] + 2) >> 2);
+  }
+  uiPixelFilterL[7] = ((pPred[-1 + iStride[6]] + pPred[-1 + iStride[7]] * 3 + 2) >> 2); // only rows 6 and 7 are read; row 7 is weighted 3x
+  uiPixelFilterT[7] = bTRAvail ? ((pPred[6 - kiStride] + (pPred[7 - kiStride] << 1) + pPred[8 - kiStride] + 2) >> 2) : ((
+                        pPred[6 - kiStride] + pPred[7 - kiStride] * 3 + 2) >> 2); // when !bTRAvail pPred[8 - kiStride] is not read
+
+  for (i = 0; i < 8; i++) { // y
+    for (j = 0; j < 8; j++) { // x
+      izVR = (j << 1) - i; // 2 * x - y
+      izVRDiv = j - (i >> 1);
+      if (izVR >= 0) {
+        if ((izVR & 0x01) == 0) {  // 8-100: 2-tap average over the top row
+          if (izVRDiv > 0) {
+            pPred[j + iStride[i]] = (uiPixelFilterT[izVRDiv - 1] + uiPixelFilterT[izVRDiv] + 1) >> 1;
+          } else { // izVRDiv == 0: index -1 is the corner
+            pPred[j + iStride[i]] = (uiPixelFilterTL + uiPixelFilterT[0] + 1) >> 1;
+          }
+        } else { // 8-101: 3-tap average over the top row
+          if (izVRDiv > 1) {
+            pPred[j + iStride[i]] = (uiPixelFilterT[izVRDiv - 2] + (uiPixelFilterT[izVRDiv - 1] << 1) + uiPixelFilterT[izVRDiv] + 2)
+                                    >> 2;
+          } else { // izVRDiv == 1: index -1 is the corner
+            pPred[j + iStride[i]] = (uiPixelFilterTL + (uiPixelFilterT[0] << 1) + uiPixelFilterT[1] + 2) >> 2;
+          }
+        }
+      } else if (izVR == -1) { // 8-102: corner weighted 2x between L[0] and T[0]
+        pPred[j + iStride[i]] = (uiPixelFilterL[0] + (uiPixelFilterTL << 1) + uiPixelFilterT[0] + 2) >> 2;
+      } else if (izVR < -2) { // 8-103: 3-tap average over the left column
+        pPred[j + iStride[i]] = (uiPixelFilterL[-izVR - 1] + (uiPixelFilterL[-izVR - 2] << 1) + uiPixelFilterL[-izVR - 3] + 2)
+                                >> 2;
+      } else { // izVR==-2, 8-103, special case
+        pPred[j + iStride[i]] = (uiPixelFilterL[1] + (uiPixelFilterL[0] << 1) + uiPixelFilterTL + 2) >> 2; // index -izVR - 3 would be -1, so the corner stands in for it
+      }
+    }
+  }
+}
+
+/*horizontal up*/
+void WelsI8x8LumaPredHU_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) { // 8x8 luma horizontal-up prediction from the left column only; bTRAvail is not read
+  int32_t iStride[8]; // iStride[i] == i * kiStride, the offset of row i
+  uint8_t uiPixelFilterL[8]; // (1,2,1)/4 rounded average of the samples at pPred[-1 + iStride[y]]
+  int32_t i, j;
+  int32_t izHU; // x + 2y, selects the formula and the left-column index
+
+  for (iStride[0] = 0, i = 1; i < 8; i++) {
+    iStride[i] = iStride[i - 1] + kiStride;
+  }
+
+  uiPixelFilterL[0] = bTLAvail ? ((pPred[-1 - kiStride] + (pPred[-1] << 1) + pPred[-1 + iStride[1]] + 2) >> 2) : ((
+                        pPred[-1] * 3 + pPred[-1 + iStride[1]] + 2) >> 2); // when !bTLAvail the top-left sample is not read
+  for (i = 1; i < 7; i++) {
+    uiPixelFilterL[i] = ((pPred[-1 + iStride[i - 1]] + (pPred[-1 + iStride[i]] << 1) + pPred[-1 + iStride[i + 1]] + 2) >>
+                         2);
+  }
+  uiPixelFilterL[7] = ((pPred[-1 + iStride[6]] + pPred[-1 + iStride[7]] * 3 + 2) >> 2); // only rows 6 and 7 are read; row 7 is weighted 3x
+
+  for (i = 0; i < 8; i++) { // y
+    for (j = 0; j < 8; j++) { // x
+      izHU = j + (i << 1); // x + 2 * y
+      if (izHU < 13) {
+        if ((izHU & 0x01) == 0) {  // 8-110: 2-tap average, highest index reached is 7 (izHU == 12)
+          pPred[j + iStride[i]] = (uiPixelFilterL[izHU >> 1] + uiPixelFilterL[1 + (izHU >> 1)] + 1) >> 1;
+        } else { // 8-111: 3-tap average, highest index reached is 7 (izHU == 11)
+          pPred[j + iStride[i]] = (uiPixelFilterL[izHU >> 1] + (uiPixelFilterL[1 + (izHU >> 1)] << 1) + uiPixelFilterL[2 +
+                                   (izHU >> 1)] + 2) >> 2;
+        }
+      } else if (izHU == 13) { // 8-112: the 3-tap form would need index 8, so entry 7 is weighted 3x
+        pPred[j + iStride[i]] = (uiPixelFilterL[6] + 3 * uiPixelFilterL[7] + 2) >> 2;
+      } else { // 8-113: remaining positions copy the last filtered left sample
+        pPred[j + iStride[i]] = uiPixelFilterL[7];
+      }
+    }
+  }
+}
+
+/*horizontal down*/
+void WelsI8x8LumaPredHD_c (uint8_t* pPred, const int32_t kiStride, bool bTLAvail, bool bTRAvail) { // 8x8 luma horizontal-down prediction; reads the top-left, top and left samples unconditionally, bTLAvail is not read
+  // The TopLeft, Top, Left are all available under this mode
+  int32_t iStride[8]; // iStride[i] == i * kiStride, the offset of row i
+  uint8_t uiPixelFilterTL; // filtered corner sample pPred[-1 - kiStride]
+  uint8_t uiPixelFilterL[8]; // (1,2,1)/4 rounded average of the samples at pPred[-1 + iStride[y]]
+  uint8_t uiPixelFilterT[8]; // (1,2,1)/4 rounded average of the samples at pPred[x - kiStride]
+  int32_t i, j;
+  int32_t izHD, izHDDiv; // izHD = 2y - x selects the formula, izHDDiv = y - (x >> 1) is the left-column index
+
+  for (iStride[0] = 0, i = 1; i < 8; i++) {
+    iStride[i] = iStride[i - 1] + kiStride;
+  }
+
+  uiPixelFilterTL = (pPred[-1] + (pPred[-1 - kiStride] << 1) + pPred[-kiStride] + 2) >> 2; // corner weighted 2x between its left-column and top-row neighbours
+
+  uiPixelFilterL[0] = ((pPred[-1 - kiStride] + (pPred[-1] << 1) + pPred[-1 + iStride[1]] + 2) >> 2);
+  uiPixelFilterT[0] = ((pPred[-1 - kiStride] + (pPred[-kiStride] << 1) + pPred[1 - kiStride] + 2) >> 2);
+  for (i = 1; i < 7; i++) {
+    uiPixelFilterL[i] = ((pPred[-1 + iStride[i - 1]] + (pPred[-1 + iStride[i]] << 1) + pPred[-1 + iStride[i + 1]] + 2) >>
+                         2);
+    uiPixelFilterT[i] = ((pPred[i - 1 - kiStride] + (pPred[i - kiStride] << 1) + pPred[i + 1 - kiStride] + 2) >> 2);
+  }
+  uiPixelFilterL[7] = ((pPred[-1 + iStride[6]] + pPred[-1 + iStride[7]] * 3 + 2) >> 2); // only rows 6 and 7 are read; row 7 is weighted 3x
+  uiPixelFilterT[7] = bTRAvail ? ((pPred[6 - kiStride] + (pPred[7 - kiStride] << 1) + pPred[8 - kiStride] + 2) >> 2) : ((
+                        pPred[6 - kiStride] + pPred[7 - kiStride] * 3 + 2) >> 2); // when !bTRAvail pPred[8 - kiStride] is not read
+
+  for (i = 0; i < 8; i++) { // y
+    for (j = 0; j < 8; j++) { // x
+      izHD = (i << 1) - j; // 2*y - x
+      izHDDiv = i - (j >> 1);
+      if (izHD >= 0) {
+        if ((izHD & 0x01) == 0) {  // 8-104: 2-tap average over the left column
+          if (izHDDiv == 0) { // index -1 is the corner
+            pPred[j + iStride[i]] = (uiPixelFilterTL + uiPixelFilterL[0] + 1) >> 1;
+          } else {
+            pPred[j + iStride[i]] = (uiPixelFilterL[izHDDiv - 1] + uiPixelFilterL[izHDDiv] + 1) >> 1;
+          }
+        } else {  // 8-105: 3-tap average over the left column
+          if (izHDDiv == 1) { // index -1 is the corner
+            pPred[j + iStride[i]] = (uiPixelFilterTL + (uiPixelFilterL[0] << 1) + uiPixelFilterL[1] + 2) >> 2;
+          } else {
+            pPred[j + iStride[i]] = (uiPixelFilterL[izHDDiv - 2] + (uiPixelFilterL[izHDDiv - 1] << 1) + uiPixelFilterL[izHDDiv] + 2)
+                                    >> 2;
+          }
+        }
+      } else if (izHD == -1) { // 8-106: corner weighted 2x between L[0] and T[0]
+        pPred[j + iStride[i]] = (uiPixelFilterL[0] + (uiPixelFilterTL << 1) + uiPixelFilterT[0] + 2) >> 2;
+      } else if (izHD < -2) { // 8-107: 3-tap average over the top row
+        pPred[j + iStride[i]] = (uiPixelFilterT[-izHD - 1] + (uiPixelFilterT[-izHD - 2] << 1) + uiPixelFilterT[-izHD - 3] + 2)
+                                >> 2;
+      } else { // 8-107 special case, izHD==-2
+        pPred[j + iStride[i]] = (uiPixelFilterT[1] + (uiPixelFilterT[0] << 1) + uiPixelFilterTL + 2) >> 2; // index -izHD - 3 would be -1, so the corner stands in for it
+      }
+    }
+  }
+}
+
+
 void WelsIChromaPredV_c (uint8_t* pPred, const int32_t kiStride) {
   const uint64_t kuiVal64	= LD64A8 (&pPred[-kiStride]);
   const int32_t kiStride2	= kiStride  << 1;
--- a/codec/decoder/core/src/manage_dec_ref.cpp
+++ b/codec/decoder/core/src/manage_dec_ref.cpp
@@ -127,7 +127,7 @@
                               || (ERROR_CON_SLICE_COPY_CROSS_IDR_FREEZE_RES_CHANGE == pCtx->eErrorConMethod)
                               || (ERROR_CON_SLICE_MV_COPY_CROSS_IDR == pCtx->eErrorConMethod)
                               || (ERROR_CON_SLICE_MV_COPY_CROSS_IDR_FREEZE_RES_CHANGE == pCtx->eErrorConMethod))
-                              && (NULL != pCtx->pPreviousDecodedPictureInDpb);
+                             && (NULL != pCtx->pPreviousDecodedPictureInDpb);
         bCopyPrevious = bCopyPrevious && (pRef->iWidthInPixel == pCtx->pPreviousDecodedPictureInDpb->iWidthInPixel)
                         && (pRef->iHeightInPixel == pCtx->pPreviousDecodedPictureInDpb->iHeightInPixel);
 
--- a/codec/decoder/core/src/parse_mb_syn_cabac.cpp
+++ b/codec/decoder/core/src/parse_mb_syn_cabac.cpp
@@ -35,14 +35,15 @@
 #include "error_code.h"
 namespace WelsDec {
 #define IDX_UNUSED -1
-static const int16_t g_kMaxPos       [] = {IDX_UNUSED, 15, 14, 15, 3, 14, 3, 3, 14, 14};
-static const int16_t g_kMaxC2       [] = {IDX_UNUSED, 4, 4, 4, 3, 4, 3, 3, 4, 4};
-static const int16_t g_kBlockCat2CtxOffsetCBF[] = {IDX_UNUSED, 0, 4, 8, 12, 16, 12, 12, 16, 16};
-static const int16_t g_kBlockCat2CtxOffsetMap [] = {IDX_UNUSED, 0, 15, 29, 44, 47, 44, 44, 47, 47};
-static const int16_t g_kBlockCat2CtxOffsetLast[] = {IDX_UNUSED, 0, 15, 29, 44, 47, 44, 44, 47, 47};
-static const int16_t g_kBlockCat2CtxOffsetOne [] = {IDX_UNUSED, 0 , 10, 20, 30, 39, 30, 30, 39, 39};
-static const int16_t g_kBlockCat2CtxOffsetAbs [] = {IDX_UNUSED, 0 , 10, 20, 30, 39, 30, 30, 39, 39};
 
+static const int16_t g_kMaxPos       [] = {IDX_UNUSED, 15, 14, 15, 3, 14, 63, 3, 3, 14, 14}; // indexed by iResProperty; [6] is the entry this patch inserts
+static const int16_t g_kMaxC2       [] = {IDX_UNUSED, 4, 4, 4, 3, 4, 4, 3, 3, 4, 4};
+static const int16_t g_kBlockCat2CtxOffsetCBF[] = {IDX_UNUSED, 0, 4, 8, 12, 16, 0, 12, 12, 16, 16};
+static const int16_t g_kBlockCat2CtxOffsetMap [] = {IDX_UNUSED, 0, 15, 29, 44, 47, 0, 44, 44, 47, 47}; // NOTE(review): [6] presumably belongs to LUMA_DC_AC_8 - confirm
+static const int16_t g_kBlockCat2CtxOffsetLast[] = {IDX_UNUSED, 0, 15, 29, 44, 47, 0, 44, 44, 47, 47};
+static const int16_t g_kBlockCat2CtxOffsetOne [] = {IDX_UNUSED, 0 , 10, 20, 30, 39, 0, 30, 30, 39, 39};
+static const int16_t g_kBlockCat2CtxOffsetAbs [] = {IDX_UNUSED, 0 , 10, 20, 30, 39, 0, 30, 30, 39, 39};
+
 const uint8_t g_kTopBlkInsideMb[24] = { //for index with z-order 0~23
   //  0   1 | 4  5      luma 8*8 block           pNonZeroCount[16+8]
   0,  0,  1,  1,   //  2   3 | 6  7        0  |  1                  0   1   2   3
@@ -275,6 +276,24 @@
   }
   return ERR_NONE;
 }
+
+int32_t ParseTransformSize8x8FlagCabac (PWelsDecoderContext pCtx, PWelsNeighAvail pNeighAvail,
+                                        bool& bTransformSize8x8Flag) { // decodes one CABAC bin into bTransformSize8x8Flag
+  uint32_t uiCode;
+  int32_t iIdxA, iIdxB; // 1 when the left (A) / top (B) MB is available and has its pTransformSize8x8Flag set, else 0
+  int32_t iCtxInc;
+  PWelsCabacDecEngine pCabacDecEngine = pCtx->pCabacDecEngine;
+  PWelsCabacCtx pBinCtx = pCtx->pCabacCtx + NEW_CTX_OFFSET_TS_8x8_FLAG;
+  iIdxA = (pNeighAvail->iLeftAvail) && (pCtx->pCurDqLayer->pTransformSize8x8Flag[pCtx->pCurDqLayer->iMbXyIndex - 1]);
+  iIdxB = (pNeighAvail->iTopAvail)
+          && (pCtx->pCurDqLayer->pTransformSize8x8Flag[pCtx->pCurDqLayer->iMbXyIndex - pCtx->pCurDqLayer->iMbWidth]);
+  iCtxInc = iIdxA + iIdxB; // 0, 1 or 2: selects one of the contexts starting at pBinCtx
+  WELS_READ_VERIFY (DecodeBinCabac (pCabacDecEngine, pBinCtx + iCtxInc, uiCode)); // NOTE(review): macro presumably returns early on failure - confirm
+  bTransformSize8x8Flag = !!uiCode;
+
+  return ERR_NONE;
+}
+
 int32_t ParseSubMBTypeCabac (PWelsDecoderContext pCtx, PWelsNeighAvail pNeighAvail, uint32_t& uiSubMbType) {
   uint32_t uiCode;
   PWelsCabacDecEngine pCabacDecEngine = pCtx->pCabacDecEngine;
@@ -471,6 +490,9 @@
       pCurDqLayer->pSubMbType[iMbXy][i] = g_ksInterSubMbTypeInfo[uiSubMbType].iType;
       pSubPartCount[i] = g_ksInterSubMbTypeInfo[uiSubMbType].iPartCount;
       pPartW[i] = g_ksInterSubMbTypeInfo[uiSubMbType].iPartWidth;
+
+      // Need modification when B picture add in, reference to 7.3.5
+      pCurDqLayer->pNoSubMbPartSizeLessThan8x8Flag[iMbXy] &= (uiSubMbType == 0);
     }
 
     for (i = 0; i < 4; i++) {
@@ -721,7 +743,7 @@
   int32_t iCurrBlkXy = pCtx->pCurDqLayer->iMbXyIndex;
   int32_t iTopBlkXy = iCurrBlkXy - pCtx->pCurDqLayer->iMbWidth; //default value: MB neighboring
   int32_t iLeftBlkXy = iCurrBlkXy - 1; //default value: MB neighboring
-  uint8_t* pCbfDc = pCtx->pCurDqLayer->pCbfDc;
+  uint16_t* pCbfDc = pCtx->pCurDqLayer->pCbfDc;
   int16_t* pMbType = pCtx->pCurDqLayer->pMbType;
   int32_t iCtxInc;
   uiCbfBit = 0;
@@ -760,22 +782,30 @@
 int32_t ParseSignificantMapCabac (int32_t* pSignificantMap, int32_t iResProperty, PWelsDecoderContext pCtx,
                                   uint32_t& uiCoeffNum) {
   uint32_t uiCode;
-  PWelsCabacCtx pMapCtx  = pCtx->pCabacCtx + NEW_CTX_OFFSET_MAP + g_kBlockCat2CtxOffsetMap [iResProperty];
-  PWelsCabacCtx pLastCtx = pCtx->pCabacCtx + NEW_CTX_OFFSET_LAST + g_kBlockCat2CtxOffsetLast[iResProperty];
 
+  PWelsCabacCtx pMapCtx  = pCtx->pCabacCtx + (iResProperty == LUMA_DC_AC_8 ? NEW_CTX_OFFSET_MAP_8x8 : NEW_CTX_OFFSET_MAP)
+                           + g_kBlockCat2CtxOffsetMap [iResProperty];
+  PWelsCabacCtx pLastCtx = pCtx->pCabacCtx + (iResProperty == LUMA_DC_AC_8 ? NEW_CTX_OFFSET_LAST_8x8 :
+                           NEW_CTX_OFFSET_LAST) + g_kBlockCat2CtxOffsetLast[iResProperty];
+
+
   int32_t i;
   uiCoeffNum = 0;
   int32_t i0 = 0;
   int32_t i1 = g_kMaxPos[iResProperty];
 
+  int32_t iCtx;
+
   for (i = i0; i < i1; ++i) {
+    iCtx = (iResProperty == LUMA_DC_AC_8 ? g_kuiIdx2CtxSignificantCoeffFlag8x8[i] : i);
     //read significant
-    WELS_READ_VERIFY (DecodeBinCabac (pCtx->pCabacDecEngine, pMapCtx + i, uiCode));
+    WELS_READ_VERIFY (DecodeBinCabac (pCtx->pCabacDecEngine, pMapCtx + iCtx, uiCode));
     if (uiCode) {
       * (pSignificantMap++) = 1;
       ++ uiCoeffNum;
       //read last significant
-      WELS_READ_VERIFY (DecodeBinCabac (pCtx->pCabacDecEngine, pLastCtx + i, uiCode));
+      iCtx = (iResProperty == LUMA_DC_AC_8 ? g_kuiIdx2CtxLastSignificantCoeffFlag8x8[i] : i);
+      WELS_READ_VERIFY (DecodeBinCabac (pCtx->pCabacDecEngine, pLastCtx + iCtx, uiCode));
       if (uiCode) {
         memset (pSignificantMap, 0, (i1 - i) * sizeof (int32_t));
         return ERR_NONE;
@@ -796,8 +826,11 @@
 
 int32_t ParseSignificantCoeffCabac (int32_t* pSignificant, int32_t iResProperty, PWelsDecoderContext pCtx) {
   uint32_t uiCode;
-  PWelsCabacCtx pOneCtx = pCtx->pCabacCtx + NEW_CTX_OFFSET_ONE + g_kBlockCat2CtxOffsetOne[iResProperty];
-  PWelsCabacCtx pAbsCtx = pCtx->pCabacCtx + NEW_CTX_OFFSET_ABS + g_kBlockCat2CtxOffsetAbs[iResProperty];
+  PWelsCabacCtx pOneCtx = pCtx->pCabacCtx + (iResProperty == LUMA_DC_AC_8 ? NEW_CTX_OFFSET_ONE_8x8 : NEW_CTX_OFFSET_ONE) +
+                          g_kBlockCat2CtxOffsetOne[iResProperty];
+  PWelsCabacCtx pAbsCtx = pCtx->pCabacCtx + (iResProperty == LUMA_DC_AC_8 ? NEW_CTX_OFFSET_ABS_8x8 : NEW_CTX_OFFSET_ABS) +
+                          g_kBlockCat2CtxOffsetAbs[iResProperty];
+
   const int16_t iMaxType = g_kMaxC2[iResProperty];
   int32_t i = g_kMaxPos[iResProperty];
   int32_t* pCoff = pSignificant + i;
@@ -823,6 +856,46 @@
     }
     pCoff--;
   }
+  return ERR_NONE;
+}
+
+int32_t ParseResidualBlockCabac8x8 (PWelsNeighAvail pNeighAvail, uint8_t* pNonZeroCountCache, SBitStringAux* pBsAux,
+                                    int32_t iIndex, int32_t iMaxNumCoeff, const uint8_t* pScanTable, int32_t iResProperty,
+                                    short* sTCoeff, /*int mb_mode*/ uint8_t uiQp, PWelsDecoderContext pCtx) {
+  uint32_t uiTotalCoeffNum = 0;
+  uint32_t uiCbpBit;
+  int32_t pSignificantMap[64] = {0}; // filled as a significance map, then reused for the parsed levels in scan order
+  // Overview: CABAC-parse one 8x8 residual block and store its dequantized coefficients in sTCoeff.
+  int32_t iMbResProperty = 0;
+  GetMbResProperty (&iMbResProperty, &iResProperty, false);
+  const uint16_t* pDeQuantMul = (pCtx->bUseScalingList) ? pCtx->pDequant_coeff8x8[iMbResProperty - 6][uiQp] :
+                                g_kuiDequantCoeff8x8[uiQp];
+  // No coded-block flag is parsed here; the flag is forced to 1, so the parsing path below always runs.
+  uiCbpBit = 1; // for 8x8, MaxNumCoeff == 64 && uiCbpBit == 1
+  if (uiCbpBit) { //has coeff
+    WELS_READ_VERIFY (ParseSignificantMapCabac (pSignificantMap, iResProperty, pCtx, uiTotalCoeffNum));
+    WELS_READ_VERIFY (ParseSignificantCoeffCabac (pSignificantMap, iResProperty, pCtx));
+  }
+  // The 8x8 block spans four entries of the non-zero-count cache; each receives the total coefficient count.
+  pNonZeroCountCache[g_kCacheNzcScanIdx[iIndex]] =
+    pNonZeroCountCache[g_kCacheNzcScanIdx[iIndex + 1]] =
+      pNonZeroCountCache[g_kCacheNzcScanIdx[iIndex + 2]] =
+        pNonZeroCountCache[g_kCacheNzcScanIdx[iIndex + 3]] = (uint8_t)uiTotalCoeffNum;
+  if (uiTotalCoeffNum == 0) {
+    return ERR_NONE;
+  }
+  int32_t j = 0, i;
+  if (iResProperty == LUMA_DC_AC_8) {
+    do {
+      if (pSignificantMap[j] != 0) {
+        i = pScanTable[ j ]; // destination index of the j-th coefficient in scan order
+        sTCoeff[i] = uiQp >= 36 ? ((pSignificantMap[j] * pDeQuantMul[i]) * (1 << (uiQp / 6 - 6))) : ((
+                       pSignificantMap[j] * pDeQuantMul[i] + (1 << (5 - uiQp / 6))) >> (6 - uiQp / 6)); // scale by multiplying: "<<" on a negative product is undefined
+      }
+      ++j;
+    } while (j < 64);
+  }
+
   return ERR_NONE;
 }
 
--- a/codec/decoder/core/src/parse_mb_syn_cavlc.cpp
+++ b/codec/decoder/core/src/parse_mb_syn_cavlc.cpp
@@ -151,7 +151,7 @@
       pNonZeroCount[5 + 8 * 5] = -1;//unavailable
   }
 }
-void WelsFillCacheConstrain1Intra4x4 (PWelsNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode,
+void WelsFillCacheConstrain1IntraNxN (PWelsNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode,
                                       PDqLayer pCurLayer) { //no matter slice type
   int32_t iCurXy  = pCurLayer->iMbXyIndex;
   int32_t iTopXy  = 0;
@@ -197,7 +197,7 @@
   }
 }
 
-void WelsFillCacheConstrain0Intra4x4 (PWelsNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode,
+void WelsFillCacheConstrain0IntraNxN (PWelsNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode,
                                       PDqLayer pCurLayer) { //no matter slice type
   int32_t iCurXy  = pCurLayer->iMbXyIndex;
   int32_t iTopXy  = 0;
@@ -214,7 +214,7 @@
   }
 
   //intra4x4_pred_mode
-  if (pNeighAvail->iTopAvail && IS_INTRA4x4 (pNeighAvail->iTopType)) { //top
+  if (pNeighAvail->iTopAvail && IS_INTRANxN (pNeighAvail->iTopType)) { //top
     ST32 (pIntraPredMode + 1, LD32 (&pCurLayer->pIntraPredMode[iTopXy][0]));
   } else {
     int32_t iPred;
@@ -225,7 +225,7 @@
     ST32 (pIntraPredMode + 1, iPred);
   }
 
-  if (pNeighAvail->iLeftAvail && IS_INTRA4x4 (pNeighAvail->iLeftType)) { //left
+  if (pNeighAvail->iLeftAvail && IS_INTRANxN (pNeighAvail->iLeftType)) { //left
     pIntraPredMode[ 0 + 8 * 1] = pCurLayer->pIntraPredMode[iLeftXy][4];
     pIntraPredMode[ 0 + 8 * 2] = pCurLayer->pIntraPredMode[iLeftXy][5];
     pIntraPredMode[ 0 + 8 * 3] = pCurLayer->pIntraPredMode[iLeftXy][6];
@@ -565,12 +565,13 @@
   return 0;
 }
 
-int32_t CheckIntra4x4PredMode (int32_t* pSampleAvail, int8_t* pMode, int32_t iIndex) {
+int32_t CheckIntraNxNPredMode (int32_t* pSampleAvail, int8_t* pMode, int32_t iIndex, bool b8x8) {
   int8_t iIdx = g_kuiCache30ScanIdx[iIndex];
+
   int32_t iLeftAvail     = pSampleAvail[iIdx - 1];
   int32_t iTopAvail      = pSampleAvail[iIdx - 6];
   int32_t bLeftTopAvail  = pSampleAvail[iIdx - 7];
-  int32_t bRightTopAvail = pSampleAvail[iIdx - 5];
+  int32_t bRightTopAvail = pSampleAvail[iIdx - (b8x8 ? 4 : 5)];  // Diff with 4x4 Pred
 
   int8_t iFinalMode;
 
@@ -900,6 +901,93 @@
   return 0;
 }
 
+int32_t WelsResidualBlockCavlc8x8 (SVlcTable* pVlcTable, uint8_t* pNonZeroCountCache, PBitStringAux pBs, int32_t iIndex,
+                                   int32_t iMaxNumCoeff, const uint8_t* kpZigzagTable, int32_t iResidualProperty,
+                                   int16_t* pTCoeff, int32_t  iIdx4x4, uint8_t uiQp,
+                                   PWelsDecoderContext pCtx) {
+  int32_t iLevel[16], iZerosLeft, iCoeffNum;
+  int32_t  iRun[16];
+  int32_t iCurNonZeroCacheIdx, i;
+  // Overview: CAVLC-parse up to 16 levels and scatter them, dequantized, to every 4th scan slot (offset iIdx4x4) of pTCoeff.
+  int32_t iMbResProperty = 0;
+  GetMbResProperty (&iMbResProperty, &iResidualProperty, 1);
+
+  const uint16_t* kpDequantCoeff = pCtx->bUseScalingList ? pCtx->pDequant_coeff8x8[iMbResProperty - 6][uiQp] :
+                                   g_kuiDequantCoeff8x8[uiQp];
+
+  int8_t nA, nB, nC;
+  uint8_t uiTotalCoeff, uiTrailingOnes;
+  int32_t iUsedBits = 0;
+  intX_t iCurIdx   = pBs->iIndex;
+  uint8_t* pBuf     = ((uint8_t*)pBs->pStartBuf) + (iCurIdx >> 3);
+  bool  bChromaDc = (CHROMA_DC == iResidualProperty);
+  uint8_t bChroma   = (bChromaDc || CHROMA_AC == iResidualProperty);
+  SReadBitsCache sReadBitsCache;
+  // Prime a 32-bit big-endian cache with the four bytes at the current bit position, aligned to the current bit.
+  uint32_t uiCache32Bit = (uint32_t) ((((pBuf[0] << 8) | pBuf[1]) << 16) | (pBuf[2] << 8) | pBuf[3]);
+  sReadBitsCache.uiCache32Bit = uiCache32Bit << (iCurIdx & 0x07);
+  sReadBitsCache.uiRemainBits = 32 - (iCurIdx & 0x07);
+  sReadBitsCache.pBuf = pBuf;
+  // Read the left (nA) and top (nB) neighbour counts from the non-zero-count cache.
+
+  if (bChroma) {
+    iCurNonZeroCacheIdx = g_kuiCache48CountScan4Idx[iIndex];
+    nA = pNonZeroCountCache[iCurNonZeroCacheIdx - 1];
+    nB = pNonZeroCountCache[iCurNonZeroCacheIdx - 8];
+  } else { //luma
+    iCurNonZeroCacheIdx = g_kuiCache48CountScan4Idx[iIndex]; // NOTE(review): same statements as the chroma branch above
+    nA = pNonZeroCountCache[iCurNonZeroCacheIdx - 1];
+    nB = pNonZeroCountCache[iCurNonZeroCacheIdx - 8];
+  }
+
+  WELS_NON_ZERO_COUNT_AVERAGE (nC, nA, nB);
+
+  iUsedBits += CavlcGetTrailingOnesAndTotalCoeff (uiTotalCoeff, uiTrailingOnes, &sReadBitsCache, pVlcTable, bChromaDc,
+               nC);
+
+  if (iResidualProperty != CHROMA_DC && iResidualProperty != I16_LUMA_DC) {
+    pNonZeroCountCache[iCurNonZeroCacheIdx] = uiTotalCoeff;
+    // CHROMA_DC and I16_LUMA_DC are the two categories that leave the count cache untouched.
+  }
+  if (0 == uiTotalCoeff) {
+    pBs->iIndex += iUsedBits;
+    return 0;
+  }
+  if ((uiTrailingOnes > 3) || (uiTotalCoeff > 16)) { /////////////////check uiTrailingOnes and uiTotalCoeff
+    return ERR_INFO_CAVLC_INVALID_TOTAL_COEFF_OR_TRAILING_ONES;
+  }
+  if ((i = CavlcGetLevelVal (iLevel, &sReadBitsCache, uiTotalCoeff, uiTrailingOnes)) == -1) {
+    return ERR_INFO_CAVLC_INVALID_LEVEL;
+  }
+  iUsedBits += i;
+  if (uiTotalCoeff < iMaxNumCoeff) {
+    iUsedBits += CavlcGetTotalZeros (iZerosLeft, &sReadBitsCache, uiTotalCoeff, pVlcTable, bChromaDc);
+  } else {
+    iZerosLeft = 0;
+  }
+
+  if ((iZerosLeft < 0) || ((iZerosLeft + uiTotalCoeff) > iMaxNumCoeff)) {
+    return ERR_INFO_CAVLC_INVALID_ZERO_LEFT;
+  }
+  if ((i = CavlcGetRunBefore (iRun, &sReadBitsCache, uiTotalCoeff, pVlcTable, iZerosLeft)) == -1) {
+    return ERR_INFO_CAVLC_INVALID_RUN_BEFORE;
+  }
+  iUsedBits += i;
+  pBs->iIndex += iUsedBits;
+  iCoeffNum = -1;
+  // iCoeffNum accumulates the runs into each level's position within this call's sequence.
+  for (i = uiTotalCoeff - 1; i >= 0; --i) { //FIXME merge into  rundecode?
+    int32_t j;
+    iCoeffNum += iRun[i] + 1; //FIXME add 1 earlier ?
+    j = (iCoeffNum << 2) + iIdx4x4; // interleave: this call owns scan slots iIdx4x4, iIdx4x4 + 4, ...
+    j          = kpZigzagTable[ j ];
+    pTCoeff[j] = uiQp >= 36 ? ((iLevel[i] * kpDequantCoeff[j]) * (1 << (uiQp / 6 - 6))) // multiply: "<<" on a negative product is undefined
+                 : ((iLevel[i] * kpDequantCoeff[j] + (1 << (5 - uiQp / 6))) >> (6 - uiQp / 6));
+  }
+
+  return 0;
+}
+
 int32_t ParseInterInfo (PWelsDecoderContext pCtx, int16_t iMvArray[LIST_A][30][MV_A], int8_t iRefIdxArray[LIST_A][30],
                         PBitStringAux pBs) {
   PSlice pSlice				= &pCtx->pCurDqLayer->sLayerInfo.sSliceInLayer;
@@ -941,7 +1029,8 @@
           return ERR_INFO_INVALID_REF_INDEX;
         }
       }
-      pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || !(ppRefPic[iRefIdx]&&ppRefPic[iRefIdx]->bIsComplete);
+      pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[iRefIdx]
+                              && ppRefPic[iRefIdx]->bIsComplete);
     } else {
       WelsLog (& (pCtx->sLogCtx), WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. ");
       return GENERATE_ERROR_NO (ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
@@ -981,7 +1070,8 @@
           return ERR_INFO_INVALID_REF_INDEX;
         }
       }
-      pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || !(ppRefPic[iRefIdx[i]]&&ppRefPic[iRefIdx[i]]->bIsComplete);
+      pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[iRefIdx[i]]
+                              && ppRefPic[iRefIdx[i]]->bIsComplete);
     }
     for (i = 0; i < 2; i++) {
       PredInter16x8Mv (iMvArray, iRefIdxArray, i << 3, iRefIdx[i], iMv);
@@ -1017,7 +1107,8 @@
             return ERR_INFO_INVALID_REF_INDEX;
           }
         }
-        pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || !(ppRefPic[iRefIdx[i]]&&ppRefPic[iRefIdx[i]]->bIsComplete);
+        pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[iRefIdx[i]]
+                                && ppRefPic[iRefIdx[i]]->bIsComplete);
       } else {
         WelsLog (& (pCtx->sLogCtx), WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. ");
         return GENERATE_ERROR_NO (ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
@@ -1056,6 +1147,9 @@
       pCurDqLayer->pSubMbType[iMbXy][i] = g_ksInterSubMbTypeInfo[uiSubMbType].iType;
       iSubPartCount[i] = g_ksInterSubMbTypeInfo[uiSubMbType].iPartCount;
       iPartWidth[i] = g_ksInterSubMbTypeInfo[uiSubMbType].iPartWidth;
+
+      // Need modification when B picture add in, reference to 7.3.5
+      pCurDqLayer->pNoSubMbPartSizeLessThan8x8Flag[iMbXy] &= (uiSubMbType == 0);
     }
 
     if (pSlice->sSliceHeaderExt.bAdaptiveMotionPredFlag) {
@@ -1085,7 +1179,8 @@
               return ERR_INFO_INVALID_REF_INDEX;
             }
           }
-          pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || !(ppRefPic[iRefIdx[i]]&&ppRefPic[iRefIdx[i]]->bIsComplete);
+          pCtx->bMbRefConcealed = pCtx->bRPLRError || pCtx->bMbRefConcealed || ! (ppRefPic[iRefIdx[i]]
+                                  && ppRefPic[iRefIdx[i]]->bIsComplete);
 
           pCurDqLayer->pRefIndex[0][iMbXy][uiScan4Idx  ] = pCurDqLayer->pRefIndex[0][iMbXy][uiScan4Idx + 1] =
                 pCurDqLayer->pRefIndex[0][iMbXy][uiScan4Idx + 4] = pCurDqLayer->pRefIndex[0][iMbXy][uiScan4Idx + 5] = iRefIdx[i];
--- a/codec/decoder/core/src/rec_mb.cpp
+++ b/codec/decoder/core/src/rec_mb.cpp
@@ -61,11 +61,65 @@
   }
 }
 
+int32_t RecI8x8Mb (int32_t iMbXy, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer) { // reconstructs one I8x8 macroblock
+  RecI8x8Luma (iMbXy, pCtx, pScoeffLevel, pDqLayer); // luma: four 8x8 blocks
+  RecI4x4Chroma (iMbXy, pCtx, pScoeffLevel, pDqLayer); // chroma: same routine RecI4x4Mb calls
+  return ERR_NONE; // the return values of the two calls above are not inspected
+}
+
+int32_t RecI8x8Luma (int32_t iMbXy, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer) {
+  /*****get local variable from outer variable********/
+  /*prediction info*/
+  uint8_t* pPred = pDqLayer->pPred[0];
+  // Overview: for each of the four 8x8 luma blocks, run its intra predictor, then the IDCT/add routine if it has coefficients.
+  int32_t iLumaStride = pDqLayer->iLumaStride;
+  int32_t* pBlockOffset = pCtx->iDecBlockOffsetArray;
+  PGetIntraPred8x8Func* pGetI8x8LumaPredFunc = pCtx->pGetI8x8LumaPredFunc;
+
+  int8_t* pIntra8x8PredMode = pDqLayer->pIntra4x4FinalMode[iMbXy]; // I_NxN
+  int16_t* pRS = pScoeffLevel;
+  /*itransform info*/
+  PIdctResAddPredFunc	pIdctResAddPredFunc = pCtx->pIdctResAddPredFunc8x8;
+
+  /*************local variable********************/
+  uint8_t i = 0;
+  bool bTLAvail[4], bTRAvail[4]; // per 8x8 block: top-left / top-right availability handed to the predictor
+  // pIntraNxNAvailFlag bits as used below: Top-Right (0x08) : Left (0x04) : Top-Left (0x02) : Top (0x01)
+  bTLAvail[0] = !! (pDqLayer->pIntraNxNAvailFlag[iMbXy] & 0x02);
+  bTLAvail[1] = !! (pDqLayer->pIntraNxNAvailFlag[iMbXy] & 0x01);
+  bTLAvail[2] = !! (pDqLayer->pIntraNxNAvailFlag[iMbXy] & 0x04);
+  bTLAvail[3] = true; // NOTE(review): presumably because block 3's top-left sample lies inside this MB - confirm
+  // Top-right flags: blocks 0 and 1 depend on the Top / Top-Right bits, block 2 is always true, block 3 always false.
+  bTRAvail[0] = !! (pDqLayer->pIntraNxNAvailFlag[iMbXy] & 0x01);
+  bTRAvail[1] = !! (pDqLayer->pIntraNxNAvailFlag[iMbXy] & 0x08);
+  bTRAvail[2] = true;
+  bTRAvail[3] = false;
+
+  /*************real process*********************/
+  for (i = 0; i < 4; i++) {
+    // i << 2 (0, 4, 8, 12) indexes the per-4x4 tables; presumably the first 4x4 block of 8x8 block i - TODO confirm
+    uint8_t* pPredI8x8 = pPred + pBlockOffset[i << 2];
+    uint8_t uiMode = pIntra8x8PredMode[g_kuiScan4[i << 2]];
+
+    pGetI8x8LumaPredFunc[uiMode] (pPredI8x8, iLumaStride, bTLAvail[i], bTRAvail[i]);
+    // Run the IDCT/add step only if one of the four count entries checked below is non-zero.
+    int32_t iIndex = g_kuiMbCountScan4Idx[i << 2];
+    if (pDqLayer->pNzc[iMbXy][iIndex] || pDqLayer->pNzc[iMbXy][iIndex + 1] || pDqLayer->pNzc[iMbXy][iIndex + 4]
+        || pDqLayer->pNzc[iMbXy][iIndex + 5]) {
+      int16_t* pRSI8x8 = &pRS[i << 6]; // 64 coefficients per 8x8 block
+      pIdctResAddPredFunc (pPredI8x8, iLumaStride, pRSI8x8);
+    }
+  }
+
+  return ERR_NONE;
+}
+
 int32_t RecI4x4Mb (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer) {
   RecI4x4Luma (iMBXY, pCtx, pScoeffLevel, pDqLayer);
   RecI4x4Chroma (iMBXY, pCtx, pScoeffLevel, pDqLayer);
   return ERR_NONE;
 }
+
 
 int32_t RecI4x4Luma (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t* pScoeffLevel, PDqLayer pDqLayer) {
   /*****get local variable from outer variable********/
--- a/test/decoder/DecUT_DeblockCommon.cpp
+++ b/test/decoder/DecUT_DeblockCommon.cpp
@@ -721,6 +721,10 @@
   sDqLayer.iMbY = 0; //Only for test easy
   sDqLayer.iMbXyIndex = 1;  // this function has NO iMbXyIndex validation
 
+  bool bTSize8x8Flag[50] = {false};
+  sDqLayer.pTransformSize8x8Flag = bTSize8x8Flag;
+  sDqLayer.pTransformSize8x8Flag[sDqLayer.iMbXyIndex] = false;
+
 #define UT_DB_LUMA_TEST(iFlag, iQP, iV0, iV1, iV2) \
   iBoundryFlag = iFlag; \
   memset(iLumaQP, iQP, sizeof(int8_t)*50); \
@@ -777,6 +781,10 @@
   sDqLayer.pMv[0] = (int16_t (*) [16][2])&iLayerMv[0];
   sDqLayer.pMv[1] = (int16_t (*) [16][2])&iLayerMv[1];
 
+  bool bTSize8x8Flag[50] = {false};
+  sDqLayer.pTransformSize8x8Flag = bTSize8x8Flag;
+  memset (bTSize8x8Flag, 0, sizeof (bool) * 50);
+
 #define UT_DB_CLEAN_STATUS \
   memset(iNoZeroCount, 0, sizeof(int8_t)*24*2); \
   memset(iLayerRefIndex, 0, sizeof(int8_t)*2*16*2); \
@@ -883,6 +891,10 @@
   sDqLayer.iMbXyIndex = 1;
   sDqLayer.iMbWidth = 1;
 
+  bool bTSize8x8Flag[50] = {false};
+  sDqLayer.pTransformSize8x8Flag = bTSize8x8Flag;
+  memset (bTSize8x8Flag, 0, sizeof (bool) * 50);
+
   uint8_t iY[50] = {0};
   sFilter.pCsData[0] = iY;
   sFilter.iCsStride[0] = 4;
@@ -922,12 +934,12 @@
   EXPECT_TRUE(iCb[2<<1]==iChromaV1 && iCr[2<<1]==iChromaV1)<<iQP<<" "<<sDqLayer.pMbType[1]; \
   EXPECT_TRUE(iCb[(2<<1)*sFilter.iCsStride[1]]==iChromaV2 && iCr[(2<<1)*sFilter.iCsStride[1]]==iChromaV2)<<iQP<<" "<<sDqLayer.pMbType[1];
 
-  // QP>16, LEFT & TOP, Intra mode MB_TYPE_INTRA4x4 
+  // QP>16, LEFT & TOP, Intra mode MB_TYPE_INTRA4x4
   iQP = 16 + rand() % 35;
   sDqLayer.pMbType[1] = MB_TYPE_INTRA4x4;
   UT_DB_MACROBLOCK_TEST (0x03, iQP, 2, 1, 1, 2, 1, 1)
 
-  // QP>16, LEFT & TOP, Intra mode MB_TYPE_INTRA16x16 
+  // QP>16, LEFT & TOP, Intra mode MB_TYPE_INTRA16x16
   iQP = 16 + rand() % 35;
   sDqLayer.pMbType[1] = MB_TYPE_INTRA16x16;
   UT_DB_MACROBLOCK_TEST (0x03, iQP, 2, 1, 1, 2, 1, 1)
@@ -934,7 +946,7 @@
 
   // MbType==0x03, Intra8x8 has not been supported now.
 
-  // QP>16, LEFT & TOP, Intra mode MB_TYPE_INTRA_PCM 
+  // QP>16, LEFT & TOP, Intra mode MB_TYPE_INTRA_PCM
   iQP = 16 + rand() % 35;
   sDqLayer.pMbType[1] = MB_TYPE_INTRA_PCM;
   UT_DB_MACROBLOCK_TEST (0x03, iQP, 2, 1, 1, 2, 1, 1)