shithub: openh264

Download patch

ref: 906dacd34972e42819dab320960ffcfb7b84aada
parent: 9e2abda78f0fc0e6a4c2a7a3e2e3067404acdade
author: gxw <guxiwei-hf@loongson.cn>
date: Fri Aug 10 13:46:41 EDT 2018

Add optimization files in codec/encoder/core/mips

Add dct_mmi.c, quant_mmi.c and score_mmi.c in codec/encoder/core/mips

Change-Id: I5955558968d68c1ff62ec5788c6a1a5f5c9bb28f

--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -64,7 +64,11 @@
 OBJS += $(COMMON_OBJSARM64)
 
 COMMON_ASM_MIPS_SRCS=\
+	$(COMMON_SRCDIR)/mips/copy_mb_mmi.c\
 	$(COMMON_SRCDIR)/mips/deblock_mmi.c\
+	$(COMMON_SRCDIR)/mips/expand_picture_mmi.c\
+	$(COMMON_SRCDIR)/mips/intra_pred_com_mmi.c\
+	$(COMMON_SRCDIR)/mips/satd_sad_mmi.c\
 
 COMMON_OBJSMIPS += $(COMMON_ASM_MIPS_SRCS:.c=.$(OBJ))
 ifeq ($(ASM_ARCH), mips)
--- a/codec/encoder/core/inc/decode_mb_aux.h
+++ b/codec/encoder/core/inc/decode_mb_aux.h
@@ -95,6 +95,11 @@
                                  int16_t* pDctDc);
 #endif
 
+#if defined(HAVE_MMI)//Loongson MMI versions; the definitions are in codec/encoder/core/mips/dct_mmi.c
+void WelsIDctT4Rec_mmi (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
+void WelsIDctFourT4Rec_mmi (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
+void WelsIDctRecI16x16Dc_mmi (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDctDc);
+#endif//HAVE_MMI
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -147,6 +147,33 @@
 void WelsQuantFour4x4_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
 void WelsQuantFour4x4Max_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
 #endif
+
+#ifdef HAVE_MMI
+int32_t WelsGetNoneZeroCount_mmi (int16_t* pLevel);
+
+/****************************************************************************
+ * Scan and Score functions
+ ****************************************************************************/
+void WelsScan4x4Ac_mmi (int16_t* zig_value, int16_t* pDct);
+void WelsScan4x4DcAc_mmi (int16_t* pLevel, int16_t* pDct);
+int32_t WelsCalculateSingleCtr4x4_mmi (int16_t* pDct);
+
+/****************************************************************************
+ * DCT functions (defined in codec/encoder/core/mips/dct_mmi.c)
+ ****************************************************************************/
+void WelsDctT4_mmi (int16_t* pDct,  uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
+void WelsDctFourT4_mmi (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
+
+/****************************************************************************
+ * Hadamard (HDM) and Quant functions
+ ****************************************************************************/
+void WelsHadamardT4Dc_mmi (int16_t* pLumaDc, int16_t* pDct);
+
+void WelsQuant4x4_mmi (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
+void WelsQuant4x4Dc_mmi (int16_t* pDct, int16_t iFF, int16_t iMF);
+void WelsQuantFour4x4_mmi (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
+void WelsQuantFour4x4Max_mmi (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
+#endif//HAVE_MMI
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
--- /dev/null
+++ b/codec/encoder/core/mips/dct_mmi.c
@@ -1,0 +1,529 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2018, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file    dct_mmi.c
+ *
+ * \brief   Loongson optimization
+ *
+ * \date    20/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+/* Load 64 bytes from r0 with four gslqc1 quad-word loads (offsets 0x0, 0x10, 0x20, 0x30), then regroup with two MMI_XSawp_DQ steps (macro defined outside this file). */
+#define MMI_Load4x8p(r0, f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
+  "gslqc1     "#f2", "#f0", 0x0("#r0")        \n\t" \
+  "gslqc1     "#f10", "#f8", 0x10("#r0")      \n\t" \
+  "gslqc1     "#f18", "#f16", 0x20("#r0")     \n\t" \
+  "gslqc1     "#f6", "#f4", 0x30("#r0")       \n\t" \
+  MMI_XSawp_DQ(f8, f10, f4, f6, f12, f14)           \
+  MMI_XSawp_DQ(f0, f2, f16, f18, f4, f6)
+/* Packed 16-bit, a = f0:f2, b = f4:f6, s = f16: leaves f0:f2 = a + (b >> s) and f12:f14 = (a >> s) - b. f8:f10 keep a copy of b; f4:f6 end as b >> s. */
+#define MMI_SumSubDiv2(f0, f2, f4, f6, f8, f10, f12, f14, f16) \
+  "mov.d      "#f8", "#f4"                    \n\t" \
+  "mov.d      "#f10", "#f6"                   \n\t" \
+  "psrah      "#f4", "#f4", "#f16"            \n\t" \
+  "psrah      "#f6", "#f6", "#f16"            \n\t" \
+  "psrah      "#f12", "#f0", "#f16"           \n\t" \
+  "psrah      "#f14", "#f2", "#f16"           \n\t" \
+  "paddh      "#f0", "#f0", "#f4"             \n\t" \
+  "paddh      "#f2", "#f2", "#f6"             \n\t" \
+  "psubh      "#f12", "#f12", "#f8"           \n\t" \
+  "psubh      "#f14", "#f14", "#f10"          \n\t"
+/* Three MMI_SumSub steps (macro defined outside this file) around one MMI_SumSubDiv2; f28 is the shift count handed to MMI_SumSubDiv2. Callers here run it twice with MMI_TransTwo4x4H in between. */
+#define MMI_IDCT(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22, f24, f26, f28) \
+  MMI_SumSub(f24, f26, f4, f6, f20, f22)                        \
+  MMI_SumSubDiv2(f0, f2, f8, f10, f16, f18, f12, f14, f28)      \
+  MMI_SumSub(f4, f6, f0, f2, f16, f18)                          \
+  MMI_SumSub(f24, f26, f12, f14, f16, f18)
+/* f0:f2 = (f0:f2 + f8) >> f14 per halfword; add to 8 bytes loaded unaligned from r1 (widened with f12, which callers zero), saturate, pack to unsigned bytes, store 8 bytes at r0. f4/f6 are scratch. */
+#define MMI_StoreDiff8p_6(f0, f2, f4, f6, f8, f12, r0, r1, f14) \
+  "paddh      "#f0", "#f0", "#f8"             \n\t" \
+  "paddh      "#f2", "#f2", "#f8"             \n\t" \
+  "psrah      "#f0", "#f0", "#f14"            \n\t" \
+  "psrah      "#f2", "#f2", "#f14"            \n\t" \
+  "gsldlc1    "#f4", 0x7("#r1")               \n\t" \
+  "gsldrc1    "#f4", 0x0("#r1")               \n\t" \
+  "punpckhbh  "#f6", "#f4", "#f12"            \n\t" \
+  "punpcklbh  "#f4", "#f4", "#f12"            \n\t" \
+  "paddsh     "#f4", "#f4", "#f0"             \n\t" \
+  "paddsh     "#f6", "#f6", "#f2"             \n\t" \
+  "packushb   "#f4", "#f4", "#f6"             \n\t" \
+  "gssdlc1    "#f4", 0x7("#r0")               \n\t" \
+  "gssdrc1    "#f4", 0x0("#r0")               \n\t"
+/* Like MMI_StoreDiff8p_6 but without the round/shift and at a byte offset: 8 bytes from offset(r1), widened with f8 (callers pass zero), plus f0 (low 4) / f2 (high 4), saturated, stored at offset(r0). */
+#define MMI_StoreDiff8p_5(f0, f2, f4, f6, f8, r0, r1, offset) \
+  "gsldlc1    "#f4", "#offset"+0x7("#r1")     \n\t" \
+  "gsldrc1    "#f4", "#offset"+0x0("#r1")     \n\t" \
+  "punpckhbh  "#f6", "#f4", "#f8"             \n\t" \
+  "punpcklbh  "#f4", "#f4", "#f8"             \n\t" \
+  "paddsh     "#f4", "#f4", "#f0"             \n\t" \
+  "paddsh     "#f6", "#f6", "#f2"             \n\t" \
+  "packushb   "#f4", "#f4", "#f6"             \n\t" \
+  "gssdlc1    "#f4", "#offset"+0x7("#r0")     \n\t" \
+  "gssdrc1    "#f4", "#offset"+0x0("#r0")     \n\t"
+/* Load eight int16 values from offset(r0), turn each into (v + f16) >> f20, then broadcast: each of f0..f14 ends up holding one of the eight values in all four halfword lanes. */
+#define MMI_Load8DC(f0, f2, f4, f6, f8, f10, f12, f14, f16, r0, offset, f20) \
+  "gslqc1     "#f2", "#f0", "#offset"+0x0("#r0") \n\t" \
+  "paddh      "#f0", "#f0", "#f16"               \n\t" \
+  "paddh      "#f2", "#f2", "#f16"               \n\t" \
+  "psrah      "#f0", "#f0", "#f20"               \n\t" \
+  "psrah      "#f2", "#f2", "#f20"               \n\t" \
+  "punpckhhw  "#f4", "#f0", "#f0"                \n\t" \
+  "punpckhwd  "#f6", "#f4", "#f4"                \n\t" \
+  "punpcklwd  "#f4", "#f4", "#f4"                \n\t" \
+  "punpcklhw  "#f8", "#f2", "#f2"                \n\t" \
+  "punpckhwd  "#f10", "#f8", "#f8"               \n\t" \
+  "punpcklwd  "#f8", "#f8", "#f8"                \n\t" \
+  "punpckhhw  "#f12", "#f2", "#f2"               \n\t" \
+  "punpckhwd  "#f14", "#f12", "#f12"             \n\t" \
+  "punpcklwd  "#f12", "#f12", "#f12"             \n\t" \
+  "punpcklhw  "#f0", "#f0", "#f0"                \n\t" \
+  "punpckhwd  "#f2", "#f0", "#f0"                \n\t" \
+  "punpcklwd  "#f0", "#f0", "#f0"                \n\t"
+/* Two rows of 16 bytes: per row, MMI_StoreDiff8p_5 at offset 0x0 (adds f0:f2) and 0x8 (adds f4:f6). r0/r1 advance by r2/r3 once, between the rows, so callers step the pointers themselves before the next call. */
+#define MMI_StoreDiff4x8p(f0, f2, f4, f6, f8, f10, f12, r0, r1, r2, r3) \
+  MMI_StoreDiff8p_5(f0, f2, f8, f10, f12, r0, r1, 0x0)         \
+  MMI_StoreDiff8p_5(f4, f6, f8, f10, f12, r0, r1, 0x8)         \
+  PTR_ADDU   ""#r0", "#r0", "#r2"                        \n\t" \
+  PTR_ADDU   ""#r1", "#r1", "#r3"                        \n\t" \
+  MMI_StoreDiff8p_5(f0, f2, f8, f10, f12, r0, r1, 0x0)         \
+  MMI_StoreDiff8p_5(f4, f6, f8, f10, f12, r0, r1, 0x8)
+/* Gather four int16 values at byte offsets offset + {0x0, 0x20, 0x80, 0xa0} from r0 (lh into $8, then dmtc1) and pair them as 32-bit words: f0 = {1st, 2nd}, f2 = {3rd, 4th}. f4/f6/f8 are scratch. */
+#define MMI_Load4Col(f0, f2, f4, f6, f8, r0, offset) \
+  "lh         $8, "#offset"("#r0")        \n\t" \
+  "dmtc1      $8, "#f0"                   \n\t" \
+  "lh         $8, "#offset"+0x20("#r0")   \n\t" \
+  "dmtc1      $8, "#f4"                   \n\t" \
+  "punpcklwd  "#f0", "#f0", "#f4"         \n\t" \
+  "lh         $8, "#offset"+0x80("#r0")   \n\t" \
+  "dmtc1      $8, "#f6"                   \n\t" \
+  "lh         $8, "#offset"+0xa0("#r0")   \n\t" \
+  "dmtc1      $8, "#f8"                   \n\t" \
+  "punpcklwd  "#f2", "#f6", "#f8"         \n\t"
+/* Packed 32-bit butterfly, a = f0:f2, b = f4:f6 on entry: leaves f4:f6 = a + b and f0:f2 = a - b. f8:f10 are scratch (they receive a copy of b). */
+#define MMI_SumSubD(f0, f2, f4, f6, f8, f10) \
+  "mov.d      "#f8", "#f4"                \n\t" \
+  "mov.d      "#f10", "#f6"               \n\t" \
+  "paddw      "#f4", "#f4", "#f0"         \n\t" \
+  "paddw      "#f6", "#f6", "#f2"         \n\t" \
+  "psubw      "#f0", "#f0", "#f8"         \n\t" \
+  "psubw      "#f2", "#f2", "#f10"        \n\t"
+/* Set f0 and f2 to all-ones (pcmpeqh of a register with itself), then shift every 32-bit word right logically by f_val_31; with a count of 31 each word becomes 1. */
+#define WELS_DD1(f0, f2, f_val_31) \
+  "pcmpeqh    "#f0", "#f0", "#f0"         \n\t" \
+  "pcmpeqh    "#f2", "#f2", "#f2"         \n\t" \
+  "psrlw      "#f0", "#f0", "#f_val_31"   \n\t" \
+  "psrlw      "#f2", "#f2", "#f_val_31"   \n\t"
+/* Packed 32-bit: f0:f2 = (f0:f2 + f4:f6 + f8:f10) >> f_val_1 (arithmetic), then f12:f14 = that result - f4:f6. The caller in this file passes the WELS_DD1 ones in f8:f10 as the rounding term. */
+#define MMI_SumSubDiv2D(f0, f2, f4, f6, f8, f10, f12, f14, f_val_1) \
+  "paddw      "#f0", "#f0", "#f4"         \n\t" \
+  "paddw      "#f2", "#f2", "#f6"         \n\t" \
+  "paddw      "#f0", "#f0", "#f8"         \n\t" \
+  "paddw      "#f2", "#f2", "#f10"        \n\t" \
+  "psraw      "#f0", "#f0", "#f_val_1"    \n\t" \
+  "psraw      "#f2", "#f2", "#f_val_1"    \n\t" \
+  "mov.d      "#f12", "#f0"               \n\t" \
+  "mov.d      "#f14", "#f2"               \n\t" \
+  "psubw      "#f12", "#f12", "#f4"       \n\t" \
+  "psubw      "#f14", "#f14", "#f6"       \n\t"
+/* Two MMI_XSawp_WD steps followed by two MMI_XSawp_DQ steps (both macros are defined outside this file). By its name a 4x4 transpose of 32-bit words - TODO confirm the output register order there. */
+#define MMI_Trans4x4W(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
+  MMI_XSawp_WD(f0, f2, f4, f6, f16, f18)  \
+  MMI_XSawp_WD(f8, f10, f12, f14, f4, f6) \
+  MMI_XSawp_DQ(f0, f2, f8, f10, f12, f14) \
+  MMI_XSawp_DQ(f16, f18, f4, f6, f8, f10)
+/* Packed 16-bit, a = f0:f2, b = f4:f6 on entry: leaves f0:f2 = 2a + b and f8:f10 = a - 2b; b (f4:f6) is not modified. */
+#define MMI_SumSubMul2(f0, f2, f4, f6, f8, f10) \
+  "mov.d      "#f8", "#f0"                    \n\t" \
+  "mov.d      "#f10", "#f2"                   \n\t" \
+  "paddh      "#f0", "#f0", "#f0"             \n\t" \
+  "paddh      "#f2", "#f2", "#f2"             \n\t" \
+  "paddh      "#f0", "#f0", "#f4"             \n\t" \
+  "paddh      "#f2", "#f2", "#f6"             \n\t" \
+  "psubh      "#f8", "#f8", "#f4"             \n\t" \
+  "psubh      "#f10", "#f10", "#f6"           \n\t" \
+  "psubh      "#f8", "#f8", "#f4"             \n\t" \
+  "psubh      "#f10", "#f10", "#f6"           \n\t"
+/* Three MMI_SumSub steps (macro defined outside this file) followed by one MMI_SumSubMul2. WelsDctFourT4_mmi runs it twice per group of rows, each time followed by MMI_TransTwo4x4H. */
+#define MMI_DCT(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22) \
+  MMI_SumSub(f20, f22, f8, f10, f16, f18)   \
+  MMI_SumSub(f0, f2, f4, f6, f16, f18)      \
+  MMI_SumSub(f8, f10, f4, f6, f16, f18)     \
+  MMI_SumSubMul2(f20, f22, f0, f2, f12, f14)
+/* Regroup with two MMI_XSawp_DQ steps (macro defined outside this file), then write 64 bytes to r0 with four gssqc1 quad-word stores at offsets 0x0, 0x10, 0x20, 0x30. */
+#define MMI_Store4x8p(r0, f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
+  MMI_XSawp_DQ(f0, f2, f4, f6, f16, f18)            \
+  MMI_XSawp_DQ(f8, f10, f12, f14, f4, f6)           \
+  "gssqc1     "#f2", "#f0", 0x0("#r0")        \n\t" \
+  "gssqc1     "#f10", "#f8", 0x10("#r0")      \n\t" \
+  "gssqc1     "#f18", "#f16", 0x20("#r0")     \n\t" \
+  "gssqc1     "#f6", "#f4", 0x30("#r0")       \n\t"
+/* Unaligned 8-byte loads from r0 and r1; the low four bytes of each are widened to halfwords by interleaving with f4 (the caller passes zero); result f0 = r0 values - r1 values. f2 is scratch. */
+#define MMI_LoadDiff4P_SINGLE(f0, f2, r0, r1, f4) \
+  "gsldlc1    "#f0", 0x7("#r0")               \n\t" \
+  "gsldlc1    "#f2", 0x7("#r1")               \n\t" \
+  "gsldrc1    "#f0", 0x0("#r0")               \n\t" \
+  "gsldrc1    "#f2", 0x0("#r1")               \n\t" \
+  "punpcklbh  "#f0", "#f0", "#f4"             \n\t" \
+  "punpcklbh  "#f2", "#f2", "#f4"             \n\t" \
+  "psubh      "#f0", "#f0", "#f2"             \n\t"
+/* Four MMI_LoadDiff4P_SINGLE rows into f0, f2, f4, f6 (f8 scratch, f10 the zero operand). Between rows r0 += r1 and r2 += r3, so both pointers are left three rows past their entry value. */
+#define MMI_LoadDiff4x4P_SINGLE(f0, f2, f4, f6, r0, r1, r2, r3, f8, f10) \
+  MMI_LoadDiff4P_SINGLE(f0, f8, r0, r2, f10)        \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  PTR_ADDU   ""#r2", "#r2", "#r3"             \n\t" \
+  MMI_LoadDiff4P_SINGLE(f2, f8, r0, r2, f10)        \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  PTR_ADDU   ""#r2", "#r2", "#r3"             \n\t" \
+  MMI_LoadDiff4P_SINGLE(f4, f8, r0, r2, f10)        \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  PTR_ADDU   ""#r2", "#r2", "#r3"             \n\t" \
+  MMI_LoadDiff4P_SINGLE(f6, f8, r0, r2, f10)
+/* Single-register counterpart of MMI_DCT: three MMI_SumSub_SINGLE steps and one MMI_SumSubMul2_SINGLE (both macros are defined outside this file - check them for which operands are overwritten). */
+#define MMI_DCT_SINGLE(f0, f2, f4, f6, f8, f10, f12) \
+  MMI_SumSub_SINGLE(f6, f0, f10)     \
+  MMI_SumSub_SINGLE(f4, f2, f10)     \
+  MMI_SumSub_SINGLE(f4, f6, f10)     \
+  MMI_SumSubMul2_SINGLE(f0, f2, f8, f12)
+/* 4x4 block: reads 16 int16 coefficients (32 bytes) at pDct, runs two transpose + MMI_IDCT_SINGLE rounds, then emits four rows through MMI_StoreDiff4P_SINGLE using pPred as input and pRec as output (strides in bytes). */
+void WelsIDctT4Rec_mmi(uint8_t* pRec, int32_t iStride, uint8_t* pPred,
+                       int32_t iPredStride, int16_t* pDct) {
+  __asm__ volatile (
+    ".set       arch=loongson3a                    \n\t"
+    "gsldlc1    $f0, 0x7(%[pDct])                  \n\t"
+    "gsldrc1    $f0, 0x0(%[pDct])                  \n\t"
+    "gsldlc1    $f2, 0xF(%[pDct])                  \n\t"
+    "gsldrc1    $f2, 0x8(%[pDct])                  \n\t"
+    "gsldlc1    $f4, 0x17(%[pDct])                 \n\t"
+    "gsldrc1    $f4, 0x10(%[pDct])                 \n\t"
+    "gsldlc1    $f6, 0x1F(%[pDct])                 \n\t"
+    "gsldrc1    $f6, 0x18(%[pDct])                 \n\t"
+    /* constants: $f16 = 1 (passed to MMI_IDCT_SINGLE), $f18 = 6 (passed to MMI_StoreDiff4P_SINGLE) */
+    "dli        $8, 0x1                            \n\t"
+    "dmtc1      $8, $f16                           \n\t"
+    "dli        $8, 0x6                            \n\t"
+    "dmtc1      $8, $f18                           \n\t"
+    /* the *_SINGLE macros below are defined outside this file */
+    MMI_Trans4x4H_SINGLE($f0, $f2, $f4, $f6, $f8)
+    MMI_IDCT_SINGLE($f2, $f4, $f6, $f8, $f0, $f12, $f16)
+    MMI_Trans4x4H_SINGLE($f2, $f6, $f0, $f8, $f4)
+    MMI_IDCT_SINGLE($f6, $f0, $f8, $f4, $f2, $f12, $f16)
+    /* $f14 = 0; $f12 = 0x0020 replicated into all four halfwords */
+    "xor        $f14, $f14, $f14                   \n\t"
+    "dli        $8, 0x0020                         \n\t"
+    "dmtc1      $8, $f12                           \n\t"
+    "punpcklhw  $f12, $f12, $f12                   \n\t"
+    "punpcklwd  $f12, $f12, $f12                   \n\t"
+    /* four output rows; presumably pRec = sat(pPred + ((x + $f12) >> $f18)) as in MMI_StoreDiff8p_6 - confirm in asmdefs_mmi.h */
+    MMI_StoreDiff4P_SINGLE($f6, $f0, $f12, $f14, %[pRec], %[pPred], $f18)
+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
+    MMI_StoreDiff4P_SINGLE($f8, $f0, $f12, $f14, %[pRec], %[pPred], $f18)
+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
+    MMI_StoreDiff4P_SINGLE($f2, $f0, $f12, $f14, %[pRec], %[pPred], $f18)
+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
+    MMI_StoreDiff4P_SINGLE($f4, $f0, $f12, $f14, %[pRec], %[pPred], $f18)
+    : [pRec]"+&r"((uint8_t *)pRec), [pPred]"+&r"((uint8_t *)pPred)
+    : [iStride]"r"((int)iStride), [iPredStride]"r"((int)iPredStride),
+      [pDct]"r"((short *)pDct)
+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16", "$f18"
+  );
+}
+/* 8x8 area: each 64-byte half of pDct (second half at pDct + 0x40) goes through two transpose + MMI_IDCT rounds and is written as four 8-pixel rows, pRec = sat(pPred + ((x + 32) >> 6)). Strides are in bytes. */
+void WelsIDctFourT4Rec_mmi(uint8_t* pRec, int32_t iStride, uint8_t* pPred,
+                           int32_t iPredStride, int16_t* pDct) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                    \n\t"
+    MMI_Load4x8p(%[pDct], $f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f20, $f22)
+    /* $f30 = 1 is the shift count MMI_IDCT forwards to MMI_SumSubDiv2; MMI_TransTwo4x4H is defined outside this file */
+    MMI_TransTwo4x4H($f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f12, $f14)
+    "dli        $8, 0x1                            \n\t"
+    "dmtc1      $8, $f30                           \n\t"
+    MMI_IDCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26,
+             $f0, $f2, $f30)
+    MMI_TransTwo4x4H($f4, $f6, $f16, $f18, $f0, $f2, $f8, $f10, $f12, $f14)
+    MMI_IDCT($f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22, $f24, $f26,
+             $f4, $f6, $f30)
+    /* $f28 = 0 (byte-widening operand), $f26 = 6 (final shift), $f24 = 0x0020 in every halfword (rounding term) */
+    "xor        $f28, $f28, $f28                   \n\t"
+    "dli        $8, 0x6                            \n\t"
+    "dmtc1      $8, $f26                           \n\t"
+    "dli        $8, 0x0020                         \n\t"
+    "dmtc1      $8, $f24                           \n\t"
+    "punpcklhw  $f24, $f24, $f24                   \n\t"
+    "punpcklwd  $f24, $f24, $f24                   \n\t"
+    /* rows 0-3, eight pixels each */
+    MMI_StoreDiff8p_6($f16, $f18, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
+    MMI_StoreDiff8p_6($f0, $f2, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
+    MMI_StoreDiff8p_6($f4, $f6, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
+    MMI_StoreDiff8p_6($f8, $f10, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
+    /* move to the second 64 bytes of coefficients and to row 4 */
+    PTR_ADDIU  "%[pDct], %[pDct], 0x40             \n\t"
+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
+    MMI_Load4x8p(%[pDct], $f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f20, $f22)
+    /* same two rounds as above; $f30 still holds 1 */
+    MMI_TransTwo4x4H($f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f12, $f14)
+    MMI_IDCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26,
+             $f0, $f2, $f30)
+    MMI_TransTwo4x4H($f4, $f6, $f16, $f18, $f0, $f2, $f8, $f10, $f12, $f14)
+    MMI_IDCT($f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22, $f24, $f26,
+             $f4, $f6, $f30)
+    /* $f24/$f26 were handed to MMI_IDCT above, so the rounding term and shift are rebuilt; $f28 was not and is still 0 */
+    "dli        $8, 0x6                            \n\t"
+    "dmtc1      $8, $f26                           \n\t"
+    "dli        $8, 0x0020                         \n\t"
+    "dmtc1      $8, $f24                           \n\t"
+    "punpcklhw  $f24, $f24, $f24                   \n\t"
+    "punpcklwd  $f24, $f24, $f24                   \n\t"
+    /* rows 4-7 */
+    MMI_StoreDiff8p_6($f16, $f18, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
+    MMI_StoreDiff8p_6($f0, $f2, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
+    MMI_StoreDiff8p_6($f4, $f6, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
+    MMI_StoreDiff8p_6($f8, $f10, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
+    : [pRec]"+&r"((uint8_t *)pRec), [pPred]"+&r"((uint8_t *)pPred),
+      [pDct]"+&r"((short *)pDct)
+    : [iStride]"r"((int)iStride), [iPredStride]"r"((int)iPredStride)
+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+/* 16x16 DC-only reconstruct: each of the 16 int16 values at pDct becomes (v + 32) >> 6 and is added, with saturation to unsigned bytes, to a 4-pixel by 4-row patch of pPred; 16 rows of 16 bytes are written to pRec. */
+void WelsIDctRecI16x16Dc_mmi(uint8_t* pRec, int32_t iStride, uint8_t* pPred,
+                             int32_t iPredStride, int16_t* pDct) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                    \n\t"
+    "xor        $f28, $f28, $f28                   \n\t"
+    "dli        $8, 0x0020                         \n\t"
+    "dmtc1      $8, $f24                           \n\t"
+    "punpcklhw  $f24, $f24, $f24                   \n\t"
+    "punpcklwd  $f24, $f24, $f24                   \n\t"
+    "dli        $8, 0x6                            \n\t"
+    "dmtc1      $8, $f30                           \n\t"
+    /* first eight values (byte offset 0x0): one broadcast value per register in $f0..$f14 */
+    MMI_Load8DC($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f24,
+                %[pDct], 0x0, $f30)
+    /* rows 0-1; each MMI_StoreDiff4x8p call writes two rows and steps the pointers once in between */
+    MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],
+                      %[pPred], %[iStride], %[iPredStride])
+    /* rows 2-3 */
+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
+    MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],
+                      %[pPred], %[iStride], %[iPredStride])
+    /* rows 4-5: switch to the values held in $f8..$f14 */
+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
+    MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],
+                      %[pPred], %[iStride], %[iPredStride])
+    /* rows 6-7 */
+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
+    MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],
+                      %[pPred], %[iStride], %[iPredStride])
+    /* second eight values (byte offset 0x10), then rows 8-9 */
+    MMI_Load8DC($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f24, %[pDct], 0x10, $f30)
+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
+    MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],
+                      %[pPred], %[iStride], %[iPredStride])
+    /* rows 10-11 */
+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
+    MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],
+                      %[pPred], %[iStride], %[iPredStride])
+    /* rows 12-13 */
+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
+    MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],
+                      %[pPred], %[iStride], %[iPredStride])
+    /* rows 14-15 */
+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"
+    MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],
+                      %[pPred], %[iStride], %[iPredStride])
+    : [pRec]"+&r"((uint8_t *)pRec), [pPred]"+&r"((uint8_t *)pPred),
+      [pDct]"+&r"((short *)pDct)
+    : [iStride]"r"((int)iStride), [iPredStride]"r"((int)iPredStride)
+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+/* Reads the first int16 of each of 16 consecutive 32-byte groups at pDct (byte offsets 0x0..0x1e0), runs add/sub butterflies with transposes and a rounded halving in 32-bit lanes, and stores 16 saturated int16 results (32 bytes) to luma_dc. */
+void WelsHadamardT4Dc_mmi( int16_t *luma_dc, int16_t *pDct) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    MMI_Load4Col($f4, $f6, $f20, $f24, $f0, %[pDct], 0x0)
+    MMI_Load4Col($f8, $f10, $f20, $f24, $f0, %[pDct], 0x40)
+    MMI_Load4Col($f12, $f14, $f20, $f24, $f0, %[pDct], 0x100)
+    MMI_Load4Col($f16, $f18, $f20, $f24, $f0, %[pDct], 0x140)
+    /* four packed 32-bit butterflies before the first transpose; $f28/$f30 are scratch */
+    MMI_SumSubD($f4, $f6, $f8, $f10, $f28, $f30)
+    MMI_SumSubD($f12, $f14, $f16, $f18, $f28, $f30)
+    MMI_SumSubD($f8, $f10, $f16, $f18, $f28, $f30)
+    MMI_SumSubD($f4, $f6, $f12, $f14, $f28, $f30)
+
+    MMI_Trans4x4W($f16, $f18, $f8, $f10, $f4, $f6, $f12, $f14, $f20, $f22)
+    /* after the transpose: two plain butterflies, then two with rounded halving further down */
+    MMI_SumSubD($f16, $f18, $f12, $f14, $f28, $f30)
+    MMI_SumSubD($f20, $f22, $f4, $f6, $f28, $f30)
+    /* $f30 = 31 so that WELS_DD1 leaves 1 in every 32-bit word of $f24/$f26 */
+    "dli        $8, 0x1F                        \n\t"
+    "dmtc1      $8, $f30                        \n\t"
+
+    WELS_DD1($f24, $f26, $f30)
+    /* $f30 = 1: shift count for MMI_SumSubDiv2D, i.e. (a + b + 1) >> 1 */
+    "dli        $8, 0x1                         \n\t"
+    "dmtc1      $8, $f30                        \n\t"
+
+    MMI_SumSubDiv2D($f12, $f14, $f4, $f6, $f24, $f26, $f0, $f2, $f30)
+    MMI_SumSubDiv2D($f16, $f18, $f20, $f22, $f24, $f26, $f4, $f6, $f30)
+    MMI_Trans4x4W($f12, $f14, $f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10)
+    /* narrow the 32-bit words to int16 with signed saturation, then store 2 x 16 bytes */
+    "packsswh   $f12, $f12, $f14                \n\t"
+    "packsswh   $f14, $f16, $f18                \n\t"
+
+    "packsswh   $f8, $f8, $f10                  \n\t"
+    "packsswh   $f10, $f4, $f6                  \n\t"
+    "gssqc1     $f14, $f12, 0x0(%[luma_dc])     \n\t"
+    "gssqc1     $f10, $f8, 0x10(%[luma_dc])     \n\t"
+   :
+   : [luma_dc]"r"((short *)luma_dc), [pDct]"r"((short *)pDct)
+   : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+     "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+/* 4x4 block: loads four rows of (pix1 - pix2) as int16 (row strides i_pix1 / i_pix2 in bytes), applies MMI_DCT_SINGLE and MMI_Trans4x4H_SINGLE twice each, and writes 16 int16 (32 bytes) to pDct. */
+void WelsDctT4_mmi(int16_t *pDct, uint8_t *pix1, int32_t i_pix1,
+                   uint8_t *pix2, int32_t i_pix2 ) {
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    "xor        $f14, $f14, $f14                \n\t"
+    "dli        $8, 0x1                         \n\t"
+    "dmtc1      $8, $f16                        \n\t"
+    /* difference rows land in $f2, $f4, $f6, $f8; $f0 is scratch and $f14 (zero) widens bytes to halfwords */
+    MMI_LoadDiff4x4P_SINGLE($f2, $f4, $f6, $f8, %[pix1], %[i_pix1],
+                            %[pix2], %[i_pix2], $f0, $f14)
+    /* first pass, then transpose (MMI_Trans4x4H_SINGLE is defined outside this file) */
+    MMI_DCT_SINGLE($f2, $f4, $f6, $f8, $f10, $f12, $f16)
+    MMI_Trans4x4H_SINGLE($f6, $f2, $f8, $f10, $f4)
+    /* second pass, then transpose again */
+    MMI_DCT_SINGLE($f6, $f10, $f4, $f8, $f2, $f12, $f16)
+    MMI_Trans4x4H_SINGLE($f4, $f6, $f8, $f2, $f10)
+    /* unaligned 8-byte stores: $f4, $f2, $f10, $f8 go to pDct + 0x0, 0x8, 0x10, 0x18 */
+    "gssdlc1    $f4, 0x7(%[pDct])               \n\t"
+    "gssdlc1    $f2, 0xF(%[pDct])               \n\t"
+    "gssdlc1    $f10, 0x17(%[pDct])             \n\t"
+    "gssdlc1    $f8, 0x1F(%[pDct])              \n\t"
+    "gssdrc1    $f4, 0x0(%[pDct])               \n\t"
+    "gssdrc1    $f2, 0x8(%[pDct])               \n\t"
+    "gssdrc1    $f10, 0x10(%[pDct])             \n\t"
+    "gssdrc1    $f8, 0x18(%[pDct])              \n\t"
+   : [pDct]"+&r"((short *)pDct), [pix1]"+&r"(pix1), [pix2]"+&r"(pix2)
+   : [i_pix1]"r"(i_pix1), [i_pix2]"r"(i_pix2)
+   : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+     "$f14", "$f16"
+  );
+}
+/* Eight rows in two groups of four: each group is loaded with MMI_LoadDiff8P (defined outside this file; presumably 8 pixel differences pix1 - pix2 per row), transformed, and stored as 64 bytes; the second group goes to pDct + 0x40. */
+void WelsDctFourT4_mmi(int16_t *pDct, uint8_t *pix1, int32_t i_pix1,
+                       uint8_t *pix2, int32_t i_pix2 ) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    MMI_LoadDiff8P($f0, $f2, $f24, $f26, $f28, %[pix1], %[pix2])
+    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t"
+    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"
+    MMI_LoadDiff8P($f4, $f6, $f24, $f26, $f28, %[pix1], %[pix2])
+    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t"
+    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"
+    MMI_LoadDiff8P($f8, $f10, $f24, $f26, $f28, %[pix1], %[pix2])
+    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t"
+    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"
+    MMI_LoadDiff8P($f12, $f14, $f24, $f26, $f28, %[pix1], %[pix2])
+    /* rows 0-3: two MMI_DCT passes, each followed by MMI_TransTwo4x4H (defined outside this file) */
+    MMI_DCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f0, $f2)
+    MMI_TransTwo4x4H($f8, $f10, $f0, $f2, $f12, $f14, $f16, $f18, $f4, $f6)
+    MMI_DCT($f0, $f2, $f16, $f18, $f4, $f6, $f12, $f14, $f20, $f22, $f8, $f10)
+    MMI_TransTwo4x4H($f16, $f18, $f8, $f10, $f4, $f6, $f12, $f14, $f0, $f2)
+    /* store the first 64 bytes of output, then load rows 4-7 */
+    MMI_Store4x8p(%[pDct], $f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22)
+    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t"
+    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"
+    MMI_LoadDiff8P($f0, $f2, $f24, $f26, $f28, %[pix1], %[pix2])
+    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t"
+    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"
+    MMI_LoadDiff8P($f4, $f6, $f24, $f26, $f28, %[pix1], %[pix2])
+    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t"
+    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"
+    MMI_LoadDiff8P($f8, $f10, $f24, $f26, $f28, %[pix1], %[pix2])
+    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t"
+    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"
+    MMI_LoadDiff8P($f12, $f14, $f24, $f26, $f28, %[pix1], %[pix2])
+    /* rows 4-7: same register allocation as the first group */
+    MMI_DCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f0, $f2)
+    MMI_TransTwo4x4H($f8, $f10, $f0, $f2, $f12, $f14, $f16, $f18, $f4, $f6)
+    MMI_DCT($f0, $f2, $f16, $f18, $f4, $f6, $f12, $f14, $f20, $f22, $f8, $f10)
+    MMI_TransTwo4x4H($f16, $f18, $f8, $f10, $f4, $f6, $f12, $f14, $f0, $f2)
+    /* results for rows 4-7 go to the second 64 bytes of pDct */
+    PTR_ADDIU  "%[pDct], %[pDct], 0x40          \n\t"
+    MMI_Store4x8p(%[pDct], $f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22)
+   : [pDct]"+&r"((short *)pDct), [pix1]"+&r"(pix1), [pix2]"+&r"(pix2)
+   : [i_pix1]"r"(i_pix1), [i_pix2]"r"(i_pix2)
+   : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+     "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28"
+  );
+  RECOVER_REG;
+}
--- /dev/null
+++ b/codec/encoder/core/mips/quant_mmi.c
@@ -1,0 +1,553 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2018, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file    quant_mmi.c
+ *
+ * \brief   Loongson optimization
+ *
+ * \date    20/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+void WelsQuant4x4_mmi(int16_t *pDct, const int16_t* ff, const int16_t *mf) { /* Quantize the 16 int16 coefficients at pDct in place: x -> sign(x) * ((sat_u16(|x| + ff[i]) * mf[i]) >> 16); the 8 ff/mf lanes loaded below are reused for both 8-coefficient halves. */
+  __asm__ volatile ( /* NOTE(review): gslqc1/gssqc1 presumably require 16-byte aligned pDct/ff/mf -- confirm against callers */
+    ".set       arch=loongson3a                 \n\t"
+    "xor        $f10, $f10, $f10                \n\t" /* NOTE(review): looks redundant -- $f10 is overwritten by the load on the next line */
+    "gslqc1     $f10, $f8, 0x0(%[ff])           \n\t" /* $f8/$f10 = 16 bytes (8 rounding offsets) from ff */
+    "gslqc1     $f14, $f12, 0x0(%[mf])          \n\t" /* $f12/$f14 = 16 bytes (8 multipliers) from mf */
+    /* ---- first half: bytes 0x00-0x0f of pDct ---- */
+    "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* $f0/$f2 = 8 coefficients */
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t" /* $f4/$f6 = per-lane sign mask: all ones where 0 > x */
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t" /* (x ^ mask) - mask = |x| */
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t" /* |x| + ff, unsigned saturating */
+    "paddush    $f2, $f2, $f10                  \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t" /* keep the high 16 bits of the unsigned product with mf */
+    "pmulhuh    $f2, $f2, $f14                  \n\t"
+    "xor        $f0, $f0, $f4                   \n\t" /* re-apply the original sign with the saved mask */
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* write the 8 quantized values back */
+    /* ---- second half: bytes 0x10-0x1f of pDct, same steps ---- */
+    "gslqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t"
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t"
+    "paddush    $f2, $f2, $f10                  \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t"
+    "pmulhuh    $f2, $f2, $f14                  \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
+   : /* no output operands: results are written through pDct, covered by the memory clobber */
+   : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf)
+   : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
+  );
+}
+
+void WelsQuant4x4Dc_mmi(int16_t *pDct, const int16_t ff, int16_t mf) { /* Quantize the 16 int16 coefficients at pDct in place with one scalar offset and multiplier: x -> sign(x) * ((sat_u16(|x| + ff) * mf) >> 16). */
+  __asm__ volatile ( /* NOTE(review): gslqc1/gssqc1 presumably require a 16-byte aligned pDct -- confirm against callers */
+    ".set       arch=loongson3a                 \n\t"
+    "xor        $f10, $f10, $f10                \n\t" /* $f10 = 0, used as the pshufh selector below */
+    "dmtc1      %[mf], $f12                     \n\t" /* move the scalar mf into $f12 */
+    "pshufh     $f12, $f12, $f10                \n\t" /* zero selector: replicate halfword 0 (mf) into all four lanes */
+
+    "dmtc1      %[ff], $f8                      \n\t" /* move the scalar ff into $f8 */
+    "pshufh     $f8, $f8, $f10                  \n\t" /* replicate halfword 0 (ff) into all four lanes */
+    /* ---- first half: bytes 0x00-0x0f of pDct ---- */
+    "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* $f0/$f2 = 8 coefficients */
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t" /* $f4/$f6 = per-lane sign mask: all ones where 0 > x */
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t" /* (x ^ mask) - mask = |x| */
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t" /* |x| + ff, unsigned saturating; both register halves use the same broadcast $f8 */
+    "paddush    $f2, $f2, $f8                   \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t" /* keep the high 16 bits of the unsigned product with the broadcast mf */
+    "pmulhuh    $f2, $f2, $f12                  \n\t"
+    "xor        $f0, $f0, $f4                   \n\t" /* re-apply the original sign with the saved mask */
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* write the 8 quantized values back */
+    /* ---- second half: bytes 0x10-0x1f of pDct, same steps ---- */
+    "gslqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t"
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t"
+    "paddush    $f2, $f2, $f8                   \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t"
+    "pmulhuh    $f2, $f2, $f12                  \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
+   : /* no output operands: results are written through pDct, covered by the memory clobber */
+   : [pDct]"r"((short *)pDct), [ff]"r"((short)ff), [mf]"r"((short)mf)
+   : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12"
+  );
+}
+
+void WelsQuantFour4x4_mmi(int16_t *pDct, const int16_t* ff, const int16_t *mf) { /* Quantize the 64 int16 coefficients at pDct (bytes 0x00-0x7f) in place: x -> sign(x) * ((sat_u16(|x| + ff[i]) * mf[i]) >> 16); the same 8 ff/mf lanes are applied to every 16-byte group. */
+  __asm__ volatile ( /* NOTE(review): gslqc1/gssqc1 presumably require 16-byte aligned pDct/ff/mf -- confirm against callers */
+    ".set       arch=loongson3a                 \n\t"
+    "gslqc1     $f10, $f8, 0x0(%[ff])           \n\t" /* $f8/$f10 = 16 bytes (8 rounding offsets) from ff */
+    "gslqc1     $f14, $f12, 0x0(%[mf])          \n\t" /* $f12/$f14 = 16 bytes (8 multipliers) from mf */
+    /* ---- bytes 0x00-0x0f ---- */
+    "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* $f0/$f2 = 8 coefficients */
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t" /* $f4/$f6 = per-lane sign mask: all ones where 0 > x */
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t" /* (x ^ mask) - mask = |x| */
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t" /* |x| + ff, unsigned saturating */
+    "paddush    $f2, $f2, $f10                  \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t" /* keep the high 16 bits of the unsigned product with mf */
+    "pmulhuh    $f2, $f2, $f14                  \n\t"
+    "xor        $f0, $f0, $f4                   \n\t" /* re-apply the original sign with the saved mask */
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* write the 8 quantized values back */
+    /* ---- bytes 0x10-0x1f, same steps ---- */
+    "gslqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t"
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t"
+    "paddush    $f2, $f2, $f10                  \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t"
+    "pmulhuh    $f2, $f2, $f14                  \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
+    /* ---- bytes 0x20-0x2f, same steps ---- */
+    "gslqc1     $f2, $f0, 0x20(%[pDct])         \n\t"
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t"
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t"
+    "paddush    $f2, $f2, $f10                  \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t"
+    "pmulhuh    $f2, $f2, $f14                  \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x20(%[pDct])         \n\t"
+    /* ---- bytes 0x30-0x3f, same steps ---- */
+    "gslqc1     $f2, $f0, 0x30(%[pDct])         \n\t"
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t"
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t"
+    "paddush    $f2, $f2, $f10                  \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t"
+    "pmulhuh    $f2, $f2, $f14                  \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x30(%[pDct])         \n\t"
+    /* ---- bytes 0x40-0x4f, same steps ---- */
+    "gslqc1     $f2, $f0, 0x40(%[pDct])         \n\t"
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t"
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t"
+    "paddush    $f2, $f2, $f10                  \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t"
+    "pmulhuh    $f2, $f2, $f14                  \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x40(%[pDct])         \n\t"
+    /* ---- bytes 0x50-0x5f, same steps ---- */
+    "gslqc1     $f2, $f0, 0x50(%[pDct])         \n\t"
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t"
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t"
+    "paddush    $f2, $f2, $f10                  \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t"
+    "pmulhuh    $f2, $f2, $f14                  \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x50(%[pDct])         \n\t"
+    /* ---- bytes 0x60-0x6f, same steps ---- */
+    "gslqc1     $f2, $f0, 0x60(%[pDct])         \n\t"
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t"
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t"
+    "paddush    $f2, $f2, $f10                  \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t"
+    "pmulhuh    $f2, $f2, $f14                  \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x60(%[pDct])         \n\t"
+    /* ---- bytes 0x70-0x7f, same steps ---- */
+    "gslqc1     $f2, $f0, 0x70(%[pDct])         \n\t"
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t"
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t"
+    "paddush    $f2, $f2, $f10                  \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t"
+    "pmulhuh    $f2, $f2, $f14                  \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x70(%[pDct])         \n\t"
+   : /* no output operands: results are written through pDct, covered by the memory clobber */
+   : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf)
+   : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
+  );
+}
+
+void WelsQuantFour4x4Max_mmi(int16_t *pDct, const int16_t*ff,
+                             const int16_t *mf, int16_t *max) { /* Quantize the 64 int16 coefficients at pDct in place (x -> sign(x) * ((sat_u16(|x| + ff[i]) * mf[i]) >> 16)) and store to max[0..3] the largest quantized magnitude of each 32-byte group (0x00, 0x20, 0x40, 0x60). */
+  BACKUP_REG; /* macro from asmdefs_mmi.h (not visible here); presumably saves the callee-saved FP registers clobbered below -- confirm */
+  __asm__ volatile ( /* NOTE(review): gslqc1/gssqc1 presumably require 16-byte aligned pDct/ff/mf -- confirm against callers */
+    ".set       arch=loongson3a                 \n\t"
+    "gslqc1     $f10, $f8, 0x0(%[ff])           \n\t" /* $f8/$f10 = 16 bytes (8 rounding offsets) from ff */
+    "gslqc1     $f14, $f12, 0x0(%[mf])          \n\t" /* $f12/$f14 = 16 bytes (8 multipliers) from mf */
+    /* Zero the four running-max accumulators: $f16/$f18, $f20/$f22, $f24/$f26, $f28/$f30 (one pair per 32-byte group) */
+    "xor        $f16, $f16, $f16                \n\t"
+    "xor        $f18, $f18, $f18                \n\t"
+    "xor        $f20, $f20, $f20                \n\t"
+    "xor        $f22, $f22, $f22                \n\t"
+    "xor        $f24, $f24, $f24                \n\t"
+    "xor        $f26, $f26, $f26                \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    "xor        $f30, $f30, $f30                \n\t"
+    /* ---- group 0, bytes 0x00-0x0f (accumulates into $f16/$f18) ---- */
+    "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* $f0/$f2 = 8 coefficients */
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t" /* $f4/$f6 = per-lane sign mask: all ones where 0 > x */
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t" /* (x ^ mask) - mask = |x| */
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t" /* |x| + ff, unsigned saturating */
+    "paddush    $f2, $f2, $f10                  \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t" /* keep the high 16 bits of the unsigned product with mf */
+    "pmulhuh    $f2, $f2, $f14                  \n\t"
+    "pmaxsh     $f16, $f16, $f0                 \n\t" /* update the running max with the magnitudes, before the sign is restored (signed halfword max) */
+    "pmaxsh     $f18, $f18, $f2                 \n\t"
+    "xor        $f0, $f0, $f4                   \n\t" /* re-apply the original sign with the saved mask */
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* write the 8 quantized values back */
+    /* ---- group 0, bytes 0x10-0x1f (accumulates into $f16/$f18) ---- */
+    "gslqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t"
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t"
+    "paddush    $f2, $f2, $f10                  \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t"
+    "pmulhuh    $f2, $f2, $f14                  \n\t"
+    "pmaxsh     $f16, $f16, $f0                 \n\t"
+    "pmaxsh     $f18, $f18, $f2                 \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
+    /* ---- group 1, bytes 0x20-0x2f (accumulates into $f20/$f22) ---- */
+    "gslqc1     $f2, $f0, 0x20(%[pDct])         \n\t"
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t"
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t"
+    "paddush    $f2, $f2, $f10                  \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t"
+    "pmulhuh    $f2, $f2, $f14                  \n\t"
+    "pmaxsh     $f20, $f20, $f0                 \n\t"
+    "pmaxsh     $f22, $f22, $f2                 \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x20(%[pDct])         \n\t"
+    /* ---- group 1, bytes 0x30-0x3f (accumulates into $f20/$f22) ---- */
+    "gslqc1     $f2, $f0, 0x30(%[pDct])         \n\t"
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t"
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t"
+    "paddush    $f2, $f2, $f10                  \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t"
+    "pmulhuh    $f2, $f2, $f14                  \n\t"
+    "pmaxsh     $f20, $f20, $f0                 \n\t"
+    "pmaxsh     $f22, $f22, $f2                 \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x30(%[pDct])         \n\t"
+    /* ---- group 2, bytes 0x40-0x4f (accumulates into $f24/$f26) ---- */
+    "gslqc1     $f2, $f0, 0x40(%[pDct])         \n\t"
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t"
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t"
+    "paddush    $f2, $f2, $f10                  \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t"
+    "pmulhuh    $f2, $f2, $f14                  \n\t"
+    "pmaxsh     $f24, $f24, $f0                 \n\t"
+    "pmaxsh     $f26, $f26, $f2                 \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x40(%[pDct])         \n\t"
+    /* ---- group 2, bytes 0x50-0x5f (accumulates into $f24/$f26) ---- */
+    "gslqc1     $f2, $f0, 0x50(%[pDct])         \n\t"
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t"
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t"
+    "paddush    $f2, $f2, $f10                  \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t"
+    "pmulhuh    $f2, $f2, $f14                  \n\t"
+    "pmaxsh     $f24, $f24, $f0                 \n\t"
+    "pmaxsh     $f26, $f26, $f2                 \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x50(%[pDct])         \n\t"
+    /* ---- group 3, bytes 0x60-0x6f (accumulates into $f28/$f30) ---- */
+    "gslqc1     $f2, $f0, 0x60(%[pDct])         \n\t"
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t"
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t"
+    "paddush    $f2, $f2, $f10                  \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t"
+    "pmulhuh    $f2, $f2, $f14                  \n\t"
+    "pmaxsh     $f28, $f28, $f0                 \n\t"
+    "pmaxsh     $f30, $f30, $f2                 \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x60(%[pDct])         \n\t"
+    /* ---- group 3, bytes 0x70-0x7f (accumulates into $f28/$f30) ---- */
+    "gslqc1     $f2, $f0, 0x70(%[pDct])         \n\t"
+    "xor        $f4, $f4, $f4                   \n\t"
+    "xor        $f6, $f6, $f6                   \n\t"
+    "pcmpgth    $f4, $f4, $f0                   \n\t"
+    "pcmpgth    $f6, $f6, $f2                   \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "paddush    $f0, $f0, $f8                   \n\t"
+    "paddush    $f2, $f2, $f10                  \n\t"
+    "pmulhuh    $f0, $f0, $f12                  \n\t"
+    "pmulhuh    $f2, $f2, $f14                  \n\t"
+    "pmaxsh     $f28, $f28, $f0                 \n\t"
+    "pmaxsh     $f30, $f30, $f2                 \n\t"
+    "xor        $f0, $f0, $f4                   \n\t"
+    "xor        $f2, $f2, $f6                   \n\t"
+    "psubh      $f0, $f0, $f4                   \n\t"
+    "psubh      $f2, $f2, $f6                   \n\t"
+    "gssqc1     $f2, $f0, 0x70(%[pDct])         \n\t"
+    /* ---- transpose step 1: interleave the halfwords of the group 0 and group 1 accumulators ---- */
+    "mov.d      $f0, $f18                       \n\t" /* keep a copy of $f18 before it is overwritten */
+    "punpckhhw  $f18, $f16, $f20                \n\t"
+    "punpcklhw  $f16, $f16, $f20                \n\t"
+    "punpckhhw  $f2, $f0, $f22                  \n\t"
+    "punpcklhw  $f0, $f0, $f22                  \n\t"
+    /* same halfword interleave for the group 2 and group 3 accumulators */
+    "mov.d      $f20, $f26                      \n\t" /* keep a copy of $f26 before it is overwritten */
+    "punpckhhw  $f26, $f24, $f28                \n\t"
+    "punpcklhw  $f24, $f24, $f28                \n\t"
+    "punpckhhw  $f22, $f20, $f30                \n\t"
+    "punpcklhw  $f20, $f20, $f30                \n\t"
+    /* ---- transpose step 2: interleave 32-bit words so each register holds one value per group, in group order 0,1,2,3 ---- */
+    "mov.d      $f28, $f18                      \n\t"
+    "punpckhwd  $f18, $f16, $f24                \n\t"
+    "punpcklwd  $f16, $f16, $f24                \n\t"
+    "punpckhwd  $f30, $f28, $f26                \n\t"
+    "punpcklwd  $f28, $f28, $f26                \n\t"
+    /* same word interleave for the remaining four registers */
+    "mov.d      $f24, $f2                       \n\t"
+    "punpckhwd  $f2, $f0, $f20                  \n\t"
+    "punpcklwd  $f0, $f0, $f20                  \n\t"
+    "punpckhwd  $f26, $f24, $f22                \n\t"
+    "punpcklwd  $f24, $f24, $f22                \n\t"
+    /* shuffle the eight transposed rows into the register slots used by the reduction below */
+    "mov.d      $f20, $f18                      \n\t"
+    "mov.d      $f18, $f0                       \n\t"
+    "mov.d      $f22, $f2                       \n\t"
+
+    "mov.d      $f0, $f30                       \n\t"
+    "mov.d      $f30, $f24                      \n\t"
+    "mov.d      $f2, $f26                       \n\t"
+    /* ---- reduction: lane-wise max over the transposed rows; $f0 and $f2 each fold four of the eight rows ---- */
+    "pmaxsh     $f0, $f0, $f16                  \n\t"
+    "pmaxsh     $f2, $f2, $f18                  \n\t"
+
+    "pmaxsh     $f0, $f0, $f20                  \n\t"
+    "pmaxsh     $f2, $f2, $f22                  \n\t"
+
+    "pmaxsh     $f0, $f0, $f28                  \n\t"
+    "pmaxsh     $f2, $f2, $f30                  \n\t"
+    /* combine the two partial results: $f0 = max(old $f2, old $f0) */
+    "mov.d      $f4, $f0                        \n\t"
+    "mov.d      $f6, $f2                        \n\t"
+
+    "mov.d      $f0, $f2                        \n\t"
+    "mov.d      $f2, $f6                        \n\t" /* NOTE(review): $f2 is not stored afterwards, so this and the $f2 pmaxsh below look unused */
+
+    "pmaxsh     $f0, $f0, $f4                   \n\t" /* $f0 = four halfwords, one maximum per group */
+    "pmaxsh     $f2, $f2, $f6                   \n\t"
+    /* store the 8 bytes of $f0 to max[0..3] with the left/right store pair (presumably tolerates an unaligned max -- confirm) */
+    "gssdlc1    $f0, 0x7(%[max])                \n\t"
+    "gssdrc1    $f0, 0x0(%[max])                \n\t"
+   : /* no output operands: results are written through pDct and max, covered by the memory clobber */
+   : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf),
+     [max]"r"((short *)max)
+   : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14",
+     "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG; /* counterpart of BACKUP_REG above (defined in asmdefs_mmi.h) */
+}
--- /dev/null
+++ b/codec/encoder/core/mips/score_mmi.c
@@ -1,0 +1,324 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2018, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file    score_mmi.c
+ *
+ * \brief   Loongson optimization
+ *
+ * \date    21/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+unsigned char nozero_count_table[] __attribute__((aligned(16))) = { /* 256-entry lookup: entry i is the number of 1 bits in the byte value i; read by WelsGetNoneZeroCount_mmi */
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4,
+    2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4,
+    2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
+    4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,
+    3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
+    4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
+
+int32_t WelsGetNoneZeroCount_mmi(int16_t *level) { /* Returns how many of the 16 int16 values level[0..15] are non-zero (0..16). */
+  int ret_val = 0; /* written by the last asm instruction */
+  __asm__ volatile( /* NOTE(review): gslqc1 is a quad-word load, so level is presumably expected to be 16-byte aligned - confirm with callers */
+    ".set       arch=loongson3a                 \n\t" /* let the assembler accept Loongson-3A (MMI) mnemonics */
+    "gslqc1     $f2, $f0, 0x0(%[level])         \n\t" /* load the first 16 bytes: level[0..7] */
+    "gslqc1     $f6, $f4, 0x10(%[level])        \n\t" /* load the next 16 bytes: level[8..15] */
+    "xor        $f8, $f8, $f8                   \n\t" /* $f8 = 0, the comparison operand */
+    "pcmpeqh    $f0, $f0, $f8                   \n\t" /* each halfword lane becomes all-ones where the value == 0 */
+    "pcmpeqh    $f2, $f2, $f8                   \n\t"
+    "pcmpeqh    $f4, $f4, $f8                   \n\t"
+    "pcmpeqh    $f6, $f6, $f8                   \n\t"
+    "packsshb   $f4, $f4, $f6                   \n\t" /* narrow to one byte per value: level[8..15] */
+    "packsshb   $f6, $f0, $f2                   \n\t" /* narrow to one byte per value: level[0..7] */
+    "pmovmskb   $f0, $f4                        \n\t" /* gather the byte sign bits into a bit mask (bit set = value is zero) */
+    "pmovmskb   $f2, $f6                        \n\t"
+    "dmfc1      $8, $f0                         \n\t" /* move both masks to integer registers */
+    "dmfc1      $9, $f2                         \n\t"
+    "xor        $8, 0xFF                        \n\t" /* invert the low 8 bits: bit set = value is non-zero */
+    "xor        $9, 0xFF                        \n\t"
+    PTR_ADDU   "$10, $8, %[nozero_count_table]  \n\t" /* table address for the first mask */
+    "lbu        $8, 0x0($10)                    \n\t" /* $8 = number of set bits in that mask */
+    PTR_ADDU   "$10, $9, %[nozero_count_table]  \n\t" /* table address for the second mask */
+    "lbu        $9, 0x0($10)                    \n\t" /* $9 = number of set bits in that mask */
+    PTR_ADDU   "%[ret_val], $8, $9              \n\t" /* total non-zero count */
+    : [ret_val] "=r"(ret_val) /* plain lvalue: a cast expression is not an lvalue and clang rejects it as an asm output */
+    : [level] "r"((unsigned char *)level),
+      [nozero_count_table] "r"((unsigned char *)nozero_count_table)
+    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8"
+  );
+  return ret_val;
+}
+
+void WelsScan4x4DcAc_mmi(int16_t level[16], int16_t *pDct) { /* Reads the 16 int16 values at pDct and stores them, reordered, to level[0..15]; installed as pfScan4x4. */
+  BACKUP_REG; /* macro from asmdefs_mmi.h; presumably saves the callee-saved FP registers used below - confirm */
+  __asm__ volatile( /* lane notes below write dN for pDct[N] and assume the usual MMI layout: lane 0 is the least-significant halfword, and gslqc1 puts the lower-addressed 8 bytes in its second register operand */
+    ".set       arch=loongson3a                 \n\t" /* let the assembler accept Loongson-3A (MMI) mnemonics */
+    "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* $f0 = d0 d1 d2 d3, $f2 = d4 d5 d6 d7 */
+    "gslqc1     $f6, $f4, 0x10(%[pDct])         \n\t" /* $f4 = d8 d9 d10 d11, $f6 = d12 d13 d14 d15 */
+    "dli        $8, 0x3                         \n\t"
+    "dmtc1      $8, $f22                        \n\t" /* $f22 = lane selector 3 */
+    "dli        $8, 0x2                         \n\t"
+    "dmtc1      $8, $f24                        \n\t" /* $f24 = lane selector 2 */
+    "dli        $8, 0x1                         \n\t"
+    "dmtc1      $8, $f26                        \n\t" /* $f26 = lane selector 1 */
+    "dmtc1      $0, $f28                        \n\t" /* $f28 = lane selector 0 */
+    "pextrh     $f18, $f2, $f22                 \n\t" /* $f18 = d7 */
+    "pextrh     $f20, $f4, $f24                 \n\t" /* $f20 = d10 */
+    "pextrh     $f16, $f2, $f26                 \n\t" /* $f16 = d5 */
+    "pinsrh_2   $f4, $f4, $f18                  \n\t" /* $f4 = d8 d9 d7 d11 */
+    "pinsrh_3   $f2, $f2, $f16                  \n\t" /* $f2 = d4 d5 d6 d5 */
+    "pextrh     $f18, $f4, $f28                 \n\t" /* $f18 = d8 */
+    "pinsrh_1   $f2, $f2, $f18                  \n\t" /* $f2 = d4 d8 d6 d5 */
+    "pinsrh_0   $f4, $f4, $f20                  \n\t" /* $f4 = d10 d9 d7 d11 */
+    "dli        $8, 0x93                        \n\t"
+    "dmtc1      $8, $f22                        \n\t" /* $f22 = shuffle control 0x93 (lanes 3,0,1,2) */
+    "dli        $8, 0x39                        \n\t"
+    "dmtc1      $8, $f24                        \n\t" /* $f24 = shuffle control 0x39 (lanes 1,2,3,0) */
+    "punpckhwd  $f10, $f0, $f2                  \n\t" /* $f10 = d2 d3 d6 d5 */
+    "punpcklwd  $f8, $f0, $f2                   \n\t" /* $f8 = d0 d1 d4 d8 */
+    "punpckhwd  $f14, $f4, $f6                  \n\t" /* $f14 = d7 d11 d14 d15 */
+    "punpcklwd  $f12, $f4, $f6                  \n\t" /* $f12 = d10 d9 d12 d13 */
+    "mov.d      $f0, $f8                        \n\t" /* $f0 = d0 d1 d4 d8 */
+    "pshufh     $f2, $f10, $f22                 \n\t" /* $f2 = d5 d2 d3 d6 */
+    "pshufh     $f4, $f12, $f24                 \n\t" /* $f4 = d9 d12 d13 d10 */
+    "mov.d      $f6, $f14                       \n\t" /* $f6 = d7 d11 d14 d15 */
+    "gssqc1     $f2, $f0, 0x0(%[level])         \n\t" /* level[0..7] = d0 d1 d4 d8 d5 d2 d3 d6 */
+    "gssqc1     $f6, $f4, 0x10(%[level])        \n\t" /* level[8..15] = d9 d12 d13 d10 d7 d11 d14 d15 */
+    :
+    : [level] "r"((short *)level), [pDct] "r"((short *)pDct)
+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28"
+  );
+  RECOVER_REG; /* counterpart of BACKUP_REG above */
+}
+
+void WelsScan4x4Ac_mmi(int16_t *zig_value, int16_t *pDct) { /* Same reordering as WelsScan4x4DcAc_mmi but with pDct[0] dropped: 15 values are stored followed by a 0, so 32 bytes are written to zig_value; installed as pfScan4x4Ac. */
+  BACKUP_REG; /* macro from asmdefs_mmi.h; presumably saves the callee-saved FP registers used below - confirm */
+  __asm__ volatile( /* lane notes below write dN for pDct[N] and assume the usual MMI layout: lane 0 is the least-significant halfword, and gslqc1 puts the lower-addressed 8 bytes in its second register operand */
+    ".set       arch=loongson3a                 \n\t" /* let the assembler accept Loongson-3A (MMI) mnemonics */
+    "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* $f0 = d0 d1 d2 d3, $f2 = d4 d5 d6 d7 */
+    "gslqc1     $f6, $f4, 0x10(%[pDct])         \n\t" /* $f4 = d8 d9 d10 d11, $f6 = d12 d13 d14 d15 */
+    "mov.d      $f8, $f2                        \n\t" /* $f8 = d4 d5 d6 d7 */
+    "mov.d      $f2, $f4                        \n\t" /* $f2 = d8 d9 d10 d11 */
+    "mov.d      $f10, $f6                       \n\t" /* $f10 = d12 d13 d14 d15 */
+
+    "mov.d      $f12, $f2                       \n\t" /* $f12 = d8 d9 d10 d11 */
+    "punpckhwd  $f2, $f0, $f8                   \n\t" /* $f2 = d2 d3 d6 d7 */
+    "punpcklwd  $f0, $f0, $f8                   \n\t" /* $f0 = d0 d1 d4 d5 */
+    "punpckhwd  $f14, $f12, $f10                \n\t" /* $f14 = d10 d11 d14 d15 */
+    "punpcklwd  $f12, $f12, $f10                \n\t" /* $f12 = d8 d9 d12 d13 */
+
+    "dmtc1      $0, $f20                        \n\t" /* $f20 = lane selector 0 */
+    "dli        $8, 0x10                        \n\t"
+    "dmtc1      $8, $f22                        \n\t" /* $f22 = shift count 16 bits (one halfword) */
+    "dli        $8, 0x30                        \n\t"
+    "dmtc1      $8, $f24                        \n\t" /* $f24 = shift count 48 bits (three halfwords) */
+    "dli        $8, 0x3                         \n\t"
+    "dmtc1      $8, $f26                        \n\t" /* $f26 = lane selector 3 */
+    "dli        $8, 0x93                        \n\t"
+    "dmtc1      $8, $f28                        \n\t" /* $f28 = shuffle control 0x93 (lanes 3,0,1,2) */
+    "dli        $8, 0x39                        \n\t"
+    "dmtc1      $8, $f30                        \n\t" /* $f30 = shuffle control 0x39 (lanes 1,2,3,0) */
+    "pextrh     $f16, $f0, $f26                 \n\t" /* $f16 = d5 */
+    "pextrh     $f18, $f2, $f26                 \n\t" /* $f18 = d7 */
+    "pinsrh_3   $f2, $f2, $f16                  \n\t" /* $f2 = d2 d3 d6 d5 */
+    "pextrh     $f16, $f14, $f20                \n\t" /* $f16 = d10 */
+    "pinsrh_0   $f14, $f14, $f18                \n\t" /* $f14 = d7 d11 d14 d15 */
+    "pextrh     $f18, $f12, $f20                \n\t" /* $f18 = d8 */
+    "pinsrh_0   $f12, $f12, $f16                \n\t" /* $f12 = d10 d9 d12 d13 */
+    "pinsrh_3   $f0, $f0, $f18                  \n\t" /* $f0 = d0 d1 d4 d8 */
+
+    "mov.d      $f4, $f0                        \n\t" /* $f4 = d0 d1 d4 d8 */
+    "pshufh     $f6, $f2, $f28                  \n\t" /* $f6 = d5 d2 d3 d6 */
+    "pshufh     $f8, $f12, $f30                 \n\t" /* $f8 = d9 d12 d13 d10 */
+    "mov.d      $f10, $f14                      \n\t" /* $f10 = d7 d11 d14 d15 */
+
+    "mov.d      $f12, $f8                       \n\t" /* $f12 = d9 d12 d13 d10 (source for the dsll below) */
+    "mov.d      $f14, $f10                      \n\t" /* this copy is overwritten by the dsll below before it is read */
+    "dsrl       $f4, $f4, $f22                  \n\t" /* shift out d0: $f4 = d1 d4 d8 0 */
+    "pinsrh_3   $f4, $f4, $f6                   \n\t" /* $f4 = d1 d4 d8 d5 */
+    "dsrl       $f6, $f6, $f22                  \n\t" /* $f6 = d2 d3 d6 0 */
+    "dsll       $f14, $f12, $f24                \n\t" /* $f14 = 0 0 0 d9 */
+    "xor        $f12, $f12, $f12                \n\t" /* $f12 = 0 */
+    "or         $f4, $f4, $f12                  \n\t" /* OR with zero: $f4 is unchanged */
+    "or         $f6, $f6, $f14                  \n\t" /* $f6 = d2 d3 d6 d9 */
+    "dsrl       $f8, $f8, $f22                  \n\t" /* $f8 = d12 d13 d10 0 */
+    "pinsrh_3   $f8, $f8, $f10                  \n\t" /* $f8 = d12 d13 d10 d7 */
+    "dsrl       $f10, $f10, $f22                \n\t" /* $f10 = d11 d14 d15 0 */
+    "gssqc1     $f6, $f4, 0x0(%[zig_value])     \n\t" /* zig_value[0..7] = d1 d4 d8 d5 d2 d3 d6 d9 */
+    "gssqc1     $f10, $f8, 0x10(%[zig_value])   \n\t" /* zig_value[8..15] = d12 d13 d10 d7 d11 d14 d15 0 */
+    :
+    : [zig_value] "r"((short *)zig_value), [pDct] "r"((short *)pDct)
+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG; /* counterpart of BACKUP_REG above */
+}
+
+unsigned char i_ds_table[]__attribute__((aligned(16))) = { /* 16 entries; WelsCalculateSingleCtr4x4_mmi indexes it with (lowest set bit index found in mask bits 8..15, or 16) - (highest set bit index found in mask bits 7..1, or 0) - 1 */
+      3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned char high_mask_table[]__attribute__((aligned(16))) = { /* 256 entries; WelsCalculateSingleCtr4x4_mmi indexes it with bits 8..15 of its 16-bit non-zero mask and adds the entry to the returned score */
+      0, 0, 0, 3, 0, 2, 3, 6, 0, 2,
+      2, 5, 3, 5, 6, 9, 0, 1, 2, 5,
+      2, 4, 5, 8, 3, 5, 5, 8, 6, 8,
+      9,12, 0, 1, 1, 4, 2, 4, 5, 8,
+      2, 4, 4, 7, 5, 7, 8,11, 3, 4,
+      5, 8, 5, 7, 8,11, 6, 8, 8,11,
+      9,11,12,15, 0, 1, 1, 4, 1, 3,
+      4, 7, 2, 4, 4, 7, 5, 7, 8,11,
+      2, 3, 4, 7, 4, 6, 7,10, 5, 7,
+      7,10, 8,10,11,14, 3, 4, 4, 7,
+      5, 7, 8,11, 5, 7, 7,10, 8,10,
+     11,14, 6, 7, 8,11, 8,10,11,14,
+      9,11,11,14,12,14,15,18, 0, 0,
+      1, 4, 1, 3, 4, 7, 1, 3, 3, 6,
+      4, 6, 7,10, 2, 3, 4, 7, 4, 6,
+      7,10, 5, 7, 7,10, 8,10,11,14,
+      2, 3, 3, 6, 4, 6, 7,10, 4, 6,
+      6, 9, 7, 9,10,13, 5, 6, 7,10,
+      7, 9,10,13, 8,10,10,13,11,13,
+     14,17, 3, 4, 4, 7, 4, 6, 7,10,
+      5, 7, 7,10, 8,10,11,14, 5, 6,
+      7,10, 7, 9,10,13, 8,10,10,13,
+     11,13,14,17, 6, 7, 7,10, 8,10,
+     11,14, 8,10,10,13,11,13,14,17,
+      9,10,11,14,11,13,14,17,12,14,
+     14,17,15,17,18,21};
+
+unsigned char low_mask_table[]__attribute__((aligned(16))) = { /* 256 entries; WelsCalculateSingleCtr4x4_mmi indexes it with bits 0..7 of its 16-bit non-zero mask and adds the entry to the returned score */
+      0, 3, 2, 6, 2, 5, 5, 9, 1, 5,
+      4, 8, 5, 8, 8,12, 1, 4, 4, 8,
+      4, 7, 7,11, 4, 8, 7,11, 8,11,
+     11,15, 1, 4, 3, 7, 4, 7, 7,11,
+      3, 7, 6,10, 7,10,10,14, 4, 7,
+      7,11, 7,10,10,14, 7,11,10,14,
+     11,14,14,18, 0, 4, 3, 7, 3, 6,
+      6,10, 3, 7, 6,10, 7,10,10,14,
+      3, 6, 6,10, 6, 9, 9,13, 6,10,
+      9,13,10,13,13,17, 4, 7, 6,10,
+      7,10,10,14, 6,10, 9,13,10,13,
+     13,17, 7,10,10,14,10,13,13,17,
+     10,14,13,17,14,17,17,21, 0, 3,
+      3, 7, 3, 6, 6,10, 2, 6, 5, 9,
+      6, 9, 9,13, 3, 6, 6,10, 6, 9,
+      9,13, 6,10, 9,13,10,13,13,17,
+      3, 6, 5, 9, 6, 9, 9,13, 5, 9,
+      8,12, 9,12,12,16, 6, 9, 9,13,
+      9,12,12,16, 9,13,12,16,13,16,
+     16,20, 3, 7, 6,10, 6, 9, 9,13,
+      6,10, 9,13,10,13,13,17, 6, 9,
+      9,13, 9,12,12,16, 9,13,12,16,
+     13,16,16,20, 7,10, 9,13,10,13,
+     13,17, 9,13,12,16,13,16,16,20,
+     10,13,13,17,13,16,16,20,13,17,
+     16,20,17,20,20,24};
+
+int32_t WelsCalculateSingleCtr4x4_mmi(int16_t *pDct) { /* Builds a 16-bit mask of which pDct[0..15] are non-zero and returns the sum of three table look-ups (i_ds_table, low_mask_table, high_mask_table) driven by that mask. */
+  int32_t iSingleCtr = 0; /* the asm uses this output register as its accumulator */
+  __asm__ volatile(
+    ".set       arch=loongson3a                 \n\t" /* let the assembler accept Loongson-3A (MMI) mnemonics */
+    "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* load pDct[0..7] */
+    "gslqc1     $f6, $f4, 0x10(%[pDct])         \n\t" /* load pDct[8..15] */
+    "packsshb   $f0, $f0, $f2                   \n\t" /* saturate to one byte per coefficient (non-zero stays non-zero): pDct[0..7] */
+    "packsshb   $f2, $f4, $f6                   \n\t" /* same for pDct[8..15] */
+
+    "xor        $f10, $f10, $f10                \n\t"
+    "xor        $f8, $f8, $f8                   \n\t" /* $f8 = 0, the comparison operand */
+
+    "pcmpeqb    $f0, $f0, $f8                   \n\t" /* byte becomes all-ones where the coefficient is zero */
+    "pcmpeqb    $f2, $f2, $f8                   \n\t"
+
+    "pmovmskb   $f10, $f0                       \n\t" /* 8-bit is-zero mask for pDct[0..7] */
+    "pmovmskb   $f12, $f2                       \n\t" /* 8-bit is-zero mask for pDct[8..15] */
+    "punpcklbh  $f10, $f10, $f12                \n\t" /* interleave: low byte = first mask, next byte = second mask */
+
+    "dmfc1      $12, $f10                       \n\t"
+    "dli        $8, 0xffff                      \n\t"
+    "xor        $12, $12, $8                    \n\t" /* invert 16 bits: $12 bit i set = pDct[i] is non-zero */
+
+    "xor        %[iSingleCtr], %[iSingleCtr], %[iSingleCtr] \n\t" /* accumulator = 0; uses the early-clobber output so the input-only pDct register is never modified */
+    "dli        $8, 0x80                        \n\t" /* $8 = probe bit for the downward scan, starting at bit 7 */
+    "dli        $9, 0x7                         \n\t" /* $9 = index of that probe bit */
+    "dli        $10, 0x100                      \n\t" /* $10 = probe bit for the upward scan, starting at bit 8 */
+    "dli        $11, 0x8                        \n\t" /* $11 = index of that probe bit */
+
+    "1:                                         \n\t" /* scan bits 7..1 downward for the highest set bit */
+    "and        $13, $12, $8                    \n\t"
+    "bnez       $13, 2f                         \n\t" /* found: $9 holds its index */
+    "nop                                        \n\t"
+    "daddiu     $9, -0x1                        \n\t"
+    "dsrl       $8, 1                           \n\t"
+    "bnez       $9, 1b                          \n\t" /* stops with $9 = 0 without testing bit 0 */
+    "nop                                        \n\t"
+    "2:                                         \n\t" /* scan bits 8..15 upward for the lowest set bit */
+    "and        $13, $12, $10                   \n\t"
+    "bnez       $13, 3f                         \n\t" /* found: $11 holds its index */
+    "nop                                        \n\t"
+    "daddiu     $11, 0x1                        \n\t"
+    "dsll       $10, 1                          \n\t"
+    "daddiu     $13, $11, -0x10                 \n\t"
+    "bltz       $13, 2b                         \n\t" /* continue while $11 < 16; ends with $11 = 16 if none set */
+    "nop                                        \n\t"
+    "3:                                         \n\t"
+    "dsubu      $11, $11, $9                    \n\t"
+    "daddiu     $11, -0x1                       \n\t" /* $11 = $11 - $9 - 1, in 0..15 */
+    PTR_ADDU   "$8, %[i_ds_table], $11          \n\t"
+    "lbu        $10, 0x0($8)                    \n\t" /* tables are unsigned char, so load zero-extended */
+    PTR_ADDU   "%[iSingleCtr], %[iSingleCtr], $10 \n\t" /* accumulator += i_ds_table[$11] */
+    "move       $11, $12                        \n\t"
+    "dli        $10, 0xff                       \n\t"
+    "and        $12, $10                        \n\t" /* $12 = mask bits 0..7 */
+    "dsrl       $11, 0x8                        \n\t"
+    "and        $11, $10                        \n\t" /* $11 = mask bits 8..15 */
+    PTR_ADDU   "$8, %[low_mask_table], $12      \n\t"
+    "lbu        $10, 0x0($8)                    \n\t"
+    PTR_ADDU   "%[iSingleCtr], %[iSingleCtr], $10 \n\t" /* accumulator += low_mask_table[low byte] */
+    PTR_ADDU   "$8, %[high_mask_table], $11     \n\t"
+    "lbu        $10, 0x0($8)                    \n\t"
+    PTR_ADDU   "%[iSingleCtr], %[iSingleCtr], $10 \n\t" /* result = accumulator + high_mask_table[high byte] */
+    : [iSingleCtr] "=&r"(iSingleCtr) /* early-clobber: written before the table inputs are last read */
+    : [pDct] "r"((short *)pDct),
+      [i_ds_table] "r"((unsigned char *)i_ds_table),
+      [high_mask_table] "r"((unsigned char *)high_mask_table),
+      [low_mask_table] "r"((unsigned char *)low_mask_table)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
+      "$f6", "$f8", "$f10", "$f12"
+  );
+  return iSingleCtr;
+}
--- a/codec/encoder/core/src/decode_mb_aux.cpp
+++ b/codec/encoder/core/src/decode_mb_aux.cpp
@@ -302,5 +302,13 @@
     pFuncList->pfIDctI16x16Dc   = WelsIDctRecI16x16Dc_AArch64_neon;
   }
 #endif
+
+#if defined(HAVE_MMI)
+  if (uiCpuFlag & WELS_CPU_MMI) {
+    pFuncList->pfIDctT4         = WelsIDctT4Rec_mmi;
+    pFuncList->pfIDctFourT4     = WelsIDctFourT4Rec_mmi;
+    pFuncList->pfIDctI16x16Dc   = WelsIDctRecI16x16Dc_mmi;
+  }
+#endif//HAVE_MMI
 }
 }
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -592,9 +592,24 @@
     pFuncList->pfCopy8x8Aligned         = WelsCopy8x8_mmi;
     pFuncList->pfCopy8x16Aligned        = WelsCopy8x16_mmi;
 
+    pFuncList->pfGetNoneZeroCount       = WelsGetNoneZeroCount_mmi;
+    pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_mmi;
+
+    pFuncList->pfQuantization4x4        = WelsQuant4x4_mmi;
+    pFuncList->pfQuantizationDc4x4      = WelsQuant4x4Dc_mmi;
+    pFuncList->pfQuantizationFour4x4    = WelsQuantFour4x4_mmi;
+    pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_mmi;
+
     pFuncList->pfCopy16x16Aligned       = WelsCopy16x16_mmi;
     pFuncList->pfCopy16x16NotAligned    = WelsCopy16x16NotAligned_mmi;
     pFuncList->pfCopy16x8NotAligned     = WelsCopy16x8NotAligned_mmi;
+
+    pFuncList->pfScan4x4                = WelsScan4x4DcAc_mmi;
+    pFuncList->pfScan4x4Ac              = WelsScan4x4Ac_mmi;
+    pFuncList->pfCalculateSingleCtr4x4  = WelsCalculateSingleCtr4x4_mmi;
+
+    pFuncList->pfDctT4                  = WelsDctT4_mmi;
+    pFuncList->pfDctFourT4              = WelsDctFourT4_mmi;
   }
 #endif//HAVE_MMI
 }
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -79,10 +79,24 @@
 endif
 OBJS += $(ENCODER_OBJSARM64)
 
+ENCODER_ASM_MIPS_SRCS=\
+	$(ENCODER_SRCDIR)/core/mips/dct_mmi.c\
+	$(ENCODER_SRCDIR)/core/mips/quant_mmi.c\
+	$(ENCODER_SRCDIR)/core/mips/score_mmi.c\
+
+ENCODER_OBJSMIPS += $(ENCODER_ASM_MIPS_SRCS:.c=.$(OBJ))
+ifeq ($(ASM_ARCH), mips)
+ENCODER_OBJS += $(ENCODER_OBJSMIPS)
+endif
+OBJS += $(ENCODER_OBJSMIPS)
+
 OBJS += $(ENCODER_OBJS)
 
 $(ENCODER_SRCDIR)/%.$(OBJ): $(ENCODER_SRCDIR)/%.cpp
 	$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(ENCODER_CFLAGS) $(ENCODER_INCLUDES) -c $(CXX_O) $<
+
+$(ENCODER_SRCDIR)/%.$(OBJ): $(ENCODER_SRCDIR)/%.c
+	$(QUIET_CC)$(CC) $(CFLAGS) $(INCLUDES) $(ENCODER_CFLAGS) $(ENCODER_INCLUDES) -c $(CXX_O) $<
 
 $(ENCODER_SRCDIR)/%.$(OBJ): $(ENCODER_SRCDIR)/%.asm
 	$(QUIET_ASM)$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $@ $<
--- a/test/common/targets.mk
+++ b/test/common/targets.mk
@@ -2,8 +2,8 @@
 COMMON_UNITTEST_CPP_SRCS=\
 	$(COMMON_UNITTEST_SRCDIR)/CWelsListTest.cpp\
 	$(COMMON_UNITTEST_SRCDIR)/ExpandPicture.cpp\
-	$(COMMON_UNITTEST_SRCDIR)/WelsThreadPoolTest.cpp\
 	$(COMMON_UNITTEST_SRCDIR)/WelsTaskListTest.cpp\
+	$(COMMON_UNITTEST_SRCDIR)/WelsThreadPoolTest.cpp\
 
 COMMON_UNITTEST_OBJS += $(COMMON_UNITTEST_CPP_SRCS:.cpp=.$(OBJ))
 
--- a/test/encoder/EncUT_DecodeMbAux.cpp
+++ b/test/encoder/EncUT_DecodeMbAux.cpp
@@ -246,6 +246,11 @@
 }
 #endif
 #endif
+#if defined(HAVE_MMI)
+TEST (DecodeMbAuxTest, WelsIDctT4Rec_mmi) { // passes the MMI routine to the TestIDctT4Rec<int16_t> helper
+  TestIDctT4Rec<int16_t> (WelsIDctT4Rec_mmi); // NOTE(review): no run-time WELS_CPU_MMI check here, unlike the WelsIDctRecI16x16Dc_mmi test in this file - confirm intended
+}
+#endif
 template<typename clip_t>
 void WelsIDctT8Anchor (uint8_t* p_dst, int16_t dct[4][16]) {
   WelsIDctT4Anchor<clip_t> (&p_dst[0],                   dct[0]);
@@ -367,6 +372,42 @@
                                           14); //2^14 limit, (2^15+32) will cause overflow for SSE2.
     WelsIDctRecI16x16DcAnchor (iRefDst, iRefDct);
     WelsIDctRecI16x16Dc_sse2 (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);
+    int ok = -1;
+    for (int i = 0; i < 16; i++) {
+      for (int j = 0; j < 16; j++) {
+        if (iRec[i * FDEC_STRIDE + j] != iRefDst[i * FDEC_STRIDE + j]) {
+          ok = i * 16 + j;
+          break;
+        }
+      }
+    }
+    EXPECT_EQ (ok, -1);
+  }
+}
+#endif
+#if defined(HAVE_MMI)
+TEST (DecodeMbAuxTest, WelsIDctFourT4Rec_mmi) { // passes the MMI routine to the TestIDctFourT4Rec<int16_t> helper
+  TestIDctFourT4Rec<int16_t> (WelsIDctFourT4Rec_mmi); // NOTE(review): no run-time WELS_CPU_MMI check here, unlike the WelsIDctRecI16x16Dc_mmi test that follows - confirm intended
+}
+TEST (DecodeMbAuxTest, WelsIDctRecI16x16Dc_mmi) {
+  int32_t iCpuCores = 0;
+  uint32_t uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
+
+  if (uiCpuFeatureFlag & WELS_CPU_MMI) {
+    uint8_t iRefDst[16 * FDEC_STRIDE];
+    int16_t iRefDct[4][4];
+    ENFORCE_STACK_ALIGN_1D (int16_t, iDct, 16, 16);
+    ENFORCE_STACK_ALIGN_1D (uint8_t, iPred, 16 * FDEC_STRIDE, 16);
+    ENFORCE_STACK_ALIGN_1D (uint8_t, iRec, 16 * FDEC_STRIDE, 16);
+    for (int i = 0; i < 16; i++)
+      for (int j = 0; j < 16; j++)
+        iRefDst[i * FDEC_STRIDE + j] = iPred[i * FDEC_STRIDE + j] = rand() & 255;
+    for (int i = 0; i < 4; i++)
+      for (int j = 0; j < 4; j++)
+        iRefDct[i][j] = iDct[i * 4 + j] = (rand() & ((1 << 15) - 1)) - (1 <<
+                                          14); //2^14 limit, (2^15+32) will cause overflow for SSE2.
+    WelsIDctRecI16x16DcAnchor (iRefDst, iRefDct);
+    WelsIDctRecI16x16Dc_mmi (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);
     int ok = -1;
     for (int i = 0; i < 16; i++) {
       for (int j = 0; j < 16; j++) {
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -315,6 +315,11 @@
     TestGetNoneZeroCount (WelsGetNoneZeroCount_sse42);
 }
 #endif
+#ifdef HAVE_MMI
+TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_mmi) { // passes the MMI routine to the TestGetNoneZeroCount helper
+  TestGetNoneZeroCount (WelsGetNoneZeroCount_mmi); // NOTE(review): no run-time WELS_CPU_MMI check here, unlike the Quant*_mmi tests in this file - confirm intended
+}
+#endif
 #define WELS_ABS_LC(a) ((sign ^ (int32_t)(a)) - sign)
 #define NEW_QUANT(pDct, ff, mf) (((ff)+ WELS_ABS_LC(pDct))*(mf)) >>16
 #define WELS_NEW_QUANT(pDct,ff,mf) WELS_ABS_LC(NEW_QUANT(pDct, ff, mf))
@@ -478,6 +483,24 @@
 }
 #endif //HAVE_AVX2
 #endif
+#ifdef HAVE_MMI
+TEST (EncodeMbAuxTest, WelsQuant4x4_mmi) { // passes WelsQuant4x4_mmi to the TestWelsQuant4x4 helper
+  if (WelsCPUFeatureDetect (0) & WELS_CPU_MMI) // only when the CPU reports MMI at run time; otherwise the test body is empty
+    TestWelsQuant4x4 (WelsQuant4x4_mmi);
+}
+TEST (EncodeMbAuxTest, WelsQuant4x4Dc_mmi) { // passes WelsQuant4x4Dc_mmi to the TestWelsQuant4x4Dc helper
+  if (WelsCPUFeatureDetect (0) & WELS_CPU_MMI) // only when the CPU reports MMI at run time; otherwise the test body is empty
+    TestWelsQuant4x4Dc (WelsQuant4x4Dc_mmi);
+}
+TEST (EncodeMbAuxTest, WelsQuantFour4x4_mmi) { // passes WelsQuantFour4x4_mmi to the TestWelsQuantFour4x4 helper
+  if (WelsCPUFeatureDetect (0) & WELS_CPU_MMI) // only when the CPU reports MMI at run time; otherwise the test body is empty
+    TestWelsQuantFour4x4 (WelsQuantFour4x4_mmi);
+}
+TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_mmi) { // passes WelsQuantFour4x4Max_mmi to the TestWelsQuantFour4x4Max helper
+  if (WelsCPUFeatureDetect (0) & WELS_CPU_MMI) // only when the CPU reports MMI at run time; otherwise the test body is empty
+    TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_mmi);
+}
+#endif //HAVE_MMI
 int32_t WelsHadamardQuant2x2SkipAnchor (int16_t* rs, int16_t ff,  int16_t mf) {
   int16_t pDct[4], s[4];
   int16_t threshold = ((1 << 16) - 1) / mf - ff;
@@ -604,6 +627,23 @@
     iDct[i] = (rand() & 32767) - 16384;
   WelsHadamardT4Dc_c (iLumaDcC, iDct);
   WelsHadamardT4Dc_sse2 (iLumaDcS, iDct);
+  for (int i = 0; i < 16; i++)
+    EXPECT_EQ (iLumaDcC[i], iLumaDcS[i]);
+  FREE_MEMORY (iDct);
+  FREE_MEMORY (iLumaDcC);
+  FREE_MEMORY (iLumaDcS);
+}
+#endif
+#ifdef HAVE_MMI
+TEST (EncodeMbAuxTest, WelsHadamardT4Dc_mmi) {
+  CMemoryAlign cMemoryAlign (0);
+  ALLOC_MEMORY (int16_t, iDct, 128 * 16);
+  ALLOC_MEMORY (int16_t, iLumaDcC, 16);
+  ALLOC_MEMORY (int16_t, iLumaDcS, 16);
+  for (int i = 0; i < 128 * 16; i++)
+    iDct[i] = (rand() & 32767) - 16384;
+  WelsHadamardT4Dc_c (iLumaDcC, iDct);
+  WelsHadamardT4Dc_mmi (iLumaDcS, iDct);
   for (int i = 0; i < 16; i++)
     EXPECT_EQ (iLumaDcC[i], iLumaDcS[i]);
   FREE_MEMORY (iDct);
--- a/test/encoder/targets.mk
+++ b/test/encoder/targets.mk
@@ -17,8 +17,8 @@
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_ParameterSetStrategy.cpp\
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_Reconstruct.cpp\
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_Sample.cpp\
-	$(ENCODER_UNITTEST_SRCDIR)/EncUT_SVC_me.cpp\
 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_SliceBufferReallocate.cpp\
+	$(ENCODER_UNITTEST_SRCDIR)/EncUT_SVC_me.cpp\
 
 ENCODER_UNITTEST_OBJS += $(ENCODER_UNITTEST_CPP_SRCS:.cpp=.$(OBJ))