shithub: openh264

ref: 14750b4797a011da07c8cb718b13aa3a02ac3d84
parent: 906dacd34972e42819dab320960ffcfb7b84aada
author: gxw <guxiwei-hf@loongson.cn>
date: Mon Aug 13 05:23:03 EDT 2018

Add optimization files for the Loongson platform

1. Add dct_mmi.c in codec/decoder/core/mips
2. Add vaa_mmi.c in codec/processing/src/mips
3. Add optimized functions in codec/common/src/mc.cpp

Change-Id: I9060a5f42ac7903b377b48ef7fe92809a2ba4481

--- a/codec/common/src/mc.cpp
+++ b/codec/common/src/mc.cpp
@@ -43,6 +43,7 @@
 #include "cpu_core.h"
 #include "ls_defines.h"
 #include "macros.h"
+#include "asmdefs_mmi.h"
 
 namespace {
 
@@ -1659,6 +1660,2541 @@
 }
 #endif
 
+#if defined(HAVE_MMI)
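+// MMI_LOAD_8P: load 8 pixels from r0 (unaligned-safe gsldlc1/gsldrc1 pair)
+// and widen them against f4, which callers keep zeroed: f0 receives the low
+// four pixels and f2 the high four, each as 4 x 16-bit values.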
+#define MMI_LOAD_8P(f0, f2, f4, r0) \
+  "gsldlc1    "#f0", 0x7("#r0")               \n\t" \
+  "gsldrc1    "#f0", 0x0("#r0")               \n\t" \
+  "punpckhbh  "#f2", "#f0", "#f4"             \n\t" \
+  "punpcklbh  "#f0", "#f0", "#f4"             \n\t"
+
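+// FILTER_HV_W4 applies the H.264 six-tap filter (1, -5, 20, 20, -5, 1) to six
+// rows of widened pixels held in (f0,f2)..(f20,f22): with s1 = row0 + row5,
+// s2 = row1 + row4, s3 = row2 + row3 it evaluates
+//   (s1 + 5 * (4 * s3 - s2) + 16) >> 5  ==  (s1 - 5*s2 + 20*s3 + 16) >> 5,
+// packs with unsigned saturation and stores 4 bytes at r0. r1 is scratch for
+// immediates; f8 is saved to r2 and restored because it doubles as the shift
+// count register.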
+#define FILTER_HV_W4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
+                     f20, f22, f24, f26, f28, f30, r0, r1, r2) \
+  "paddh      "#f0", "#f0", "#f20"            \n\t" \
+  "paddh      "#f2", "#f2", "#f22"            \n\t" \
+  "mov.d      "#f28", "#f8"                   \n\t" \
+  "mov.d      "#f30", "#f10"                  \n\t" \
+  "mov.d      "#f24", "#f4"                   \n\t" \
+  "mov.d      "#f26", "#f6"                   \n\t" \
+  "dmfc1      "#r2", "#f8"                    \n\t" \
+  "dli        "#r1", 0x0010001000100010       \n\t" \
+  "dmtc1      "#r1", "#f8"                    \n\t" \
+  "paddh      "#f0", "#f0", "#f8"             \n\t" \
+  "paddh      "#f2", "#f2", "#f8"             \n\t" \
+  "paddh      "#f28", "#f28", "#f12"          \n\t" \
+  "paddh      "#f30", "#f30", "#f14"          \n\t" \
+  "paddh      "#f24", "#f24", "#f16"          \n\t" \
+  "paddh      "#f26", "#f26", "#f18"          \n\t" \
+  "dli        "#r1", 0x2                      \n\t" \
+  "dmtc1      "#r1", "#f8"                    \n\t" \
+  "psllh      "#f28", "#f28", "#f8"           \n\t" \
+  "psllh      "#f30", "#f30", "#f8"           \n\t" \
+  "psubh      "#f28", "#f28", "#f24"          \n\t" \
+  "psubh      "#f30", "#f30", "#f26"          \n\t" \
+  "paddh      "#f0", "#f0", "#f28"            \n\t" \
+  "paddh      "#f2", "#f2", "#f30"            \n\t" \
+  "psllh      "#f28", "#f28", "#f8"           \n\t" \
+  "psllh      "#f30", "#f30", "#f8"           \n\t" \
+  "paddh      "#f0", "#f0", "#f28"            \n\t" \
+  "paddh      "#f2", "#f2", "#f30"            \n\t" \
+  "dli        "#r1", 0x5                      \n\t" \
+  "dmtc1      "#r1", "#f8"                    \n\t" \
+  "psrah      "#f0", "#f0", "#f8"             \n\t" \
+  "psrah      "#f2", "#f2", "#f8"             \n\t" \
+  "xor        "#f28", "#f28", "#f28"          \n\t" \
+  "packushb   "#f0", "#f0", "#f2"             \n\t" \
+  "gsswlc1    "#f0", 0x3("#r0")               \n\t" \
+  "gsswrc1    "#f0", 0x0("#r0")               \n\t" \
+  "dmtc1      "#r2", "#f8"                    \n\t"
+
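+// FILTER_HV_W8: identical arithmetic to FILTER_HV_W4, but packs and stores a
+// full 8-byte result with the unaligned gssdlc1/gssdrc1 pair.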
+#define FILTER_HV_W8(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
+                     f20, f22, f24, f26, f28, f30, r0, r1, r2) \
+  "paddh      "#f0", "#f0", "#f20"            \n\t" \
+  "paddh      "#f2", "#f2", "#f22"            \n\t" \
+  "mov.d      "#f28", "#f8"                   \n\t" \
+  "mov.d      "#f30", "#f10"                  \n\t" \
+  "mov.d      "#f24", "#f4"                   \n\t" \
+  "mov.d      "#f26", "#f6"                   \n\t" \
+  "dmfc1      "#r2", "#f8"                    \n\t" \
+  "dli        "#r1", 0x0010001000100010       \n\t" \
+  "dmtc1      "#r1", "#f8"                    \n\t" \
+  "paddh      "#f0", "#f0", "#f8"             \n\t" \
+  "paddh      "#f2", "#f2", "#f8"             \n\t" \
+  "paddh      "#f28", "#f28", "#f12"          \n\t" \
+  "paddh      "#f30", "#f30", "#f14"          \n\t" \
+  "paddh      "#f24", "#f24", "#f16"          \n\t" \
+  "paddh      "#f26", "#f26", "#f18"          \n\t" \
+  "dli        "#r1", 0x2                      \n\t" \
+  "dmtc1      "#r1", "#f8"                    \n\t" \
+  "psllh      "#f28", "#f28", "#f8"           \n\t" \
+  "psllh      "#f30", "#f30", "#f8"           \n\t" \
+  "psubh      "#f28", "#f28", "#f24"          \n\t" \
+  "psubh      "#f30", "#f30", "#f26"          \n\t" \
+  "paddh      "#f0", "#f0", "#f28"            \n\t" \
+  "paddh      "#f2", "#f2", "#f30"            \n\t" \
+  "psllh      "#f28", "#f28", "#f8"           \n\t" \
+  "psllh      "#f30", "#f30", "#f8"           \n\t" \
+  "paddh      "#f0", "#f0", "#f28"            \n\t" \
+  "paddh      "#f2", "#f2", "#f30"            \n\t" \
+  "dli        "#r1", 0x5                      \n\t" \
+  "dmtc1      "#r1", "#f8"                    \n\t" \
+  "psrah      "#f0", "#f0", "#f8"             \n\t" \
+  "psrah      "#f2", "#f2", "#f8"             \n\t" \
+  "xor        "#f28", "#f28", "#f28"          \n\t" \
+  "packushb   "#f0", "#f0", "#f2"             \n\t" \
+  "gssdlc1    "#f0", 0x7("#r0")               \n\t" \
+  "gssdrc1    "#f0", 0x0("#r0")               \n\t" \
+  "dmtc1      "#r2", "#f8"                    \n\t"
+
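+// FILTER_VER_ALIGN: vertical six-tap over six rows of 16-bit horizontal taps.
+// With s1/s2/s3 as above it uses the shift factorization
+//   s3 + ((((s1 - s2) >> 2) + s3 - s2) >> 2),
+// which equals (s1 - 5*s2 + 20*s3) >> 4 up to shift truncation, then adds the
+// rounding halfwords passed in r4 and shifts right by 6, giving in total
+// (s1 - 5*s2 + 20*s3 + 512) >> 10. The 8-byte result is stored aligned at
+// r0 + r1 (gssdxc1); f8 is saved to r3 and restored at the end.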
+#define FILTER_VER_ALIGN(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
+                         f20, f22, f24, f26, f28, f30, r0, r1, r2, r3, r4) \
+  "paddh      "#f0", "#f0", "#f20"            \n\t" \
+  "paddh      "#f2", "#f2", "#f22"            \n\t" \
+  "mov.d      "#f24", "#f4"                   \n\t" \
+  "mov.d      "#f26", "#f6"                   \n\t" \
+  "mov.d      "#f28", "#f8"                   \n\t" \
+  "mov.d      "#f30", "#f10"                  \n\t" \
+  "dli        "#r2", 0x2                      \n\t" \
+  "paddh      "#f24", "#f24", "#f16"          \n\t" \
+  "paddh      "#f26", "#f26", "#f18"          \n\t" \
+  "dmfc1      "#r3", "#f8"                    \n\t" \
+  "paddh      "#f28", "#f28", "#f12"          \n\t" \
+  "paddh      "#f30", "#f30", "#f14"          \n\t" \
+  "dmtc1      "#r2", "#f8"                    \n\t" \
+  "psubh      "#f0", "#f0", "#f24"            \n\t" \
+  "psubh      "#f2", "#f2", "#f26"            \n\t" \
+  "psrah      "#f0", "#f0", "#f8"             \n\t" \
+  "psrah      "#f2", "#f2", "#f8"             \n\t" \
+  "paddh      "#f0", "#f0", "#f28"            \n\t" \
+  "paddh      "#f2", "#f2", "#f30"            \n\t" \
+  "psubh      "#f0", "#f0", "#f24"            \n\t" \
+  "psubh      "#f2", "#f2", "#f26"            \n\t" \
+  "psrah      "#f0", "#f0", "#f8"             \n\t" \
+  "psrah      "#f2", "#f2", "#f8"             \n\t" \
+  "dmtc1      "#r4", "#f8"                    \n\t" \
+  "paddh      "#f28", "#f28", "#f0"           \n\t" \
+  "paddh      "#f30", "#f30", "#f2"           \n\t" \
+  "dli        "#r2", 0x6                      \n\t" \
+  "paddh      "#f28", "#f28", "#f8"           \n\t" \
+  "paddh      "#f30", "#f30", "#f8"           \n\t" \
+  "dmtc1      "#r2", "#f8"                    \n\t" \
+  "psrah      "#f28", "#f28", "#f8"           \n\t" \
+  "psrah      "#f30", "#f30", "#f8"           \n\t" \
+  "packushb   "#f28", "#f28", "#f30"          \n\t" \
+  "gssdxc1    "#f28", 0x0("#r0", "#r1")       \n\t" \
+  "dmtc1      "#r3", "#f8"                    \n\t"
+
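+// FILTER_VER_UNALIGN: same arithmetic as FILTER_VER_ALIGN, but the rounding
+// halfwords arrive in r3, f8 is saved through r2, and the 8-byte result is
+// stored unaligned via gssdlc1/gssdrc1.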
+#define FILTER_VER_UNALIGN(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
+                           f20, f22, f24, f26, f28, f30, r0, r1, r2, r3) \
+  "paddh      "#f0", "#f0", "#f20"            \n\t" \
+  "paddh      "#f2", "#f2", "#f22"            \n\t" \
+  "mov.d      "#f24", "#f4"                   \n\t" \
+  "mov.d      "#f26", "#f6"                   \n\t" \
+  "mov.d      "#f28", "#f8"                   \n\t" \
+  "mov.d      "#f30", "#f10"                  \n\t" \
+  "dli        "#r1", 0x2                      \n\t" \
+  "paddh      "#f24", "#f24", "#f16"          \n\t" \
+  "paddh      "#f26", "#f26", "#f18"          \n\t" \
+  "dmfc1      "#r2", "#f8"                    \n\t" \
+  "paddh      "#f28", "#f28", "#f12"          \n\t" \
+  "paddh      "#f30", "#f30", "#f14"          \n\t" \
+  "dmtc1      "#r1", "#f8"                    \n\t" \
+  "psubh      "#f0", "#f0", "#f24"            \n\t" \
+  "psubh      "#f2", "#f2", "#f26"            \n\t" \
+  "psrah      "#f0", "#f0", "#f8"             \n\t" \
+  "psrah      "#f2", "#f2", "#f8"             \n\t" \
+  "paddh      "#f0", "#f0", "#f28"            \n\t" \
+  "paddh      "#f2", "#f2", "#f30"            \n\t" \
+  "psubh      "#f0", "#f0", "#f24"            \n\t" \
+  "psubh      "#f2", "#f2", "#f26"            \n\t" \
+  "psrah      "#f0", "#f0", "#f8"             \n\t" \
+  "psrah      "#f2", "#f2", "#f8"             \n\t" \
+  "dmtc1      "#r3", "#f8"                    \n\t" \
+  "paddh      "#f28", "#f28", "#f0"           \n\t" \
+  "paddh      "#f30", "#f30", "#f2"           \n\t" \
+  "dli        "#r1", 0x6                      \n\t" \
+  "paddh      "#f28", "#f28", "#f8"           \n\t" \
+  "paddh      "#f30", "#f30", "#f8"           \n\t" \
+  "dmtc1      "#r1", "#f8"                    \n\t" \
+  "psrah      "#f28", "#f28", "#f8"           \n\t" \
+  "psrah      "#f30", "#f30", "#f8"           \n\t" \
+  "packushb   "#f28", "#f28", "#f30"          \n\t" \
+  "gssdlc1    "#f28", 0x7("#r0")              \n\t" \
+  "gssdrc1    "#f28", 0x0("#r0")              \n\t" \
+  "dmtc1      "#r2", "#f8"                    \n\t"
+
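+/*
+ * McHorVer20Width5_mmi: horizontal half-pel filter for 5-pixel-wide blocks.
+ * For reference only (not compiled), a scalar sketch of one output row,
+ * assuming the standard six taps and a hypothetical Clip255() helper:
+ *
+ *   for (int x = 0; x < 5; x++)
+ *     pDst[x] = Clip255((pSrc[x - 2] + pSrc[x + 3]
+ *                        - 5 * (pSrc[x - 1] + pSrc[x + 2])
+ *                        + 20 * (pSrc[x] + pSrc[x + 1]) + 16) >> 5);
+ */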
+void McHorVer20Width5_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
+                          int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    PTR_ADDIU  "%[pSrc], %[pSrc], -0x2          \n\t"
+    "dli        $8, 0x2                         \n\t"
+    "dli        $10, 0x0010001000100010         \n\t"
+    "dli        $11, 0x5                        \n\t"
+    "1:                                         \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
+    "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
+    "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
+    "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
+    "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
+    "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
+    "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
+    "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
+    "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
+    "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
+    "punpckhbh  $f2, $f0, $f28                  \n\t"
+    "punpckhbh  $f6, $f4, $f28                  \n\t"
+    "punpckhbh  $f10, $f8, $f28                 \n\t"
+    "punpckhbh  $f14, $f12, $f28                \n\t"
+    "punpckhbh  $f18, $f16, $f28                \n\t"
+    "punpckhbh  $f22, $f20, $f28                \n\t"
+    "punpcklbh  $f0, $f0, $f28                  \n\t"
+    "punpcklbh  $f4, $f4, $f28                  \n\t"
+    "punpcklbh  $f8, $f8, $f28                  \n\t"
+    "punpcklbh  $f12, $f12, $f28                \n\t"
+    "punpcklbh  $f16, $f16, $f28                \n\t"
+    "punpcklbh  $f20, $f20, $f28                \n\t"
+
+    "mov.d      $f28, $f8                       \n\t"
+    "mov.d      $f30, $f10                      \n\t"
+    "paddh      $f28, $f28, $f12                \n\t"
+    "paddh      $f30, $f30, $f14                \n\t"
+    "mov.d      $f24, $f16                      \n\t"
+    "mov.d      $f26, $f18                      \n\t"
+    "paddh      $f24, $f24, $f20                \n\t"
+    "paddh      $f26, $f26, $f22                \n\t"
+    "dmfc1      $9, $f12                        \n\t"
+    "dmtc1      $8, $f12                        \n\t"
+    "psllh      $f24, $f24, $f12                \n\t"
+    "psllh      $f26, $f26, $f12                \n\t"
+    "psubh      $f24, $f24, $f28                \n\t"
+    "psubh      $f26, $f26, $f30                \n\t"
+    "paddh      $f0, $f0, $f4                   \n\t"
+    "paddh      $f2, $f2, $f6                   \n\t"
+    "paddh      $f0, $f0, $f24                  \n\t"
+    "paddh      $f2, $f2, $f26                  \n\t"
+    "psllh      $f24, $f24, $f12                \n\t"
+    "psllh      $f26, $f26, $f12                \n\t"
+    "paddh      $f0, $f0, $f24                  \n\t"
+    "paddh      $f2, $f2, $f26                  \n\t"
+
+    "dmtc1      $10, $f12                       \n\t"
+    "paddh      $f0, $f0, $f12                  \n\t"
+    "paddh      $f2, $f2, $f12                  \n\t"
+    "dmtc1      $11, $f12                       \n\t"
+    "psrah      $f0, $f0, $f12                  \n\t"
+    "psrah      $f2, $f2, $f12                  \n\t"
+    "packushb   $f0, $f0, $f2                   \n\t"
+
+    "gsswlc1    $f0, 0x3(%[pDst])               \n\t"
+    "gsswrc1    $f0, 0x0(%[pDst])               \n\t"
+
+    "gsldlc1    $f0, 0xd(%[pSrc])               \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    "gsldrc1    $f0, 0x6(%[pSrc])               \n\t"
+    "punpckhbh  $f2, $f0, $f28                  \n\t"
+    "punpcklbh  $f0, $f0, $f28                  \n\t"
+    "dmtc1      $9, $f12                        \n\t"
+    "dmtc1      $8, $f24                        \n\t"
+
+    "paddh      $f16, $f16, $f4                 \n\t"
+    "paddh      $f18, $f18, $f6                 \n\t"
+    "paddh      $f20, $f20, $f12                \n\t"
+    "paddh      $f22, $f22, $f14                \n\t"
+    "psllh      $f20, $f20, $f24                \n\t"
+    "psllh      $f22, $f22, $f24                \n\t"
+    "psubh      $f20, $f20, $f16                \n\t"
+    "psubh      $f22, $f22, $f18                \n\t"
+    "paddh      $f8, $f8, $f0                   \n\t"
+    "paddh      $f10, $f10, $f2                 \n\t"
+    "paddh      $f8, $f8, $f20                  \n\t"
+    "paddh      $f10, $f10, $f22                \n\t"
+    "psllh      $f20, $f20, $f24                \n\t"
+    "psllh      $f22, $f22, $f24                \n\t"
+    "paddh      $f8, $f8, $f20                  \n\t"
+    "paddh      $f10, $f10, $f22                \n\t"
+
+    "dmtc1      $10, $f24                       \n\t"
+    "paddh      $f8, $f8, $f24                  \n\t"
+    "paddh      $f10, $f10, $f24                \n\t"
+    "dmtc1      $11, $f24                       \n\t"
+    "psrah      $f8, $f8, $f24                  \n\t"
+    "psrah      $f10, $f10, $f24                \n\t"
+    "packushb   $f8, $f8, $f10                  \n\t"
+    "gsswlc1    $f8, 0x4(%[pDst])               \n\t"
+    "gsswrc1    $f8, 0x1(%[pDst])               \n\t"
+
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "bnez       %[iHeight], 1b                  \n\t"
+    : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
+      [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
+    : [iSrcStride]"r"((int)iSrcStride),  [iDstStride]"r"((int)iDstStride)
+    : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
+      "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
+      "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
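+// McHorVer20Width9Or17_mmi: the same horizontal half-pel filter for widths 9
+// and 17; the bne on iWidth dispatches to the 9-wide loop (label 1) or the
+// 17-wide loop (label 2).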
+void McHorVer20Width9Or17_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
+                              int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    PTR_ADDIU  "%[pSrc], %[pSrc], -0x2          \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    "dli        $8, 0x2                         \n\t"
+    "dli        $9, 0x9                         \n\t"
+    "dli        $10, 0x0010001000100010         \n\t"
+    "dli        $11, 0x5                        \n\t"
+    "bne        %[iWidth], $9, 2f               \n\t"
+    "1:                                         \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
+    "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
+    "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
+    "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
+    "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
+    "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
+    "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
+    "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
+    "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
+    "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
+    "punpckhbh  $f2, $f0, $f28                  \n\t"
+    "punpckhbh  $f6, $f4, $f28                  \n\t"
+    "punpckhbh  $f10, $f8, $f28                 \n\t"
+    "punpckhbh  $f14, $f12, $f28                \n\t"
+    "punpckhbh  $f18, $f16, $f28                \n\t"
+    "punpckhbh  $f22, $f20, $f28                \n\t"
+    "punpcklbh  $f0, $f0, $f28                  \n\t"
+    "punpcklbh  $f4, $f4, $f28                  \n\t"
+    "punpcklbh  $f8, $f8, $f28                  \n\t"
+    "punpcklbh  $f12, $f12, $f28                \n\t"
+    "punpcklbh  $f16, $f16, $f28                \n\t"
+    "punpcklbh  $f20, $f20, $f28                \n\t"
+
+    "mov.d      $f28, $f8                       \n\t"
+    "mov.d      $f30, $f10                      \n\t"
+    "paddh      $f28, $f28, $f12                \n\t"
+    "paddh      $f30, $f30, $f14                \n\t"
+    "mov.d      $f24, $f16                      \n\t"
+    "mov.d      $f26, $f18                      \n\t"
+    "paddh      $f24, $f24, $f20                \n\t"
+    "paddh      $f26, $f26, $f22                \n\t"
+    "dmfc1      $9, $f12                        \n\t"
+    "dmtc1      $8, $f12                        \n\t"
+    "psllh      $f24, $f24, $f12                \n\t"
+    "psllh      $f26, $f26, $f12                \n\t"
+    "psubh      $f24, $f24, $f28                \n\t"
+    "psubh      $f26, $f26, $f30                \n\t"
+    "paddh      $f0, $f0, $f4                   \n\t"
+    "paddh      $f2, $f2, $f6                   \n\t"
+    "paddh      $f0, $f0, $f24                  \n\t"
+    "paddh      $f2, $f2, $f26                  \n\t"
+    "psllh      $f24, $f24, $f12                \n\t"
+    "psllh      $f26, $f26, $f12                \n\t"
+    "paddh      $f0, $f0, $f24                  \n\t"
+    "paddh      $f2, $f2, $f26                  \n\t"
+
+    "dmtc1      $10, $f12                       \n\t"
+    "paddh      $f0, $f0, $f12                  \n\t"
+    "paddh      $f2, $f2, $f12                  \n\t"
+    "dmtc1      $11, $f12                       \n\t"
+    "psrah      $f0, $f0, $f12                  \n\t"
+    "psrah      $f2, $f2, $f12                  \n\t"
+    "packushb   $f0, $f0, $f2                   \n\t"
+
+    "gsswlc1    $f0, 0x3(%[pDst])               \n\t"
+    "gsswrc1    $f0, 0x0(%[pDst])               \n\t"
+
+    "gsldlc1    $f0, 0xd(%[pSrc])               \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    "gsldrc1    $f0, 0x6(%[pSrc])               \n\t"
+    "punpckhbh  $f2, $f0, $f28                  \n\t"
+    "punpcklbh  $f0, $f0, $f28                  \n\t"
+    "dmtc1      $9, $f12                        \n\t"
+    "dmtc1      $8, $f24                        \n\t"
+
+    "paddh      $f16, $f16, $f4                 \n\t"
+    "paddh      $f18, $f18, $f6                 \n\t"
+    "paddh      $f20, $f20, $f12                \n\t"
+    "paddh      $f22, $f22, $f14                \n\t"
+    "psllh      $f20, $f20, $f24                \n\t"
+    "psllh      $f22, $f22, $f24                \n\t"
+    "psubh      $f20, $f20, $f16                \n\t"
+    "psubh      $f22, $f22, $f18                \n\t"
+    "paddh      $f8, $f8, $f0                   \n\t"
+    "paddh      $f10, $f10, $f2                 \n\t"
+    "paddh      $f8, $f8, $f20                  \n\t"
+    "paddh      $f10, $f10, $f22                \n\t"
+    "psllh      $f20, $f20, $f24                \n\t"
+    "psllh      $f22, $f22, $f24                \n\t"
+    "paddh      $f8, $f8, $f20                  \n\t"
+    "paddh      $f10, $f10, $f22                \n\t"
+
+    "dmtc1      $10, $f24                       \n\t"
+    "paddh      $f8, $f8, $f24                  \n\t"
+    "paddh      $f10, $f10, $f24                \n\t"
+    "dmtc1      $11, $f24                       \n\t"
+    "psrah      $f8, $f8, $f24                  \n\t"
+    "psrah      $f10, $f10, $f24                \n\t"
+    "packushb   $f8, $f8, $f10                  \n\t"
+    "gssdlc1    $f8, 0x8(%[pDst])               \n\t"
+    "gssdrc1    $f8, 0x1(%[pDst])               \n\t"
+
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "bnez       %[iHeight], 1b                  \n\t"
+    "j          3f                              \n\t"
+
+    "2:                                         \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
+    "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
+    "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
+    "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
+    "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
+    "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
+    "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
+    "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
+    "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
+    "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
+    "punpckhbh  $f2, $f0, $f28                  \n\t"
+    "punpckhbh  $f6, $f4, $f28                  \n\t"
+    "punpckhbh  $f10, $f8, $f28                 \n\t"
+    "punpckhbh  $f14, $f12, $f28                \n\t"
+    "punpckhbh  $f18, $f16, $f28                \n\t"
+    "punpckhbh  $f22, $f20, $f28                \n\t"
+    "punpcklbh  $f0, $f0, $f28                  \n\t"
+    "punpcklbh  $f4, $f4, $f28                  \n\t"
+    "punpcklbh  $f8, $f8, $f28                  \n\t"
+    "punpcklbh  $f12, $f12, $f28                \n\t"
+    "punpcklbh  $f16, $f16, $f28                \n\t"
+    "punpcklbh  $f20, $f20, $f28                \n\t"
+
+    "dmtc1      $8, $f30                        \n\t"
+    "paddh      $f8, $f8, $f12                  \n\t"
+    "paddh      $f10, $f10, $f14                \n\t"
+    "paddh      $f16, $f16, $f20                \n\t"
+    "paddh      $f18, $f18, $f22                \n\t"
+    "psllh      $f16, $f16, $f30                \n\t"
+    "psllh      $f18, $f18, $f30                \n\t"
+    "psubh      $f16, $f16, $f8                 \n\t"
+    "psubh      $f18, $f18, $f10                \n\t"
+    "paddh      $f0, $f0, $f4                   \n\t"
+    "paddh      $f2, $f2, $f6                   \n\t"
+    "paddh      $f0, $f0, $f16                  \n\t"
+    "paddh      $f2, $f2, $f18                  \n\t"
+    "psllh      $f16, $f16, $f30                \n\t"
+    "psllh      $f18, $f18, $f30                \n\t"
+    "paddh      $f0, $f0, $f16                  \n\t"
+    "paddh      $f2, $f2, $f18                  \n\t"
+
+    "dmtc1      $10, $f30                       \n\t"
+    "paddh      $f0, $f0, $f30                  \n\t"
+    "paddh      $f2, $f2, $f30                  \n\t"
+    "dmtc1      $11, $f30                       \n\t"
+    "psrah      $f0, $f0, $f30                  \n\t"
+    "psrah      $f2, $f2, $f30                  \n\t"
+    "packushb   $f0, $f0, $f2                   \n\t"
+    "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
+    "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
+
+    "gsldlc1    $f0, 15(%[pSrc])                \n\t"
+    "gsldlc1    $f4, 0x14(%[pSrc])              \n\t"
+    "gsldlc1    $f8, 0x10(%[pSrc])              \n\t"
+    "gsldlc1    $f12, 0x13(%[pSrc])             \n\t"
+    "gsldlc1    $f16, 0x11(%[pSrc])             \n\t"
+    "gsldlc1    $f20, 0x12(%[pSrc])             \n\t"
+    "gsldrc1    $f0, 8(%[pSrc])                 \n\t"
+    "gsldrc1    $f4, 0xd(%[pSrc])               \n\t"
+    "gsldrc1    $f8, 0x9(%[pSrc])               \n\t"
+    "gsldrc1    $f12, 0xc(%[pSrc])              \n\t"
+    "gsldrc1    $f16, 0xa(%[pSrc])              \n\t"
+    "gsldrc1    $f20, 0xb(%[pSrc])              \n\t"
+    "punpckhbh  $f2, $f0, $f28                  \n\t"
+    "punpckhbh  $f6, $f4, $f28                  \n\t"
+    "punpckhbh  $f10, $f8, $f28                 \n\t"
+    "punpckhbh  $f14, $f12, $f28                \n\t"
+    "punpckhbh  $f18, $f16, $f28                \n\t"
+    "punpckhbh  $f22, $f20, $f28                \n\t"
+    "punpcklbh  $f0, $f0, $f28                  \n\t"
+    "punpcklbh  $f4, $f4, $f28                  \n\t"
+    "punpcklbh  $f8, $f8, $f28                  \n\t"
+    "punpcklbh  $f12, $f12, $f28                \n\t"
+    "punpcklbh  $f16, $f16, $f28                \n\t"
+    "punpcklbh  $f20, $f20, $f28                \n\t"
+
+    "mov.d      $f28, $f8                       \n\t"
+    "mov.d      $f30, $f10                      \n\t"
+    "paddh      $f28, $f28, $f12                \n\t"
+    "paddh      $f30, $f30, $f14                \n\t"
+    "mov.d      $f24, $f16                      \n\t"
+    "mov.d      $f26, $f18                      \n\t"
+    "paddh      $f24, $f24, $f20                \n\t"
+    "paddh      $f26, $f26, $f22                \n\t"
+    "dmfc1      $9, $f12                        \n\t"
+    "dmtc1      $8, $f12                        \n\t"
+    "psllh      $f24, $f24, $f12                \n\t"
+    "psllh      $f26, $f26, $f12                \n\t"
+    "psubh      $f24, $f24, $f28                \n\t"
+    "psubh      $f26, $f26, $f30                \n\t"
+    "paddh      $f0, $f0, $f4                   \n\t"
+    "paddh      $f2, $f2, $f6                   \n\t"
+    "paddh      $f0, $f0, $f24                  \n\t"
+    "paddh      $f2, $f2, $f26                  \n\t"
+    "psllh      $f24, $f24, $f12                \n\t"
+    "psllh      $f26, $f26, $f12                \n\t"
+    "paddh      $f0, $f0, $f24                  \n\t"
+    "paddh      $f2, $f2, $f26                  \n\t"
+
+    "dmtc1      $10, $f30                       \n\t"
+    "paddh      $f0, $f0, $f30                  \n\t"
+    "paddh      $f2, $f2, $f30                  \n\t"
+    "dmtc1      $11, $f30                       \n\t"
+    "psrah      $f0, $f0, $f30                  \n\t"
+    "psrah      $f2, $f2, $f30                  \n\t"
+    "packushb   $f0, $f0, $f2                   \n\t"
+    "gsswlc1    $f0, 0xb(%[pDst])               \n\t"
+    "gsswrc1    $f0, 0x8(%[pDst])               \n\t"
+
+    "dmtc1      $9, $f12                        \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    "dli        $9, 0x20                        \n\t"
+    "gsldlc1    $f0, 0x15(%[pSrc])              \n\t"
+    "dmtc1      $9, $f30                        \n\t"
+    "gsldrc1    $f0, 0xE(%[pSrc])               \n\t"
+    "punpckhbh  $f2, $f0, $f28                  \n\t"
+    "punpcklbh  $f0, $f0, $f28                  \n\t"
+    "dmtc1      $8, $f24                        \n\t"
+
+    "paddh      $f16, $f16, $f4                 \n\t"
+    "paddh      $f18, $f18, $f6                 \n\t"
+    "paddh      $f20, $f20, $f12                \n\t"
+    "paddh      $f22, $f22, $f14                \n\t"
+    "psllh      $f20, $f20, $f24                \n\t"
+    "psllh      $f22, $f22, $f24                \n\t"
+    "psubh      $f20, $f20, $f16                \n\t"
+    "psubh      $f22, $f22, $f18                \n\t"
+    "paddh      $f8, $f8, $f0                   \n\t"
+    "paddh      $f10, $f10, $f2                 \n\t"
+    "paddh      $f8, $f8, $f20                  \n\t"
+    "paddh      $f10, $f10, $f22                \n\t"
+    "psllh      $f20, $f20, $f24                \n\t"
+    "psllh      $f22, $f22, $f24                \n\t"
+    "paddh      $f8, $f8, $f20                  \n\t"
+    "paddh      $f10, $f10, $f22                \n\t"
+
+    "dmtc1      $10, $f24                       \n\t"
+    "paddh      $f8, $f8, $f24                  \n\t"
+    "paddh      $f10, $f10, $f24                \n\t"
+    "dmtc1      $11, $f24                       \n\t"
+    "psrah      $f8, $f8, $f24                  \n\t"
+    "psrah      $f10, $f10, $f24                \n\t"
+    "packushb   $f8, $f8, $f10                  \n\t"
+    "gssdlc1    $f8, 0x10(%[pDst])              \n\t"
+    "gssdrc1    $f8, 0x9(%[pDst])               \n\t"
+
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "bnez       %[iHeight], 2b                  \n\t"
+    "3:                                         \n\t"
+    : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
+      [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
+    : [iSrcStride]"r"((int)iSrcStride),  [iDstStride]"r"((int)iDstStride)
+    : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
+      "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
+      "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+//horizontal filter to obtain the half-pel sample, i.e. the (2, 0) position in quarter-pel units
+static inline void McHorVer20Width5Or9Or17_mmi(const uint8_t* pSrc, int32_t iSrcStride,
+                                               uint8_t* pDst, int32_t iDstStride,
+                                               int32_t iWidth, int32_t iHeight) {
+  if (iWidth == 17 || iWidth == 9)
+    McHorVer20Width9Or17_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+  else //if (iWidth == 5)
+    McHorVer20Width5_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+}
+
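+// McHorVer02Height5_mmi: vertical half-pel filter, processed in 4-pixel-wide
+// columns (iWidth >> 2 passes). Six source rows stay in f-registers and are
+// rotated after each output row, so the steady-state loop at label 2 loads
+// only one new row per FILTER_HV_W4 and cycles through eight register
+// phasings before jumping back.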
+void McHorVer02Height5_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
+                           int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    "move       $12, %[pSrc]                    \n\t"
+    "move       $13, %[pDst]                    \n\t"
+    "move       $14, %[iHeight]                 \n\t"
+
+    "dsrl       %[iWidth], %[iWidth], 0x2       \n\t"
+    PTR_ADDU   "$10, %[iSrcStride], %[iSrcStride] \n\t"
+    PTR_SUBU   "%[pSrc], %[pSrc], $10           \n\t"
+
+    "1:                                         \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    MMI_LOAD_8P($f0, $f2, $f28, %[pSrc])
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f4, $f6, $f28, $8)
+
+    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
+    MMI_LOAD_8P($f8, $f10, $f28, %[pSrc])
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f12, $f14, $f28, $8)
+    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
+    MMI_LOAD_8P($f16, $f18, $f28, %[pSrc])
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f20, $f22, $f28, $8)
+    FILTER_HV_W4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
+                 $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
+    MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
+    "mov.d      $f0, $f4                        \n\t"
+    "mov.d      $f2, $f6                        \n\t"
+    "mov.d      $f4, $f8                        \n\t"
+    "mov.d      $f6, $f10                       \n\t"
+    "mov.d      $f8, $f12                       \n\t"
+    "mov.d      $f10, $f14                      \n\t"
+    "mov.d      $f12, $f16                      \n\t"
+    "mov.d      $f14, $f18                      \n\t"
+    "mov.d      $f16, $f20                      \n\t"
+    "mov.d      $f18, $f22                      \n\t"
+    "mov.d      $f20, $f24                      \n\t"
+    "mov.d      $f22, $f26                      \n\t"
+
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+
+    "2:                                         \n\t"
+    FILTER_HV_W4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
+                 $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 3f                  \n\t"
+
+    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
+    MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    FILTER_HV_W4($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
+                 $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 3f                  \n\t"
+
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f28, $f30, $f0, $8)
+    FILTER_HV_W4($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
+                 $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 3f                  \n\t"
+
+    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
+    MMI_LOAD_8P($f0, $f2, $f4, %[pSrc])
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    FILTER_HV_W4($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
+                 $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 3f                  \n\t"
+
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f4, $f6, $f8, $8)
+    FILTER_HV_W4($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6,
+                 $f8, $f10, $f12, $f14, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 3f                  \n\t"
+
+    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
+    MMI_LOAD_8P($f8, $f10, $f12, %[pSrc])
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    FILTER_HV_W4($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10,
+                 $f12, $f14, $f16, $f18, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 3f                  \n\t"
+
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f12, $f14, $f16, $8)
+    FILTER_HV_W4($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14,
+                 $f16, $f18, $f20, $f22, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 3f                  \n\t"
+
+    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
+    MMI_LOAD_8P($f16, $f18, $f20, %[pSrc])
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    FILTER_HV_W4($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18,
+                 $f20, $f22, $f24, $f26, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 3f                  \n\t"
+
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f20, $f22, $f24, $8)
+    "j          2b                              \n\t"
+
+    "3:                                         \n\t"
+    PTR_ADDIU  "%[iWidth], %[iWidth], -0x1      \n\t"
+    "beqz       %[iWidth], 4f                   \n\t"
+    "move       %[pSrc], $12                    \n\t"
+    "move       %[pDst], $13                    \n\t"
+    "move       %[iHeight], $14                 \n\t"
+    PTR_SUBU   "%[pSrc], %[pSrc], $10           \n\t"
+    PTR_ADDIU  "%[pSrc], %[pSrc], 0x4           \n\t"
+    PTR_ADDIU  "%[pDst], %[pDst], 0x4           \n\t"
+    "j          1b                              \n\t"
+    "4:                                         \n\t"
+    : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
+      [iWidth]"+&r"(iWidth), [iHeight]"+&r"(iHeight)
+    : [iSrcStride]"r"(iSrcStride),  [iDstStride]"r"(iDstStride)
+    : "memory", "$8", "$9", "$10", "$12", "$13", "$14", "$f0", "$f2", "$f4",
+      "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
+      "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
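+// McHorVer02Height9Or17_mmi: as above, but in 8-pixel-wide columns
+// (iWidth >> 3 passes) with FILTER_HV_W8 producing full 8-byte stores.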
+void McHorVer02Height9Or17_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
+                               int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    "move       $12, %[pSrc]                    \n\t"
+    "move       $13, %[pDst]                    \n\t"
+    "move       $14, %[iHeight]                 \n\t"
+
+    "dsrl       %[iWidth], %[iWidth], 0x3       \n\t"
+    PTR_ADDU   "$10, %[iSrcStride], %[iSrcStride] \n\t"
+    PTR_SUBU   "%[pSrc], %[pSrc], $10           \n\t"
+
+    "1:                                         \n\t"
+    "dli        $8, 0x20                        \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    "dmtc1      $8, $f30                        \n\t"
+
+    MMI_LOAD_8P($f0, $f2, $f28, %[pSrc])
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f4, $f6, $f28, $8)
+    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
+    MMI_LOAD_8P($f8, $f10, $f28, %[pSrc])
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f12, $f14, $f28, $8)
+    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
+    MMI_LOAD_8P($f16, $f18, $f28, %[pSrc])
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f20, $f22, $f28, $8)
+    FILTER_HV_W8($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
+                 $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
+    MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
+    "mov.d      $f0, $f4                        \n\t"
+    "mov.d      $f2, $f6                        \n\t"
+    "mov.d      $f4, $f8                        \n\t"
+    "mov.d      $f6, $f10                       \n\t"
+    "mov.d      $f8, $f12                       \n\t"
+    "mov.d      $f10, $f14                      \n\t"
+    "mov.d      $f12, $f16                      \n\t"
+    "mov.d      $f14, $f18                      \n\t"
+    "mov.d      $f16, $f20                      \n\t"
+    "mov.d      $f18, $f22                      \n\t"
+    "mov.d      $f20, $f24                      \n\t"
+    "mov.d      $f22, $f26                      \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+
+    "2:                                         \n\t"
+    FILTER_HV_W8($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
+                 $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
+    "dmtc1      $9, $f8                         \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 3f                  \n\t"
+
+    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
+    MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
+    PTR_ADDU   "%[pDst],  %[pDst], %[iDstStride] \n\t"
+    FILTER_HV_W8($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
+                 $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 3f                  \n\t"
+
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f28, $f30, $f0, $8)
+    FILTER_HV_W8($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
+                 $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 3f                  \n\t"
+
+    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
+    MMI_LOAD_8P($f0, $f2, $f4, %[pSrc])
+    PTR_ADDU   "%[pDst],  %[pDst], %[iDstStride] \n\t"
+    FILTER_HV_W8($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
+                 $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 3f                  \n\t"
+
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f4, $f6, $f8, $8)
+    FILTER_HV_W8($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4,
+                 $f6, $f8, $f10, $f12, $f14, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 3f                  \n\t"
+
+    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
+    MMI_LOAD_8P($f8, $f10, $f12, %[pSrc])
+    PTR_ADDU   "%[pDst],  %[pDst], %[iDstStride] \n\t"
+    FILTER_HV_W8($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8,
+                 $f10, $f12, $f14, $f16, $f18, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 3f                  \n\t"
+
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f12, $f14, $f16, $8)
+    FILTER_HV_W8($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12,
+                 $f14, $f16, $f18, $f20, $f22, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 3f                  \n\t"
+
+    PTR_ADDU   "%[pSrc], %[pSrc], $10           \n\t"
+    MMI_LOAD_8P($f16, $f18, $f20, %[pSrc])
+    PTR_ADDU   "%[pDst],  %[pDst], %[iDstStride] \n\t"
+    FILTER_HV_W8($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16,
+                 $f18, $f20, $f22, $f24, $f26, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 3f                  \n\t"
+
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f20, $f22, $f24, $8)
+    "j          2b                              \n\t"
+
+    "3:                                         \n\t"
+    PTR_ADDIU  "%[iWidth], %[iWidth], -0x1      \n\t"
+    "beqz       %[iWidth], 4f                   \n\t"
+
+    "move       %[pSrc], $12                    \n\t"
+    "move       %[pDst], $13                    \n\t"
+    "move       %[iHeight], $14                 \n\t"
+    PTR_SUBU   "%[pSrc], %[pSrc], $10           \n\t"
+    PTR_ADDIU  "%[pSrc], %[pSrc], 0x8           \n\t"
+    PTR_ADDIU  "%[pDst], %[pDst], 0x8           \n\t"
+    "j          1b                              \n\t"
+    "4:                                         \n\t"
+    : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
+      [iWidth]"+&r"(iWidth), [iHeight]"+&r"(iHeight)
+    : [iSrcStride]"r"(iSrcStride),  [iDstStride]"r"(iDstStride)
+    : "memory", "$8", "$9", "$10", "$12", "$13", "$14", "$f0", "$f2", "$f4",
+      "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
+      "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+//vertical filter to obtain the half-pel sample, i.e. the (0, 2) position in quarter-pel units
+static inline void McHorVer02Height5Or9Or17_mmi(const uint8_t* pSrc, int32_t iSrcStride,
+                                                uint8_t* pDst, int32_t iDstStride,
+                                                int32_t iWidth, int32_t iHeight) {
+  if (iWidth == 16 || iWidth == 8)
+    McHorVer02Height9Or17_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+  else
+    McHorVer02Height5_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+}
+
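+// McHorVer22HorFirst_mmi: horizontal pass of the (2, 2) center position. It
+// writes the raw 16-bit six-tap sums, without rounding or downshift, into the
+// pTap buffer; the vertical pass below completes the filter with
+// (value + 512) >> 10.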
+static inline void McHorVer22HorFirst_mmi(const uint8_t *pSrc, int32_t iSrcStride,
+                                          uint8_t * pTap, int32_t iTapStride,
+                                          int32_t iWidth, int32_t iHeight) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    "dli        $8, 0x9                         \n\t"
+    PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "bne        %[iWidth], $8, 2f               \n\t"
+
+    "1:                                         \n\t"
+    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
+    "punpckhbh  $f2, $f0, $f28                  \n\t"
+    "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
+    "punpcklbh  $f0, $f0, $f28                  \n\t"
+    "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
+    "punpckhbh  $f6, $f4, $f28                  \n\t"
+    "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
+    "punpcklbh  $f4, $f4, $f28                  \n\t"
+    "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
+    "punpckhbh  $f10, $f8, $f28                 \n\t"
+    "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
+    "punpcklbh  $f8, $f8, $f28                  \n\t"
+    "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
+    "punpckhbh  $f14, $f12, $f28                \n\t"
+    "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
+    "punpcklbh  $f12, $f12, $f28                \n\t"
+    "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
+    "punpckhbh  $f18, $f16, $f28                \n\t"
+    "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
+    "punpcklbh  $f16, $f16, $f28                \n\t"
+    "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
+    "punpckhbh  $f22, $f20, $f28                \n\t"
+    "punpcklbh  $f20, $f20, $f28                \n\t"
+
+    "mov.d      $f28, $f8                       \n\t"
+    "mov.d      $f30, $f10                      \n\t"
+    "paddh      $f28, $f28, $f12                \n\t"
+    "paddh      $f30, $f30, $f14                \n\t"
+    "mov.d      $f24, $f16                      \n\t"
+    "mov.d      $f26, $f18                      \n\t"
+    "paddh      $f24, $f24, $f20                \n\t"
+    "paddh      $f26, $f26, $f22                \n\t"
+    "dli        $8, 0x2                         \n\t"
+    "dmfc1      $9, $f12                        \n\t"
+    "dmtc1      $8, $f12                        \n\t"
+    "psllh      $f24, $f24, $f12                \n\t"
+    "psllh      $f26, $f26, $f12                \n\t"
+    "psubh      $f24, $f24, $f28                \n\t"
+    "psubh      $f26, $f26, $f30                \n\t"
+    "paddh      $f0, $f0, $f4                   \n\t"
+    "paddh      $f2, $f2, $f6                   \n\t"
+    "paddh      $f0, $f0, $f24                  \n\t"
+    "paddh      $f2, $f2, $f26                  \n\t"
+    "psllh      $f24, $f24, $f12                \n\t"
+    "psllh      $f26, $f26, $f12                \n\t"
+    "paddh      $f0, $f0, $f24                  \n\t"
+    "paddh      $f2, $f2, $f26                  \n\t"
+    "gsswlc1    $f0, 0x3(%[pTap])               \n\t"
+    "gsswrc1    $f0, 0x0(%[pTap])               \n\t"
+
+    "gsldlc1    $f0, 0xd(%[pSrc])               \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    "gsldrc1    $f0, 0x6(%[pSrc])               \n\t"
+    "punpckhbh  $f2, $f0, $f28                  \n\t"
+    "punpcklbh  $f0, $f0, $f28                  \n\t"
+    "dli        $8, 0x2                         \n\t"
+    "dmtc1      $9, $f12                        \n\t"
+    "dmtc1      $8, $f24                        \n\t"
+
+    "paddh      $f16, $f16, $f4                 \n\t"
+    "paddh      $f18, $f18, $f6                 \n\t"
+    "paddh      $f20, $f20, $f12                \n\t"
+    "paddh      $f22, $f22, $f14                \n\t"
+    "psllh      $f20, $f20, $f24                \n\t"
+    "psllh      $f22, $f22, $f24                \n\t"
+    "psubh      $f20, $f20, $f16                \n\t"
+    "psubh      $f22, $f22, $f18                \n\t"
+    "paddh      $f8, $f8, $f0                   \n\t"
+    "paddh      $f10, $f10, $f2                 \n\t"
+    "paddh      $f8, $f8, $f20                  \n\t"
+    "paddh      $f10, $f10, $f22                \n\t"
+    "psllh      $f20, $f20, $f24                \n\t"
+    "psllh      $f22, $f22, $f24                \n\t"
+    "paddh      $f8, $f8, $f20                  \n\t"
+    "paddh      $f10, $f10, $f22                \n\t"
+    "gssdlc1    $f8, 0x9(%[pTap])               \n\t"
+    "gssdlc1    $f10, 0x11(%[pTap])             \n\t"
+    "gssdrc1    $f8, 0x2(%[pTap])               \n\t"
+    "gssdrc1    $f10, 0xa(%[pTap])              \n\t"
+
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_ADDU   "%[pTap], %[pTap], %[iTapStride] \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "bnez       %[iHeight], 1b                  \n\t"
+    "j          3f                              \n\t"
+
+    "2:                                         \n\t"
+    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
+    "punpckhbh  $f2, $f0, $f28                  \n\t"
+    "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
+    "punpcklbh  $f0, $f0, $f28                  \n\t"
+    "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
+    "punpckhbh  $f6, $f4, $f28                  \n\t"
+    "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
+    "punpcklbh  $f4, $f4, $f28                  \n\t"
+    "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
+    "punpckhbh  $f10, $f8, $f28                 \n\t"
+    "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
+    "punpcklbh  $f8, $f8, $f28                  \n\t"
+    "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
+    "punpckhbh  $f14, $f12, $f28                \n\t"
+    "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
+    "punpcklbh  $f12, $f12, $f28                \n\t"
+    "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
+    "punpckhbh  $f18, $f16, $f28                \n\t"
+    "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
+    "punpcklbh  $f16, $f16, $f28                \n\t"
+    "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
+    "punpckhbh  $f22, $f20, $f28                \n\t"
+    "dli        $8, 0x2                         \n\t"
+    "punpcklbh  $f20, $f20, $f28                \n\t"
+
+    "dmtc1      $8, $f30                        \n\t"
+    "paddh      $f8, $f8, $f12                  \n\t"
+    "paddh      $f10, $f10, $f14                \n\t"
+    "paddh      $f16, $f16, $f20                \n\t"
+    "paddh      $f18, $f18, $f22                \n\t"
+    "psllh      $f16, $f16, $f30                \n\t"
+    "psllh      $f18, $f18, $f30                \n\t"
+    "psubh      $f16, $f16, $f8                 \n\t"
+    "psubh      $f18, $f18, $f10                \n\t"
+    "paddh      $f0, $f0, $f4                   \n\t"
+    "paddh      $f2, $f2, $f6                   \n\t"
+    "paddh      $f0, $f0, $f16                  \n\t"
+    "paddh      $f2, $f2, $f18                  \n\t"
+    "psllh      $f16, $f16, $f30                \n\t"
+    "psllh      $f18, $f18, $f30                \n\t"
+    "paddh      $f0, $f0, $f16                  \n\t"
+    "paddh      $f2, $f2, $f18                  \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pTap])          \n\t"
+
+    "gsldlc1    $f0, 15(%[pSrc])                \n\t"
+    "gsldrc1    $f0, 8(%[pSrc])                 \n\t"
+    "punpckhbh  $f2, $f0, $f28                  \n\t"
+    "gsldlc1    $f4, 0x14(%[pSrc])              \n\t"
+    "punpcklbh  $f0, $f0, $f28                  \n\t"
+    "gsldrc1    $f4, 0xd(%[pSrc])               \n\t"
+    "punpckhbh  $f6, $f4, $f28                  \n\t"
+    "gsldlc1    $f8, 0x10(%[pSrc])              \n\t"
+    "punpcklbh  $f4, $f4, $f28                  \n\t"
+    "gsldrc1    $f8, 0x9(%[pSrc])               \n\t"
+    "punpckhbh  $f10, $f8, $f28                 \n\t"
+    "gsldlc1    $f12, 0x13(%[pSrc])             \n\t"
+    "punpcklbh  $f8, $f8, $f28                  \n\t"
+    "gsldrc1    $f12, 0xc(%[pSrc])              \n\t"
+    "punpckhbh  $f14, $f12, $f28                \n\t"
+    "gsldlc1    $f16, 0x11(%[pSrc])             \n\t"
+    "punpcklbh  $f12, $f12, $f28                \n\t"
+    "gsldrc1    $f16, 0xa(%[pSrc])              \n\t"
+    "punpckhbh  $f18, $f16, $f28                \n\t"
+    "gsldlc1    $f20, 0x12(%[pSrc])             \n\t"
+    "punpcklbh  $f16, $f16, $f28                \n\t"
+    "gsldrc1    $f20, 0xb(%[pSrc])              \n\t"
+    "punpckhbh  $f22, $f20, $f28                \n\t"
+    "punpcklbh  $f20, $f20, $f28                \n\t"
+
+    "mov.d      $f28, $f8                       \n\t"
+    "mov.d      $f30, $f10                      \n\t"
+    "paddh      $f28, $f28, $f12                \n\t"
+    "paddh      $f30, $f30, $f14                \n\t"
+    "mov.d      $f24, $f16                      \n\t"
+    "mov.d      $f26, $f18                      \n\t"
+    "dli        $8, 0x2                         \n\t"
+    "paddh      $f24, $f24, $f20                \n\t"
+    "paddh      $f26, $f26, $f22                \n\t"
+    "dmfc1      $9, $f12                        \n\t"
+    "dmtc1      $8, $f12                        \n\t"
+    "psllh      $f24, $f24, $f12                \n\t"
+    "psllh      $f26, $f26, $f12                \n\t"
+    "psubh      $f24, $f24, $f28                \n\t"
+    "psubh      $f26, $f26, $f30                \n\t"
+    "paddh      $f0, $f0, $f4                   \n\t"
+    "paddh      $f2, $f2, $f6                   \n\t"
+    "paddh      $f0, $f0, $f24                  \n\t"
+    "paddh      $f2, $f2, $f26                  \n\t"
+    "psllh      $f24, $f24, $f12                \n\t"
+    "psllh      $f26, $f26, $f12                \n\t"
+    "paddh      $f0, $f0, $f24                  \n\t"
+    "paddh      $f2, $f2, $f26                  \n\t"
+    "gsswlc1    $f0, 0x13(%[pTap])              \n\t"
+    "gsswrc1    $f0, 0x10(%[pTap])              \n\t"
+
+    "gsldlc1    $f0, 0x15(%[pSrc])              \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    "gsldrc1    $f0, 0xE(%[pSrc])               \n\t"
+    "punpckhbh  $f2, $f0, $f28                  \n\t"
+    "punpcklbh  $f0, $f0, $f28                  \n\t"
+    "dli        $8, 0x2                         \n\t"
+    "dmtc1      $9, $f12                        \n\t"
+    "dmtc1      $8, $f24                        \n\t"
+
+    "paddh      $f16, $f16, $f4                 \n\t"
+    "paddh      $f18, $f18, $f6                 \n\t"
+    "paddh      $f20, $f20, $f12                \n\t"
+    "paddh      $f22, $f22, $f14                \n\t"
+    "psllh      $f20, $f20, $f24                \n\t"
+    "psllh      $f22, $f22, $f24                \n\t"
+    "psubh      $f20, $f20, $f16                \n\t"
+    "psubh      $f22, $f22, $f18                \n\t"
+    "paddh      $f8, $f8, $f0                   \n\t"
+    "paddh      $f10, $f10, $f2                 \n\t"
+    "paddh      $f8, $f8, $f20                  \n\t"
+    "paddh      $f10, $f10, $f22                \n\t"
+    "psllh      $f20, $f20, $f24                \n\t"
+    "psllh      $f22, $f22, $f24                \n\t"
+    "paddh      $f8, $f8, $f20                  \n\t"
+    "paddh      $f10, $f10, $f22                \n\t"
+    "gssdlc1    $f8, 0x19(%[pTap])              \n\t"
+    "gssdlc1    $f10, 0x21(%[pTap])             \n\t"
+    "gssdrc1    $f8, 0x12(%[pTap])              \n\t"
+    "gssdrc1    $f10, 0x1a(%[pTap])             \n\t"
+
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_ADDU   "%[pTap], %[pTap], %[iTapStride] \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "bnez       %[iHeight], 2b                  \n\t"
+    "3:                                         \n\t"
+    : [pSrc]"+&r"(pSrc), [pTap]"+&r"(pTap), [iWidth]"+&r"(iWidth),
+      [iHeight]"+&r"(iHeight)
+    : [iSrcStride]"r"(iSrcStride),  [iTapStride]"r"(iTapStride)
+    : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
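+// Vertical last pass over the 16-bit tap buffer, aligned variant. gslqc1
+// loads a 128-bit quadword into an even/odd FPR pair but needs a 16-byte
+// aligned address, so this routine handles the tap columns that start on a
+// 16-byte boundary; McHorVer22Width8VerLastUnAlign_mmi below covers the
+// remainder with gsldlc1/gsldrc1 unaligned pairs.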
+static inline void McHorVer22Width8VerLastAlign_mmi(const uint8_t *pTap,
+                   int32_t iTapStride, uint8_t * pDst, int32_t iDstStride,
+                   int32_t iWidth, int32_t iHeight) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    "move       $10, %[pTap]                    \n\t"
+    "move       $11, %[pDst]                    \n\t"
+    "move       $12, %[iHeight]                 \n\t"
+    "dsrl       %[iWidth], 0x3                  \n\t"
+    PTR_ADDU   "$13, %[iTapStride], %[iTapStride] \n\t"
+    PTR_ADDU   "$14, %[iDstStride], %[iDstStride] \n\t"
+    "dli        $15, 0x0020002000200020         \n\t"
+
+    "4:                                         \n\t"
+    "gslqc1     $f2, $f0, 0x0(%[pTap])          \n\t"
+    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
+    "gslqc1     $f6, $f4, 0x0($8)               \n\t"
+    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
+    "gslqc1     $f10, $f8, 0x0(%[pTap])         \n\t"
+    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
+    "gslqc1     $f14, $f12, 0x0($8)             \n\t"
+    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
+    "gslqc1     $f18, $f16, 0x0(%[pTap])        \n\t"
+    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
+    "gslqc1     $f22, $f20, 0x0($8)             \n\t"
+
+    FILTER_VER_ALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
+                     $f22, $f24, $f26, $f28, $f30, %[pDst], $0, $8, $9, $15)
+
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
+    "gslqc1     $f26, $f24, 0x0(%[pTap])        \n\t"
+    "mov.d      $f0, $f4                        \n\t"
+    "mov.d      $f2, $f6                        \n\t"
+    "mov.d      $f4, $f8                        \n\t"
+    "mov.d      $f6, $f10                       \n\t"
+    "mov.d      $f8, $f12                       \n\t"
+    "mov.d      $f10, $f14                      \n\t"
+    "mov.d      $f12, $f16                      \n\t"
+    "mov.d      $f14, $f18                      \n\t"
+    "mov.d      $f16, $f20                      \n\t"
+    "mov.d      $f18, $f22                      \n\t"
+    "mov.d      $f20, $f24                      \n\t"
+    "mov.d      $f22, $f26                      \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_SUBU   "%[pTap], %[pTap], %[iTapStride] \n\t"
+
+    "5:                                         \n\t"
+    FILTER_VER_ALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
+                     $f22, $f24, $f26, $f28, $f30, %[pDst], $0, $8, $9, $15)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 6f                  \n\t"
+    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
+    "gslqc1     $f26, $f24, 0x0(%[pTap])        \n\t"
+
+    FILTER_VER_ALIGN($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
+                     $f26, $f28, $f30, $f0, $f2, %[pDst], %[iDstStride], $8, $9, $15)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 6f                  \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], $14           \n\t"
+    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
+    "gslqc1     $f30, $f28, 0x0($8)             \n\t"
+
+    FILTER_VER_ALIGN($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
+                     $f30, $f0, $f2, $f4, $f6, %[pDst], $0, $8, $9, $15)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 6f                  \n\t"
+    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
+    "gslqc1     $f2, $f0, 0x0(%[pTap])          \n\t"
+
+    FILTER_VER_ALIGN($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
+                     $f2, $f4, $f6, $f8, $f10, %[pDst], %[iDstStride], $8, $9, $15)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 6f                  \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], $14           \n\t"
+    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
+    "gslqc1     $f6, $f4, 0x0($8)               \n\t"
+
+    FILTER_VER_ALIGN($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4,
+                     $f6, $f8, $f10, $f12, $f14, %[pDst], $0, $8, $9, $15)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 6f                  \n\t"
+    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
+    "gslqc1     $f10, $f8, 0x0(%[pTap])         \n\t"
+
+    FILTER_VER_ALIGN($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8,
+                     $f10, $f12, $f14, $f16, $f18, %[pDst], %[iDstStride], $8, $9, $15)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 6f                  \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], $14           \n\t"
+    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
+    "gslqc1     $f14, $f12, 0x0($8)             \n\t"
+
+    FILTER_VER_ALIGN($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12,
+                     $f14, $f16, $f18, $f20, $f22, %[pDst], $0, $8, $9, $15)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 6f                  \n\t"
+    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
+    "gslqc1     $f18, $f16, 0x0(%[pTap])        \n\t"
+
+    FILTER_VER_ALIGN($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16,
+                     $f18, $f20, $f22, $f24, $f26, %[pDst], %[iDstStride], $8, $9, $15)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 6f                  \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], $14           \n\t"
+    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
+    "gslqc1     $f22, $f20, 0x0($8)             \n\t"
+    "j          5b                              \n\t"
+
+    "6:                                         \n\t"
+    PTR_ADDIU  "%[iWidth], %[iWidth], -0x1      \n\t"
+    "beqz       %[iWidth], 7f                   \n\t"
+    "move       %[pTap], $10                    \n\t"
+    "move       %[pDst], $11                    \n\t"
+    "move       %[iHeight], $12                 \n\t"
+    PTR_ADDIU  "%[pTap], %[pTap], 0x10          \n\t"
+    PTR_ADDIU  "%[pDst], %[pDst], 0x8           \n\t"
+    "j          4b                              \n\t"
+    "7:                                         \n\t"
+    : [pTap]"+&r"((unsigned char *)pTap), [pDst]"+&r"((unsigned char *)pDst),
+      [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
+    : [iTapStride]"r"((int)iTapStride), [iDstStride]"r"((int)iDstStride)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0",
+      "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18",
+      "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+static inline void McHorVer22Width8VerLastUnAlign_mmi(const uint8_t *pTap,
+                   int32_t iTapStride, uint8_t * pDst, int32_t iDstStride,
+                   int32_t iWidth, int32_t iHeight) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    "move       $10, %[pTap]                    \n\t"
+    "move       $11, %[pDst]                    \n\t"
+    "move       $12, %[iHeight]                 \n\t"
+    "dsrl       %[iWidth], 0x3                  \n\t"
+    PTR_ADDU   "$13, %[iTapStride], %[iTapStride] \n\t"
+    "dli        $14, 0x0020002000200020         \n\t"
+
+    "4:                                         \n\t"
+    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
+    "gsldlc1    $f0, 0x7(%[pTap])               \n\t"
+    "gsldlc1    $f2, 0xF(%[pTap])               \n\t"
+    "gsldlc1    $f4, 0x7($8)                    \n\t"
+    "gsldlc1    $f6, 0xF($8)                    \n\t"
+    "gsldrc1    $f0, 0x0(%[pTap])               \n\t"
+    "gsldrc1    $f2, 0x8(%[pTap])               \n\t"
+    "gsldrc1    $f4, 0x0($8)                    \n\t"
+    "gsldrc1    $f6, 0x8($8)                    \n\t"
+    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
+    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
+    "gsldlc1    $f8, 0x7(%[pTap])               \n\t"
+    "gsldlc1    $f10, 0xF(%[pTap])              \n\t"
+    "gsldlc1    $f12, 0x7($8)                   \n\t"
+    "gsldlc1    $f14, 0xF($8)                   \n\t"
+    "gsldrc1    $f8, 0x0(%[pTap])               \n\t"
+    "gsldrc1    $f10, 0x8(%[pTap])              \n\t"
+    "gsldrc1    $f12, 0x0($8)                   \n\t"
+    "gsldrc1    $f14, 0x8($8)                   \n\t"
+    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
+    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
+    "gsldlc1    $f16, 0x7(%[pTap])              \n\t"
+    "gsldlc1    $f18, 0xF(%[pTap])              \n\t"
+    "gsldlc1    $f20, 0x7($8)                   \n\t"
+    "gsldlc1    $f22, 0xF($8)                   \n\t"
+    "gsldrc1    $f16, 0x0(%[pTap])              \n\t"
+    "gsldrc1    $f18, 0x8(%[pTap])              \n\t"
+    "gsldrc1    $f20, 0x0($8)                   \n\t"
+    "gsldrc1    $f22, 0x8($8)                   \n\t"
+
+    FILTER_VER_UNALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18,
+                       $f20, $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9, $14)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
+    "gsldlc1    $f24, 0x7(%[pTap])              \n\t"
+    "gsldlc1    $f26, 0xF(%[pTap])              \n\t"
+    "gsldrc1    $f24, 0x0(%[pTap])              \n\t"
+    "gsldrc1    $f26, 0x8(%[pTap])              \n\t"
+    "mov.d      $f0, $f4                        \n\t"
+    "mov.d      $f2, $f6                        \n\t"
+    "mov.d      $f4, $f8                        \n\t"
+    "mov.d      $f6, $f10                       \n\t"
+    "mov.d      $f8, $f12                       \n\t"
+    "mov.d      $f10, $f14                      \n\t"
+    "mov.d      $f12, $f16                      \n\t"
+    "mov.d      $f14, $f18                      \n\t"
+    "mov.d      $f16, $f20                      \n\t"
+    "mov.d      $f18, $f22                      \n\t"
+    "mov.d      $f20, $f24                      \n\t"
+    "mov.d      $f22, $f26                      \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_SUBU   "%[pTap], %[pTap], %[iTapStride] \n\t"
+
+    "5:                                         \n\t"
+    FILTER_VER_UNALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18,
+                       $f20, $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9, $14)
+
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 6f                  \n\t"
+    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
+    "gsldlc1    $f24, 0x7(%[pTap])              \n\t"
+    "gsldlc1    $f26, 0xF(%[pTap])              \n\t"
+    "gsldrc1    $f24, 0x0(%[pTap])              \n\t"
+    "gsldrc1    $f26, 0x8(%[pTap])              \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+
+    FILTER_VER_UNALIGN($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22,
+                       $f24, $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9, $14)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 6f                  \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
+    "gsldlc1    $f28, 0x7($8)                   \n\t"
+    "gsldlc1    $f30, 0xF($8)                   \n\t"
+    "gsldrc1    $f28, 0x0($8)                   \n\t"
+    "gsldrc1    $f30, 0x8($8)                   \n\t"
+
+    FILTER_VER_UNALIGN($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26,
+                       $f28, $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9, $14)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 6f                  \n\t"
+    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
+    "gsldlc1    $f0, 0x7(%[pTap])               \n\t"
+    "gsldlc1    $f2, 0xF(%[pTap])               \n\t"
+    "gsldrc1    $f0, 0x0(%[pTap])               \n\t"
+    "gsldrc1    $f2, 0x8(%[pTap])               \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+
+    FILTER_VER_UNALIGN($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
+                       $f30, $f0, $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9, $14)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 6f                  \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
+    "gsldlc1    $f4, 0x7($8)                    \n\t"
+    "gsldlc1    $f6, 0xF($8)                    \n\t"
+    "gsldrc1    $f4, 0x0($8)                    \n\t"
+    "gsldrc1    $f6, 0x8($8)                    \n\t"
+
+    FILTER_VER_UNALIGN($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2,
+                       $f4, $f6, $f8, $f10, $f12, $f14, %[pDst], $8, $9, $14)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 6f                  \n\t"
+    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
+    "gsldlc1    $f8, 0x7(%[pTap])               \n\t"
+    "gsldlc1    $f10, 0xF(%[pTap])              \n\t"
+    "gsldrc1    $f8, 0x0(%[pTap])               \n\t"
+    "gsldrc1    $f10, 0x8(%[pTap])              \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+
+    FILTER_VER_UNALIGN($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6,
+                       $f8, $f10, $f12, $f14, $f16, $f18, %[pDst], $8, $9, $14)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 6f                  \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
+    "gsldlc1    $f12, 0x7($8)                   \n\t"
+    "gsldlc1    $f14, 0xF($8)                   \n\t"
+    "gsldrc1    $f12, 0x0($8)                   \n\t"
+    "gsldrc1    $f14, 0x8($8)                   \n\t"
+
+    FILTER_VER_UNALIGN($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10,
+                       $f12, $f14, $f16, $f18, $f20, $f22, %[pDst], $8, $9, $14)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 6f                  \n\t"
+    PTR_ADDU   "%[pTap], %[pTap], $13           \n\t"
+    "gsldlc1    $f16, 0x7(%[pTap])              \n\t"
+    "gsldlc1    $f18, 0xF(%[pTap])              \n\t"
+    "gsldrc1    $f16, 0x0(%[pTap])              \n\t"
+    "gsldrc1    $f18, 0x8(%[pTap])              \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+
+    FILTER_VER_UNALIGN($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14,
+                       $f16, $f18, $f20, $f22, $f24, $f26, %[pDst], $8, $9, $14)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 6f                  \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "$8, %[pTap], %[iTapStride]      \n\t"
+    "gsldlc1    $f20, 0x7($8)                   \n\t"
+    "gsldlc1    $f22, 0xF($8)                   \n\t"
+    "gsldrc1    $f20, 0x0($8)                   \n\t"
+    "gsldrc1    $f22, 0x8($8)                   \n\t"
+    "j          5b                              \n\t"
+
+    "6:                                         \n\t"
+    PTR_ADDIU  "%[iWidth], %[iWidth], -0x1      \n\t"
+    "beqz       %[iWidth], 7f                   \n\t"
+    "move       %[pTap], $10                    \n\t"
+    "move       %[pDst], $11                    \n\t"
+    "move       %[iHeight], $12                 \n\t"
+    PTR_ADDIU  "%[pTap], %[pTap], 0x10          \n\t"
+    PTR_ADDIU  "%[pDst], %[pDst], 0x8           \n\t"
+    "j          4b                              \n\t"
+
+    "7:                                         \n\t"
+    : [pTap]"+&r"((unsigned char *)pTap), [pDst]"+&r"((unsigned char *)pDst),
+      [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
+    : [iTapStride]"r"((int)iTapStride), [iDstStride]"r"((int)iDstStride)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
+      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+      "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+// Horizontal and vertical filtering to obtain the half-pel sample, i.e. the (2, 2) position in quarter-pel units.
+static inline void McHorVer22Width5Or9Or17Height5Or9Or17_mmi(const uint8_t* pSrc,
+                   int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                   int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16)
+
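+  // For iWidth 9 or 17, filter horizontally into pTap first (16-bit taps,
+  // stride 48 bytes = 24 taps per row), then filter vertically: the leading
+  // multiple-of-8 columns through the aligned routine and the last eight
+  // columns through the unaligned one. tmp1 is the byte offset of tap
+  // column (iWidth - 8), i.e. 2 * (iWidth - 8).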
+  if (iWidth == 17 || iWidth == 9) {
+    int32_t tmp1 = 2 * (iWidth - 8);
+    McHorVer22HorFirst_mmi(pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5);
+
+    McHorVer22Width8VerLastAlign_mmi((uint8_t*)pTap,  48, pDst, iDstStride, iWidth - 1, iHeight);
+
+    McHorVer22Width8VerLastUnAlign_mmi((uint8_t*)pTap + tmp1,  48, pDst + iWidth - 8,
+                                        iDstStride, 8, iHeight);
+  } else {
+    int16_t iTmp[17 + 5];
+    int32_t i, j, k;
+
+    for (i = 0; i < iHeight; i++) {
+      for (j = 0; j < iWidth + 5; j++) {
+        iTmp[j] = FilterInput8bitWithStride_c (pSrc - 2 + j, iSrcStride);
+      }
+      for (k = 0; k < iWidth; k++) {
+        pDst[k] = WelsClip1 ((HorFilterInput16bit_c (&iTmp[k]) + 512) >> 10);
+      }
+      pSrc += iSrcStride;
+      pDst += iDstStride;
+    }
+  }
+}
+
+void McCopyWidthEq4_mmi(const uint8_t *pSrc, int iSrcStride,
+                        uint8_t *pDst, int iDstStride, int iHeight) {
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    "1:                                         \n\t"
+    "lwl        $8, 0x3(%[pSrc])                \n\t"
+    "lwr        $8, 0x0(%[pSrc])                \n\t"
+    "swl        $8, 0x3(%[pDst])                \n\t"
+    "swr        $8, 0x0(%[pDst])                \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -1      \n\t"
+    "bnez       %[iHeight], 1b                  \n\t"
+    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
+    : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
+    : "memory", "$8"
+  );
+}
+
+void McCopyWidthEq8_mmi(const uint8_t *pSrc, int iSrcStride,
+                        uint8_t *pDst, int iDstStride, int iHeight) {
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    "1:                                         \n\t"
+    "ldl        $8, 0x7(%[pSrc])                \n\t"
+    "ldr        $8, 0x0(%[pSrc])                \n\t"
+    "sdl        $8, 0x7(%[pDst])                \n\t"
+    "sdr        $8, 0x0(%[pDst])                \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -1      \n\t"
+    "bnez       %[iHeight], 1b                  \n\t"
+    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
+    : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
+    : "memory", "$8"
+  );
+}
+
+void McCopyWidthEq16_mmi(const uint8_t *pSrc, int iSrcStride,
+                         uint8_t *pDst, int iDstStride, int iHeight) {
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    "1:                                         \n\t"
+    "ldl        $8, 0x7(%[pSrc])                \n\t"
+    "ldl        $9, 0xF(%[pSrc])                \n\t"
+    "ldr        $8, 0x0(%[pSrc])                \n\t"
+    "ldr        $9, 0x8(%[pSrc])                \n\t"
+    "sdl        $8, 0x7(%[pDst])                \n\t"
+    "sdl        $9, 0xF(%[pDst])                \n\t"
+    "sdr        $8, 0x0(%[pDst])                \n\t"
+    "sdr        $9, 0x8(%[pDst])                \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -1      \n\t"
+    "bnez       %[iHeight], 1b                  \n\t"
+    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
+    : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
+    : "memory", "$8", "$9"
+  );
+}
+
+static inline void McCopy_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+                              int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  if (iWidth == 16)
+    McCopyWidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McCopyWidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 4)
+    McCopyWidthEq4_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else
+    McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+
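+// Chroma MC with 1/8-pel accuracy. For fractional offsets (dx, dy) the four
+// bilinear weights are A = (8-dx)*(8-dy), B = dx*(8-dy), C = (8-dx)*dy and
+// D = dx*dy, and each output sample is (A*p0 + B*p1 + C*p2 + D*p3 + 32) >> 6.
+// The pABCD row (taken from g_kuiABCD) supplies the precomputed weights; the
+// packed 0x0020002000200020 constant and the shift by 6 below implement the
+// rounding.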
+void McChromaWidthEq4_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
+                          int32_t iDstStride, const uint8_t *pABCD, int32_t iHeight) {
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    "gsldlc1    $f6, 0x7(%[pABCD])              \n\t"
+    "gsldrc1    $f6, 0x0(%[pABCD])              \n\t"
+    "xor        $f14, $f14, $f14                \n\t"
+    "punpcklbh  $f6, $f6, $f6                   \n\t"
+    "mov.d      $f8, $f6                        \n\t"
+    "punpcklhw  $f6, $f6, $f6                   \n\t"
+    "punpckhhw  $f8, $f8, $f8                   \n\t"
+    "mov.d      $f10, $f6                       \n\t"
+    "punpcklbh  $f6, $f6, $f14                  \n\t"
+    "punpckhbh  $f10, $f10, $f14                \n\t"
+
+    "mov.d      $f12, $f8                       \n\t"
+    "punpcklbh  $f8, $f8, $f14                  \n\t"
+    "punpckhbh  $f12, $f12, $f14                \n\t"
+    PTR_ADDU   "%[pABCD], %[pSrc], %[iSrcStride] \n\t"
+    "dli        $8, 0x6                         \n\t"
+    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
+    "gsldlc1    $f2, 0x8(%[pSrc])               \n\t"
+    "dmtc1      $8, $f16                        \n\t"
+    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f2, 0x1(%[pSrc])               \n\t"
+    "dli        $8, 0x0020002000200020          \n\t"
+    "punpcklbh  $f0, $f0, $f14                  \n\t"
+    "punpcklbh  $f2, $f2, $f14                  \n\t"
+
+    "dmtc1      $8, $f18                        \n\t"
+    "1:                                         \n\t"
+    "pmullh     $f0, $f0, $f6                   \n\t"
+    "pmullh     $f2, $f2, $f10                  \n\t"
+    "paddh      $f0, $f0, $f2                   \n\t"
+
+    "gsldlc1    $f2, 0x7(%[pABCD])              \n\t"
+    "gsldrc1    $f2, 0x0(%[pABCD])              \n\t"
+    "punpcklbh  $f2, $f2, $f14                  \n\t"
+    "mov.d      $f4, $f2                        \n\t"
+    "pmullh     $f2, $f2, $f8                   \n\t"
+    "paddh      $f0, $f0, $f2                   \n\t"
+    "gsldlc1    $f2, 0x8(%[pABCD])              \n\t"
+    "gsldrc1    $f2, 0x1(%[pABCD])              \n\t"
+    "punpcklbh  $f2, $f2, $f14                  \n\t"
+    "mov.d      $f14, $f2                       \n\t"
+    "pmullh     $f2, $f2, $f12                  \n\t"
+    "paddh      $f0, $f0, $f2                   \n\t"
+    "mov.d      $f2, $f14                       \n\t"
+    "paddh      $f0, $f0, $f18                  \n\t"
+    "psrlh      $f0, $f0, $f16                  \n\t"
+    "xor        $f14, $f14, $f14                \n\t"
+    "packushb   $f0, $f0, $f14                  \n\t"
+    "gsswlc1    $f0, 0x3(%[pDst])               \n\t"
+    "gsswrc1    $f0, 0x0(%[pDst])               \n\t"
+    "mov.d      $f0, $f4                        \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "%[pABCD], %[pABCD], %[iSrcStride] \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -1      \n\t"
+    "bnez       %[iHeight], 1b                  \n\t"
+    : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
+      [pABCD]"+&r"((unsigned char *)pABCD), [iHeight]"+&r"((int)iHeight)
+    : [iSrcStride]"r"((int)iSrcStride), [iDstStride]"r"((int)iDstStride)
+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16", "$f18"
+  );
+}
+
+void McChromaWidthEq8_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
+                          int32_t iDstStride, const uint8_t *pABCD, int32_t iHeight) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    "gsldlc1    $f12, 0x7(%[pABCD])             \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    "gsldrc1    $f12, 0x0(%[pABCD])             \n\t"
+    "punpcklbh  $f12, $f12, $f12                \n\t"
+    "punpckhhw  $f14, $f12, $f12                \n\t"
+    "punpcklhw  $f12, $f12, $f12                \n\t"
+
+    "mov.d      $f16, $f14                      \n\t"
+    "punpckhwd  $f14, $f12, $f12                \n\t"
+    "punpcklwd  $f12, $f12, $f12                \n\t"
+    "punpckhwd  $f18, $f16, $f16                \n\t"
+    "punpcklwd  $f16, $f16, $f16                \n\t"
+    "mov.d      $f20, $f14                      \n\t"
+    "mov.d      $f24, $f18                      \n\t"
+
+    "punpckhbh  $f14, $f12, $f28                \n\t"
+    "punpcklbh  $f12, $f12, $f28                \n\t"
+    "punpckhbh  $f22, $f20, $f28                \n\t"
+    "punpcklbh  $f20, $f20, $f28                \n\t"
+    "punpckhbh  $f18, $f16, $f28                \n\t"
+    "punpcklbh  $f16, $f16, $f28                \n\t"
+    "punpckhbh  $f26, $f24, $f28                \n\t"
+    "punpcklbh  $f24, $f24, $f28                \n\t"
+
+    PTR_ADDU   "%[pABCD], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
+    "gsldlc1    $f4, 0x8(%[pSrc])               \n\t"
+    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f4, 0x1(%[pSrc])               \n\t"
+    "punpckhbh  $f2, $f0, $f28                  \n\t"
+    "punpcklbh  $f0, $f0, $f28                  \n\t"
+    "punpckhbh  $f6, $f4, $f28                  \n\t"
+    "punpcklbh  $f4, $f4, $f28                  \n\t"
+    "1:                                         \n\t"
+    "dli        $8, 0x20                        \n\t"
+    "dmtc1      $8, $f30                        \n\t"
+
+    "pmullh     $f0, $f0, $f12                  \n\t"
+    "pmullh     $f2, $f2, $f14                  \n\t"
+    "pmullh     $f4, $f4, $f20                  \n\t"
+    "pmullh     $f6, $f6, $f22                  \n\t"
+    "paddh      $f0, $f0, $f4                   \n\t"
+    "paddh      $f2, $f2, $f6                   \n\t"
+
+    "gsldlc1    $f4, 0x7(%[pABCD])              \n\t"
+    "gsldrc1    $f4, 0x0(%[pABCD])              \n\t"
+    "punpckhbh  $f6, $f4, $f28                  \n\t"
+    "punpcklbh  $f4, $f4, $f28                  \n\t"
+    "mov.d      $f8, $f4                        \n\t"
+    "mov.d      $f10, $f6                       \n\t"
+    "pmullh     $f4, $f4, $f16                  \n\t"
+    "pmullh     $f6, $f6, $f18                  \n\t"
+    "paddh      $f0, $f0, $f4                   \n\t"
+    "paddh      $f2, $f2, $f6                   \n\t"
+
+    "gsldlc1    $f4, 0x8(%[pABCD])              \n\t"
+    "gsldrc1    $f4, 0x1(%[pABCD])              \n\t"
+    "punpckhbh  $f6, $f4, $f28                  \n\t"
+    "punpcklbh  $f4, $f4, $f28                  \n\t"
+    "mov.d      $f28, $f4                       \n\t"
+    "mov.d      $f30, $f6                       \n\t"
+    "pmullh     $f4, $f4, $f24                  \n\t"
+    "pmullh     $f6, $f6, $f26                  \n\t"
+    "paddh      $f0, $f0, $f4                   \n\t"
+    "paddh      $f2, $f2, $f6                   \n\t"
+    "mov.d      $f4, $f28                       \n\t"
+    "mov.d      $f6, $f30                       \n\t"
+
+    "dli        $8, 0x0020002000200020          \n\t"
+    "dmfc1      $9, $f20                        \n\t"
+    "dmtc1      $8, $f20                        \n\t"
+    "dli        $8, 0x6                         \n\t"
+    "paddh      $f0, $f0, $f20                  \n\t"
+    "paddh      $f2, $f2, $f20                  \n\t"
+    "dmtc1      $8, $f20                        \n\t"
+    "psrlh      $f0, $f0, $f20                  \n\t"
+    "psrlh      $f2, $f2, $f20                  \n\t"
+
+    "xor        $f28, $f28, $f28                \n\t"
+    "packushb   $f0, $f0, $f2                   \n\t"
+    "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
+    "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
+
+    "mov.d      $f0, $f8                        \n\t"
+    "mov.d      $f2, $f10                       \n\t"
+    "dmtc1      $9, $f20                        \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "%[pABCD], %[pABCD], %[iSrcStride] \n\t"
+
+    PTR_ADDIU  "%[iHeight], %[iHeight], -1      \n\t"
+    "bnez       %[iHeight], 1b                  \n\t"
+    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [pABCD]"+&r"(pABCD),
+      [iHeight]"+&r"(iHeight)
+    : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
+    : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+void McChroma_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+                  int32_t iDstStride, int16_t iMvX, int16_t iMvY,
+                  int32_t iWidth, int32_t iHeight) {
+  static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
+    McChromaWidthEq4_mmi,
+    McChromaWidthEq8_mmi
+  };
+  const int32_t kiD8x = iMvX & 0x07;
+  const int32_t kiD8y = iMvY & 0x07;
+  if (kiD8x == 0 && kiD8y == 0) {
+    McCopy_mmi (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+    return;
+  }
+  if (iWidth != 2) {
+    kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride,
+                                      g_kuiABCD[kiD8y][kiD8x], iHeight);
+  } else
+    McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY,
+                          iWidth, iHeight);
+}
+
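+// Horizontal half-pel luma filter, eight pixels per iteration. The 6-tap
+// kernel (1, -5, 20, 20, -5, 1) is evaluated without multiplies: with
+// t = 4*(c+d) - (b+e), the loop computes (a+f) + t + 4*t, which equals
+// a - 5b + 20c + 20d - 5e + f, then adds the packed rounding constant 16
+// (0x0010001000100010) and arithmetic-shifts right by 5 before packing with
+// saturation. Scalar form of one output:
+//   pDst[x] = WelsClip1((pSrc[x-2] - 5*pSrc[x-1] + 20*pSrc[x] +
+//                        20*pSrc[x+1] - 5*pSrc[x+2] + pSrc[x+3] + 16) >> 5);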
+void McHorVer20WidthEq8_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
+                            int iDstStride, int iHeight) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    PTR_ADDIU  "%[pSrc], %[pSrc], -0x2          \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    "dli        $8, 0x0010001000100010          \n\t"
+    "dmtc1      $8, $f24                        \n\t"
+    "dli        $8, 0x2                         \n\t"
+    "dmtc1      $8, $f26                        \n\t"
+    "dli        $8, 0x5                         \n\t"
+    "dmtc1      $8, $f30                        \n\t"
+    "1:                                         \n\t"
+    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
+    "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
+    "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
+    "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
+    "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
+    "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
+    "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
+    "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
+    "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
+    "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
+    "punpckhbh  $f2, $f0, $f28                  \n\t"
+    "punpckhbh  $f6, $f4, $f28                  \n\t"
+    "punpckhbh  $f10, $f8, $f28                 \n\t"
+    "punpckhbh  $f14, $f12, $f28                \n\t"
+    "punpckhbh  $f18, $f16, $f28                \n\t"
+    "punpckhbh  $f22, $f20, $f28                \n\t"
+    "punpcklbh  $f0, $f0, $f28                  \n\t"
+    "punpcklbh  $f4, $f4, $f28                  \n\t"
+    "punpcklbh  $f8, $f8, $f28                  \n\t"
+    "punpcklbh  $f12, $f12, $f28                \n\t"
+    "punpcklbh  $f16, $f16, $f28                \n\t"
+    "punpcklbh  $f20, $f20, $f28                \n\t"
+    "paddh      $f8, $f8, $f12                  \n\t"
+    "paddh      $f10, $f10, $f14                \n\t"
+    "paddh      $f16, $f16, $f20                \n\t"
+    "paddh      $f18, $f18, $f22                \n\t"
+    "psllh      $f16, $f16, $f26                \n\t"
+    "psllh      $f18, $f18, $f26                \n\t"
+    "psubh      $f16, $f16, $f8                 \n\t"
+    "psubh      $f18, $f18, $f10                \n\t"
+    "paddh      $f0, $f0, $f4                   \n\t"
+    "paddh      $f2, $f2, $f6                   \n\t"
+    "paddh      $f0, $f0, $f16                  \n\t"
+    "paddh      $f2, $f2, $f18                  \n\t"
+    "psllh      $f16, $f16, $f26                \n\t"
+    "psllh      $f18, $f18, $f26                \n\t"
+    "paddh      $f0, $f0, $f16                  \n\t"
+    "paddh      $f2, $f2, $f18                  \n\t"
+    "paddh      $f0, $f0, $f24                  \n\t"
+    "paddh      $f2, $f2, $f24                  \n\t"
+    "psrah      $f0, $f0, $f30                  \n\t"
+    "psrah      $f2, $f2, $f30                  \n\t"
+    "packushb   $f0, $f0, $f2                   \n\t"
+    "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
+    "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "bnez       %[iHeight], 1b                  \n\t"
+    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
+    : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+void McHorVer20WidthEq16_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
+                             int iDstStride, int iHeight) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    PTR_ADDIU  "%[pSrc], %[pSrc], -0x2          \n\t"
+    "dli        $8, 0x0010001000100010          \n\t"
+    "dmtc1      $8, $f24                        \n\t"
+    "dli        $8, 0x2                         \n\t"
+    "dmtc1      $8, $f26                        \n\t"
+    "dli        $8, 0x5                         \n\t"
+    "dmtc1      $8, $f30                        \n\t"
+    "1:                                         \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
+    "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
+    "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
+    "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
+    "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
+    "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
+    "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
+    "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
+    "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
+    "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
+    "punpckhbh  $f2, $f0, $f28                  \n\t"
+    "punpckhbh  $f6, $f4, $f28                  \n\t"
+    "punpckhbh  $f10, $f8, $f28                 \n\t"
+    "punpckhbh  $f14, $f12, $f28                \n\t"
+    "punpckhbh  $f18, $f16, $f28                \n\t"
+    "punpckhbh  $f22, $f20, $f28                \n\t"
+    "punpcklbh  $f0, $f0, $f28                  \n\t"
+    "punpcklbh  $f4, $f4, $f28                  \n\t"
+    "punpcklbh  $f8, $f8, $f28                  \n\t"
+    "punpcklbh  $f12, $f12, $f28                \n\t"
+    "punpcklbh  $f16, $f16, $f28                \n\t"
+    "punpcklbh  $f20, $f20, $f28                \n\t"
+    "paddh      $f8, $f8, $f12                  \n\t"
+    "paddh      $f10, $f10, $f14                \n\t"
+    "paddh      $f16, $f16, $f20                \n\t"
+    "paddh      $f18, $f18, $f22                \n\t"
+    "psllh      $f16, $f16, $f26                \n\t"
+    "psllh      $f18, $f18, $f26                \n\t"
+    "psubh      $f16, $f16, $f8                 \n\t"
+    "psubh      $f18, $f18, $f10                \n\t"
+    "paddh      $f0, $f0, $f4                   \n\t"
+    "paddh      $f2, $f2, $f6                   \n\t"
+    "paddh      $f0, $f0, $f16                  \n\t"
+    "paddh      $f2, $f2, $f18                  \n\t"
+    "psllh      $f16, $f16, $f26                \n\t"
+    "psllh      $f18, $f18, $f26                \n\t"
+    "paddh      $f0, $f0, $f16                  \n\t"
+    "paddh      $f2, $f2, $f18                  \n\t"
+    "paddh      $f0, $f0, $f24                  \n\t"
+    "paddh      $f2, $f2, $f24                  \n\t"
+    "psrah      $f0, $f0, $f30                  \n\t"
+    "psrah      $f2, $f2, $f30                  \n\t"
+    "packushb   $f0, $f0, $f2                   \n\t"
+    "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
+    "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
+    "gsldlc1    $f0, 0xF(%[pSrc])               \n\t"
+    "gsldlc1    $f4, 0x14(%[pSrc])              \n\t"
+    "gsldlc1    $f8, 0x10(%[pSrc])              \n\t"
+    "gsldlc1    $f12, 0x13(%[pSrc])             \n\t"
+    "gsldlc1    $f16, 0x11(%[pSrc])             \n\t"
+    "gsldlc1    $f20, 0x12(%[pSrc])             \n\t"
+    "gsldrc1    $f0, 0x8(%[pSrc])               \n\t"
+    "gsldrc1    $f4, 0xd(%[pSrc])               \n\t"
+    "gsldrc1    $f8, 0x9(%[pSrc])               \n\t"
+    "gsldrc1    $f12, 0xc(%[pSrc])              \n\t"
+    "gsldrc1    $f16, 0xa(%[pSrc])              \n\t"
+    "gsldrc1    $f20, 0xb(%[pSrc])              \n\t"
+    "punpckhbh  $f2, $f0, $f28                  \n\t"
+    "punpckhbh  $f6, $f4, $f28                  \n\t"
+    "punpckhbh  $f10, $f8, $f28                 \n\t"
+    "punpckhbh  $f14, $f12, $f28                \n\t"
+    "punpckhbh  $f18, $f16, $f28                \n\t"
+    "punpckhbh  $f22, $f20, $f28                \n\t"
+    "punpcklbh  $f0, $f0, $f28                  \n\t"
+    "punpcklbh  $f4, $f4, $f28                  \n\t"
+    "punpcklbh  $f8, $f8, $f28                  \n\t"
+    "punpcklbh  $f12, $f12, $f28                \n\t"
+    "punpcklbh  $f16, $f16, $f28                \n\t"
+    "punpcklbh  $f20, $f20, $f28                \n\t"
+    "paddh      $f8, $f8, $f12                  \n\t"
+    "paddh      $f10, $f10, $f14                \n\t"
+    "paddh      $f16, $f16, $f20                \n\t"
+    "paddh      $f18, $f18, $f22                \n\t"
+    "psllh      $f16, $f16, $f26                \n\t"
+    "psllh      $f18, $f18, $f26                \n\t"
+    "psubh      $f16, $f16, $f8                 \n\t"
+    "psubh      $f18, $f18, $f10                \n\t"
+    "paddh      $f0, $f0, $f4                   \n\t"
+    "paddh      $f2, $f2, $f6                   \n\t"
+    "paddh      $f0, $f0, $f16                  \n\t"
+    "paddh      $f2, $f2, $f18                  \n\t"
+    "psllh      $f16, $f16, $f26                \n\t"
+    "psllh      $f18, $f18, $f26                \n\t"
+    "paddh      $f0, $f0, $f16                  \n\t"
+    "paddh      $f2, $f2, $f18                  \n\t"
+    "paddh      $f0, $f0, $f24                  \n\t"
+    "paddh      $f2, $f2, $f24                  \n\t"
+    "psrah      $f0, $f0, $f30                  \n\t"
+    "psrah      $f2, $f2, $f30                  \n\t"
+    "packushb   $f0, $f0, $f2                   \n\t"
+    "gssdlc1    $f0, 0xF(%[pDst])               \n\t"
+    "gssdrc1    $f0, 0x8(%[pDst])               \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "bnez       %[iHeight], 1b                  \n\t"
+    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
+    : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+void McHorVer20WidthEq4_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
+                            int iDstStride, int iHeight) {
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    "1:                                         \n\t"
+    PTR_ADDIU  "%[pSrc], %[pSrc], -0x2          \n\t"
+    "xor        $f14, $f14, $f14                \n\t"
+    "dli        $8, 0x0010001000100010          \n\t"
+    "dmtc1      $8, $f12                        \n\t"
+    "1:                                         \n\t"
+    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
+    "gsldlc1    $f2, 0xc(%[pSrc])               \n\t"
+    "gsldlc1    $f4, 0x8(%[pSrc])               \n\t"
+    "gsldlc1    $f6, 0xb(%[pSrc])               \n\t"
+    "gsldlc1    $f8, 0x9(%[pSrc])               \n\t"
+    "gsldlc1    $f10, 0xa(%[pSrc])              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f2, 0x5(%[pSrc])               \n\t"
+    "gsldrc1    $f4, 0x1(%[pSrc])               \n\t"
+    "gsldrc1    $f6, 0x4(%[pSrc])               \n\t"
+    "gsldrc1    $f8, 0x2(%[pSrc])               \n\t"
+    "gsldrc1    $f10, 0x3(%[pSrc])              \n\t"
+    "dli        $8, 0x2                         \n\t"
+    "punpcklbh  $f0, $f0, $f14                  \n\t"
+    "punpcklbh  $f2, $f2, $f14                  \n\t"
+    "punpcklbh  $f4, $f4, $f14                  \n\t"
+    "punpcklbh  $f6, $f6, $f14                  \n\t"
+    "punpcklbh  $f8, $f8, $f14                  \n\t"
+    "punpcklbh  $f10, $f10, $f14                \n\t"
+    "dmtc1      $8, $f16                        \n\t"
+    "paddh      $f4, $f4, $f6                   \n\t"
+    "paddh      $f8, $f8, $f10                  \n\t"
+    "psllh      $f8, $f8, $f16                  \n\t"
+    "psubh      $f8, $f8, $f4                   \n\t"
+    "paddh      $f0, $f0, $f2                   \n\t"
+    "paddh      $f0, $f0, $f8                   \n\t"
+    "dli        $8, 0x5                         \n\t"
+    "psllh      $f8, $f8, $f16                  \n\t"
+    "paddh      $f0, $f0, $f8                   \n\t"
+    "paddh      $f0, $f0, $f12                  \n\t"
+    "dmtc1      $8, $f16                        \n\t"
+    "psrah      $f0, $f0, $f16                  \n\t"
+    "packushb   $f0, $f0, $f14                  \n\t"
+    "gsswlc1    $f0, 0x3(%[pDst])               \n\t"
+    "gsswrc1    $f0, 0x0(%[pDst])               \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "bnez       %[iHeight], 1b                  \n\t"
+    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
+    : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16"
+  );
+}
+
+static inline void McHorVer20_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  if (iWidth == 16)
+    McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else
+    McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+
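+// Vertical half-pel luma filter: the same 6-tap kernel applied down each
+// column. Six source rows are loaded and widened to 16 bits, FILTER_HV_W8
+// emits one output row, and the loop is unrolled eight times with the
+// register file rotated so every iteration loads only the one new row
+// instead of all six.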
+void McHorVer02WidthEq8_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
+                            int iDstStride, int iHeight) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    MMI_LOAD_8P($f0, $f2, $f28, %[pSrc])
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f4, $f6, $f28, $8)
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    MMI_LOAD_8P($f8, $f10, $f28, %[pSrc])
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f12, $f14, $f28, $8)
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    MMI_LOAD_8P($f16, $f18, $f28, %[pSrc])
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f20, $f22, $f28, $8)
+
+    "1:                                         \n\t"
+    FILTER_HV_W8($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
+                 $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 2f                  \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    FILTER_HV_W8($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
+                 $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 2f                  \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f28, $f30, $f0, $8)
+    FILTER_HV_W8($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
+                 $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 2f                  \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    MMI_LOAD_8P($f0, $f2, $f4, %[pSrc])
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    FILTER_HV_W8($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
+                 $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 2f                  \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f4, $f6, $f8, $8)
+    FILTER_HV_W8($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4,
+                 $f6, $f8, $f10, $f12, $f14, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 2f                  \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    MMI_LOAD_8P($f8, $f10, $f12, %[pSrc])
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    FILTER_HV_W8($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8,
+                 $f10, $f12, $f14, $f16, $f18, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 2f                  \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f12, $f14, $f16, $8)
+    FILTER_HV_W8($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12,
+                 $f14, $f16, $f18, $f20, $f22, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 2f                  \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    MMI_LOAD_8P($f16, $f18, $f20, %[pSrc])
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    FILTER_HV_W8($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16,
+                 $f18, $f20, $f22, $f24, $f26, %[pDst], $8, $9)
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "beqz       %[iHeight], 2f                  \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDU   "$8, %[pSrc], %[iSrcStride]      \n\t"
+    MMI_LOAD_8P($f20, $f22, $f24, $8)
+    "j          1b                              \n\t"
+    "2:                                         \n\t"
+    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
+    : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
+    : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+static inline void McHorVer02WidthEq16_mmi(const uint8_t* pSrc, int32_t iSrcStride,
+                   uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  McHorVer02WidthEq8_mmi (pSrc,     iSrcStride, pDst,     iDstStride, iHeight);
+  McHorVer02WidthEq8_mmi (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
+}
+
+static inline void McHorVer02_mmi(const uint8_t* pSrc, int32_t iSrcStride,
+                   uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                   int32_t iHeight) {
+  if (iWidth == 16)
+    McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else
+    McHorVer02_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
+}
+
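+// First (horizontal) pass of the width-8 (2, 2) half-pel case: the same tap
+// math as McHorVer20WidthEq8_mmi but with no rounding or narrowing, so the
+// full 16-bit intermediate sums are stored for the vertical pass, which
+// applies the 6-tap filter again and performs the final (+512) >> 10
+// normalization (cf. the scalar fallback above).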
+void McHorVer22Width8HorFirst_mmi(const uint8_t *pSrc, int32_t iSrcStride,
+     uint8_t *pDst, int32_t iDstStride, int32_t iHeight) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_SUBU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "dli        $8, 0x2                         \n\t"
+    "dmtc1      $8, $f30                        \n\t"
+    "1:                                         \n\t"
+    "xor        $f28, $f28, $f28                \n\t"
+    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
+    "gsldlc1    $f4, 0xc(%[pSrc])               \n\t"
+    "gsldlc1    $f8, 0x8(%[pSrc])               \n\t"
+    "gsldlc1    $f12, 0xb(%[pSrc])              \n\t"
+    "gsldlc1    $f16, 0x9(%[pSrc])              \n\t"
+    "gsldlc1    $f20, 0xa(%[pSrc])              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f4, 0x5(%[pSrc])               \n\t"
+    "gsldrc1    $f8, 0x1(%[pSrc])               \n\t"
+    "gsldrc1    $f12, 0x4(%[pSrc])              \n\t"
+    "gsldrc1    $f16, 0x2(%[pSrc])              \n\t"
+    "gsldrc1    $f20, 0x3(%[pSrc])              \n\t"
+    "punpckhbh  $f2, $f0, $f28                  \n\t"
+    "punpckhbh  $f6, $f4, $f28                  \n\t"
+    "punpckhbh  $f10, $f8, $f28                 \n\t"
+    "punpckhbh  $f14, $f12, $f28                \n\t"
+    "punpckhbh  $f18, $f16, $f28                \n\t"
+    "punpckhbh  $f22, $f20, $f28                \n\t"
+    "punpcklbh  $f0, $f0, $f28                  \n\t"
+    "punpcklbh  $f4, $f4, $f28                  \n\t"
+    "punpcklbh  $f8, $f8, $f28                  \n\t"
+    "punpcklbh  $f12, $f12, $f28                \n\t"
+    "punpcklbh  $f16, $f16, $f28                \n\t"
+    "punpcklbh  $f20, $f20, $f28                \n\t"
+    "paddh      $f8, $f8, $f12                  \n\t"
+    "paddh      $f10, $f10, $f14                \n\t"
+    "paddh      $f16, $f16, $f20                \n\t"
+    "paddh      $f18, $f18, $f22                \n\t"
+    "psllh      $f16, $f16, $f30                \n\t"
+    "psllh      $f18, $f18, $f30                \n\t"
+    "psubh      $f16, $f16, $f8                 \n\t"
+    "psubh      $f18, $f18, $f10                \n\t"
+    "paddh      $f0, $f0, $f4                   \n\t"
+    "paddh      $f2, $f2, $f6                   \n\t"
+    "paddh      $f0, $f0, $f16                  \n\t"
+    "paddh      $f2, $f2, $f18                  \n\t"
+    "psllh      $f16, $f16, $f30                \n\t"
+    "psllh      $f18, $f18, $f30                \n\t"
+    "paddh      $f0, $f0, $f16                  \n\t"
+    "paddh      $f2, $f2, $f18                  \n\t"
+    "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
+    "gssdlc1    $f2, 0xF(%[pDst])               \n\t"
+    "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
+    "gssdrc1    $f2, 0x8(%[pDst])               \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1    \n\t"
+    "bnez       %[iHeight], 1b                  \n\t"
+    : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
+    : [iSrcStride]"r"(iSrcStride),  [iDstStride]"r"(iDstStride)
+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+static inline void McHorVer22WidthEq8_mmi(const uint8_t* pSrc, int32_t iSrcStride,
+                   uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_2D (int16_t, iTap, 21, 8, 16)
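+  // 21 = 16 + 5 rows: up to iHeight = 16 output rows plus the five extra
+  // rows the 6-tap vertical window needs; each row holds 8 taps = 16 bytes,
+  // matching the tap stride passed below.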
+  McHorVer22Width8HorFirst_mmi (pSrc - 2, iSrcStride, (uint8_t*)iTap, 16, iHeight + 5);
+  McHorVer22Width8VerLastAlign_mmi ((uint8_t*)iTap, 16, pDst, iDstStride, 8, iHeight);
+}
+
+static inline void McHorVer22WidthEq16_mmi(const uint8_t* pSrc, int32_t iSrcStride,
+                   uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+  McHorVer22WidthEq8_mmi (pSrc,     iSrcStride, pDst,     iDstStride, iHeight);
+  McHorVer22WidthEq8_mmi (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
+}
+
+static inline void McHorVer22_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+                   int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  if (iWidth == 16)
+    McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else if (iWidth == 8)
+    McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+  else
+    McHorVer22_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
+}
+
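+// Quarter-pel averaging helpers: pavgb computes a per-byte rounding average
+// (a + b + 1) >> 1, so each PixelAvg* routine blends its two prediction
+// sources (e.g. two half-pel planes) in a single pass.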
+void PixelAvgWidthEq4_mmi(uint8_t *pDst,  int iDstStride, const uint8_t *pSrcA,
+     int iSrcAStride, const uint8_t *pSrcB, int iSrcBStride, int iHeight ) {
+  __asm__ volatile (
+    ".set       arch=loongson3a                    \n\t"
+    "1:                                            \n\t"
+    "gsldlc1    $f0, 0x7(%[pSrcB])                 \n\t"
+    "gsldlc1    $f2, 0x7(%[pSrcA])                 \n\t"
+    "gsldrc1    $f0, 0x0(%[pSrcB])                 \n\t"
+    "gsldrc1    $f2, 0x0(%[pSrcA])                 \n\t"
+    "pavgb      $f0, $f0, $f2                      \n\t"
+    "gsswlc1    $f0, 0x3(%[pDst])                  \n\t"
+    "gsswrc1    $f0, 0x0(%[pDst])                  \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1       \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride]    \n\t"
+    PTR_ADDU   "%[pSrcA], %[pSrcA], %[iSrcAStride] \n\t"
+    PTR_ADDU   "%[pSrcB], %[pSrcB], %[iSrcBStride] \n\t"
+    "bnez       %[iHeight], 1b                     \n\t"
+    : [pDst]"+&r"((unsigned char *)pDst), [pSrcA]"+&r"((unsigned char *)pSrcA),
+      [pSrcB]"+&r"((unsigned char *)pSrcB), [iHeight]"+&r"((int)iHeight)
+    : [iDstStride]"r"((int)iDstStride), [iSrcAStride]"r"((int)iSrcAStride),
+      [iSrcBStride]"r"((int)iSrcBStride)
+    : "memory", "$8", "$9", "$10", "$f0", "$f2"
+  );
+}
+
+void PixelAvgWidthEq8_mmi(uint8_t *pDst,  int iDstStride, const uint8_t *pSrcA,
+     int iSrcAStride, const uint8_t *pSrcB, int iSrcBStride, int iHeight ) {
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    "1:                                         \n\t"
+    "gsldlc1    $f0, 0x7(%[pSrcA])              \n\t"
+    "gsldlc1    $f2, 0x7(%[pSrcB])              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSrcA])              \n\t"
+    "gsldrc1    $f2, 0x0(%[pSrcB])              \n\t"
+    "pavgb      $f0, $f0, $f2                   \n\t"
+    PTR_ADDU   "$8, %[pSrcA], %[iSrcAStride]    \n\t"
+    "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
+    PTR_ADDU   "$9, %[pSrcB], %[iSrcBStride]    \n\t"
+    "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
+    "gsldlc1    $f0, 0x7($8)                    \n\t"
+    "gsldlc1    $f2, 0x7($9)                    \n\t"
+    "gsldrc1    $f0, 0x0($8)                    \n\t"
+    "gsldrc1    $f2, 0x0($9)                    \n\t"
+    "pavgb      $f0, $f0, $f2                   \n\t"
+    PTR_ADDU   "$10, %[pDst], %[iDstStride]     \n\t"
+    "gssdlc1    $f0, 0x7($10)                   \n\t"
+    PTR_ADDU   "%[pSrcA], $8, %[iSrcAStride]    \n\t"
+    "gssdrc1    $f0, 0x0($10)                   \n\t"
+    PTR_ADDU   "%[pSrcB], $9, %[iSrcBStride]    \n\t"
+    PTR_ADDU   "%[pDst], $10, %[iDstStride]     \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x2    \n\t"
+    "bnez       %[iHeight], 1b                  \n\t"
+    : [pDst]"+&r"((unsigned char *)pDst), [pSrcA]"+&r"((unsigned char *)pSrcA),
+      [pSrcB]"+&r"((unsigned char *)pSrcB), [iHeight]"+&r"((int)iHeight)
+    : [iDstStride]"r"((int)iDstStride), [iSrcAStride]"r"((int)iSrcAStride),
+      [iSrcBStride]"r"((int)iSrcBStride)
+    : "memory", "$8", "$9", "$10", "$f0", "$f2"
+  );
+}
+
+void PixelAvgWidthEq16_mmi(uint8_t *pDst, int iDstStride, const uint8_t *pSrcA,
+     int iSrcAStride, const uint8_t *pSrcB, int iSrcBStride, int iHeight ) {
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t"
+    "1:                                         \n\t"
+    "gsldlc1    $f0, 0x7(%[pSrcA])              \n\t"
+    "gsldlc1    $f2, 0xF(%[pSrcA])              \n\t"
+    "gsldlc1    $f4, 0x7(%[pSrcB])              \n\t"
+    "gsldlc1    $f6, 0xF(%[pSrcB])              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSrcA])              \n\t"
+    "gsldrc1    $f2, 0x8(%[pSrcA])              \n\t"
+    "gsldrc1    $f4, 0x0(%[pSrcB])              \n\t"
+    "gsldrc1    $f6, 0x8(%[pSrcB])              \n\t"
+    "pavgb      $f0, $f0, $f4                   \n\t"
+    "pavgb      $f2, $f2, $f6                   \n\t"
+    PTR_ADDU   "$8, %[pSrcA], %[iSrcAStride]    \n\t"
+    "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
+    "gssdlc1    $f2, 0xF(%[pDst])               \n\t"
+    "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
+    "gssdrc1    $f2, 0x8(%[pDst])               \n\t"
+    PTR_ADDU   "$9, %[pSrcB], %[iSrcBStride]    \n\t"
+    "gsldlc1    $f0, 0x7($8)                    \n\t"
+    "gsldlc1    $f2, 0xF($8)                    \n\t"
+    "gsldrc1    $f0, 0x0($8)                    \n\t"
+    "gsldrc1    $f2, 0x8($8)                    \n\t"
+    PTR_ADDU   "$10, %[pDst], %[iDstStride]     \n\t"
+    "gsldlc1    $f4, 0x7($9)                    \n\t"
+    "gsldlc1    $f6, 0xF($9)                    \n\t"
+    "gsldrc1    $f4, 0x0($9)                    \n\t"
+    "gsldrc1    $f6, 0x8($9)                    \n\t"
+    "pavgb      $f0, $f0, $f4                   \n\t"
+    "pavgb      $f2, $f2, $f6                   \n\t"
+    "gssdlc1    $f0, 0x7($10)                   \n\t"
+    "gssdlc1    $f2, 0xF($10)                   \n\t"
+    "gssdrc1    $f0, 0x0($10)                   \n\t"
+    "gssdrc1    $f2, 0x8($10)                   \n\t"
+
+    PTR_ADDU   "%[pSrcA], $8, %[iSrcAStride]    \n\t"
+    PTR_ADDU   "%[pSrcB], $9, %[iSrcBStride]    \n\t"
+    PTR_ADDU   "%[pDst], $10, %[iDstStride]     \n\t"
+    "gsldlc1    $f0, 0x7(%[pSrcA])              \n\t"
+    "gsldlc1    $f2, 0xF(%[pSrcA])              \n\t"
+    "gsldlc1    $f4, 0x7(%[pSrcB])              \n\t"
+    "gsldlc1    $f6, 0xF(%[pSrcB])              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSrcA])              \n\t"
+    "gsldrc1    $f2, 0x8(%[pSrcA])              \n\t"
+    "gsldrc1    $f4, 0x0(%[pSrcB])              \n\t"
+    "gsldrc1    $f6, 0x8(%[pSrcB])              \n\t"
+    "pavgb      $f0, $f0, $f4                   \n\t"
+    "pavgb      $f2, $f2, $f6                   \n\t"
+    PTR_ADDU   "$8, %[pSrcA], %[iSrcAStride]    \n\t"
+    PTR_ADDU   "$9, %[pSrcB], %[iSrcBStride]    \n\t"
+    "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
+    "gssdlc1    $f2, 0xF(%[pDst])               \n\t"
+    "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
+    "gssdrc1    $f2, 0x8(%[pDst])               \n\t"
+    "gsldlc1    $f0, 0x7($8)                    \n\t"
+    "gsldlc1    $f2, 0xF($8)                    \n\t"
+    "gsldlc1    $f4, 0x7($9)                    \n\t"
+    "gsldlc1    $f6, 0xF($9)                    \n\t"
+    "gsldrc1    $f0, 0x0($8)                    \n\t"
+    "gsldrc1    $f2, 0x8($8)                    \n\t"
+    "gsldrc1    $f4, 0x0($9)                    \n\t"
+    "gsldrc1    $f6, 0x8($9)                    \n\t"
+    PTR_ADDU   "$10, %[pDst], %[iDstStride]     \n\t"
+    "pavgb      $f0, $f0, $f4                   \n\t"
+    "pavgb      $f2, $f2, $f6                   \n\t"
+    "gssdlc1    $f0, 0x7($10)                   \n\t"
+    "gssdlc1    $f2, 0xF($10)                   \n\t"
+    "gssdrc1    $f0, 0x0($10)                   \n\t"
+    "gssdrc1    $f2, 0x8($10)                   \n\t"
+    PTR_ADDU   "%[pSrcA], $8, %[iSrcAStride]    \n\t"
+    PTR_ADDU   "%[pSrcB], $9, %[iSrcBStride]    \n\t"
+    PTR_ADDU   "%[pDst], $10, %[iDstStride]     \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x4    \n\t"
+    "bnez       %[iHeight], 1b                  \n\t"
+    : [pDst]"+&r"((unsigned char *)pDst), [pSrcA]"+&r"((unsigned char *)pSrcA),
+      [pSrcB]"+&r"((unsigned char *)pSrcB), [iHeight]"+&r"((int)iHeight)
+    : [iDstStride]"r"((int)iDstStride), [iSrcAStride]"r"((int)iSrcAStride),
+      [iSrcBStride]"r"((int)iSrcBStride)
+    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6"
+  );
+}
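+
+/* For reference: all three PixelAvg widths reduce to the same per-byte rule.
+ * pavgb computes the rounded average (a + b + 1) >> 1, which is exactly the
+ * H.264 sample averaging used to merge two predictions.  Scalar sketch
+ * (hypothetical helper, not used by the MMI path): */
+static void PixelAvgRef (uint8_t* pDst, int32_t iDstStride,
+                         const uint8_t* pSrcA, int32_t iSrcAStride,
+                         const uint8_t* pSrcB, int32_t iSrcBStride,
+                         int32_t iWidth, int32_t iHeight) {
+  for (int32_t y = 0; y < iHeight; y++) {
+    for (int32_t x = 0; x < iWidth; x++)
+      pDst[x] = (uint8_t) ((pSrcA[x] + pSrcB[x] + 1) >> 1);
+    pDst += iDstStride;
+    pSrcA += iSrcAStride;
+    pSrcB += iSrcBStride;
+  }
+}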
+
+static inline void McHorVer01_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
+    PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+  } else {
+    McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+  }
+}
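+
+/* For reference: the (0,1) quarter-pel sample is the rounded average of the
+ * integer-pel sample and the vertical half-pel sample, which is why
+ * McHorVer01 above is "half-pel filter into pTmp, then PixelAvg".  Scalar
+ * sketch (hypothetical reference code; assumes a 2-row margin above and a
+ * 3-row margin below pSrc): */
+static void McHorVer01Ref (const uint8_t* pSrc, int32_t iSrcStride,
+                           uint8_t* pDst, int32_t iDstStride,
+                           int32_t iWidth, int32_t iHeight) {
+  for (int32_t y = 0; y < iHeight; y++) {
+    for (int32_t x = 0; x < iWidth; x++) {
+      const uint8_t* p = pSrc + x;
+      int32_t h = p[-2 * iSrcStride] - 5 * p[-iSrcStride] + 20 * p[0]
+                + 20 * p[iSrcStride] - 5 * p[2 * iSrcStride] + p[3 * iSrcStride];
+      h = (h + 16) >> 5;
+      h = h < 0 ? 0 : (h > 255 ? 255 : h);         // half-pel (0,2) sample
+      pDst[x] = (uint8_t) ((p[0] + h + 1) >> 1);   // quarter-pel (0,1)
+    }
+    pSrc += iSrcStride;
+    pDst += iDstStride;
+  }
+}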
+
+static inline void McHorVer03_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
+    PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
+  } else {
+    McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
+  }
+}
+
+static inline void McHorVer10_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
+    PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+  } else {
+    McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
+    PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+  }
+}
+
+static inline void McHorVer11_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else {
+    McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  }
+}
+
+static inline void McHorVer12_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq16_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+    McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+  } else {
+    McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
+    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+  }
+}
+static inline void McHorVer13_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq16_mmi (pSrc,            iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq8_mmi (pSrc,            iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else {
+    McHorVer20WidthEq4_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02_c (pSrc,            iSrcStride, pVerTmp, 16, 4 , iHeight);
+    PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  }
+}
+static inline void McHorVer21_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+  } else {
+    McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+  }
+}
+
+static inline void McHorVer23_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22WidthEq16_mmi (pSrc,            iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22WidthEq8_mmi (pSrc,            iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+  } else {
+    McHorVer20WidthEq4_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer22_c (pSrc,            iSrcStride, pCtrTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+  }
+}
+static inline void McHorVer30_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
+  } else {
+    McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
+  }
+}
+static inline void McHorVer31_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_mmi (pSrc,   iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq16_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq8_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else {
+    McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  }
+}
+static inline void McHorVer32_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer02WidthEq16_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    McHorVer22WidthEq16_mmi (pSrc,   iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq16_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer02WidthEq8_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+    McHorVer22WidthEq8_mmi (pSrc,   iSrcStride, pCtrTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+  } else {
+    McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
+    McHorVer22_c (pSrc,   iSrcStride, pCtrTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+  }
+}
+static inline void McHorVer33_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+                                  int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+  if (iWidth == 16) {
+    McHorVer20WidthEq16_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq16_mmi (pSrc + 1,          iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else if (iWidth == 8) {
+    McHorVer20WidthEq8_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02WidthEq8_mmi (pSrc + 1,          iSrcStride, pVerTmp, 16, iHeight);
+    PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  } else {
+    McHorVer20WidthEq4_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+    McHorVer02_c (pSrc + 1,          iSrcStride, pVerTmp, 16, 4, iHeight);
+    PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+  }
+}
+
+void McLuma_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+  static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
+    {McCopy_mmi,     McHorVer01_mmi, McHorVer02_mmi, McHorVer03_mmi},
+    {McHorVer10_mmi, McHorVer11_mmi, McHorVer12_mmi, McHorVer13_mmi},
+    {McHorVer20_mmi, McHorVer21_mmi, McHorVer22_mmi, McHorVer23_mmi},
+    {McHorVer30_mmi, McHorVer31_mmi, McHorVer32_mmi, McHorVer33_mmi},
+  };
+
+  pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+}
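+
+/* Example dispatch: the low two bits of each MV component give its fractional
+ * quarter-pel phase.  Phase (0,0) is a plain copy, phase 2 on either axis is
+ * a pure half-pel filter, and every other position averages two neighbouring
+ * integer/half-pel samples, which is why most helpers above end in PixelAvg.
+ * With a hypothetical MV of (5, -6): iMvX & 3 == 1 and iMvY & 3 == 2, so
+ *
+ *   McLuma_mmi (pRef, iRefStride, pPred, iPredStride, 5, -6, 16, 16);
+ *
+ * dispatches to McHorVer12_mmi (horizontal quarter phase, vertical half-pel). */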
+
+void PixelAvg_mmi(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                  const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
+  static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = {
+    PixelAvgWidthEq8_mmi,
+    PixelAvgWidthEq16_mmi
+  };
+  kpfFuncs[iWidth >> 4] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
+}
+#endif//HAVE_MMI
 } // anon ns.
 
 void WelsCommon::InitMcFunc (SMcFunc* pMcFuncs, uint32_t uiCpuFlag) {
@@ -1716,4 +4252,15 @@
     pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17Height5Or9Or17_AArch64_neon;//iWidth+1/heigh+1
   }
 #endif
+
+#if defined(HAVE_MMI)
+  if (uiCpuFlag & WELS_CPU_MMI) {
+    pMcFuncs->pfLumaHalfpelHor  = McHorVer20Width5Or9Or17_mmi;
+    pMcFuncs->pfLumaHalfpelVer  = McHorVer02Height5Or9Or17_mmi;
+    pMcFuncs->pfLumaHalfpelCen  = McHorVer22Width5Or9Or17Height5Or9Or17_mmi;
+    pMcFuncs->pfSampleAveraging = PixelAvg_mmi;
+    pMcFuncs->pMcChromaFunc     = McChroma_mmi;
+    pMcFuncs->pMcLumaFunc       = McLuma_mmi;
+  }
+#endif//HAVE_MMI
 }
--- a/codec/decoder/core/inc/decode_mb_aux.h
+++ b/codec/decoder/core/inc/decode_mb_aux.h
@@ -63,6 +63,10 @@
 #endif
 
 
+#if defined(HAVE_MMI)
+void IdctResAddPred_mmi (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
+#endif//HAVE_MMI
+
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
--- a/codec/decoder/core/inc/get_intra_predictor.h
+++ b/codec/decoder/core/inc/get_intra_predictor.h
@@ -166,6 +166,20 @@
 void WelsDecoderIChromaPredPlane_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
 void WelsDecoderIChromaPredDcTop_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
 #endif//HAVE_NEON_AARCH64
+
+#if defined(HAVE_MMI)
+void WelsDecoderI16x16LumaPredDc_mmi (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredPlane_mmi (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredH_mmi (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredV_mmi (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredDcTop_mmi (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredDcNA_mmi (uint8_t* pPred, const int32_t kiStride);
+
+void WelsDecoderIChromaPredDcTop_mmi (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderIChromaPredPlane_mmi (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderIChromaPredDc_mmi (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredH_mmi (uint8_t* pPred, const int32_t kiStride);
+#endif//HAVE_MMI
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
--- /dev/null
+++ b/codec/decoder/core/mips/dct_mmi.c
@@ -1,0 +1,786 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2018, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file    dct_mmi.c
+ *
+ * \brief   Loongson MMI optimizations for the 4x4 IDCT and intra prediction
+ *
+ * \date    17/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+#define LOAD_2_LEFT_AND_ADD                                   \
+  PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t" \
+  "lbu        $9, -0x1(%[pPred])                        \n\t" \
+  PTR_ADDU   "$8, $8, $9                                \n\t" \
+  PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t" \
+  "lbu        $9, -0x1(%[pPred])                        \n\t" \
+  PTR_ADDU   "$8, $8, $9                                \n\t"
+
+unsigned char mmi_dc_0x80[16] __attribute__((aligned(16))) = {
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+
+short mmi_wd_0x02[8] __attribute__((aligned(16))) = {2, 2, 2, 2, 2, 2, 2, 2};
+short mmi_plane_inc_minus[8]__attribute__((aligned(16))) = {-7, -6, -5, -4, -3, -2, -1, 0};
+short mmi_plane_inc[8]__attribute__((aligned(16))) = {1, 2, 3, 4, 5, 6, 7, 8};
+short mmi_plane_dec[8]__attribute__((aligned(16))) = {8, 7, 6, 5, 4, 3, 2, 1};
+
+short mmi_plane_inc_c[4]__attribute__((aligned(16))) = {1, 2, 3, 4};
+short mmi_plane_dec_c[4]__attribute__((aligned(16))) = {4, 3, 2, 1};
+short mmi_plane_mul_b_c[8]__attribute__((aligned(16))) = {-3, -2, -1, 0, 1, 2, 3, 4};
+
+unsigned char mmi_01bytes[16]__attribute__((aligned(16))) = {1, 1, 1, 1, 1, 1, 1, 1,
+                                                             1, 1, 1, 1, 1, 1, 1, 1};
+
+void IdctResAddPred_mmi(uint8_t *pPred, const int32_t kiStride, int16_t *pRs) {
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "dli        $8, 0x1                                   \n\t"
+    "gsldxc1    $f0, 0x0(%[pRs], $0)                      \n\t"
+    "gsldxc1    $f2, 0x8(%[pRs], $0)                      \n\t"
+    "gsldxc1    $f4, 0x10(%[pRs], $0)                     \n\t"
+    "gsldxc1    $f6, 0x18(%[pRs], $0)                     \n\t"
+    "dmtc1      $8, $f14                                  \n\t"
+
+    MMI_Trans4x4H_SINGLE($f0, $f2, $f4, $f6, $f8)
+    MMI_IDCT_SINGLE($f2, $f4, $f6, $f8, $f0, $f12, $f14)
+    MMI_Trans4x4H_SINGLE($f2, $f6, $f0, $f8, $f4)
+    MMI_IDCT_SINGLE($f6, $f0, $f8, $f4, $f2, $f12, $f14)
+
+    "dli        $8, 0x20                                  \n\t"
+    "xor        $f14, $f14, $f14                          \n\t"
+    "dmtc1      $8, $f12                                  \n\t"
+    "pshufh     $f12, $f12, $f14                          \n\t"
+    "dli        $8, 0x6                                   \n\t"
+    "dmtc1      $8, $f16                                  \n\t"
+
+    MMI_StoreDiff4P_SINGLE($f6, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    MMI_StoreDiff4P_SINGLE($f8, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    MMI_StoreDiff4P_SINGLE($f2, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    MMI_StoreDiff4P_SINGLE($f4, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
+    : [pPred]"+&r"((unsigned char *)pPred)
+    : [pRs]"r"((unsigned char *)pRs), [kiStride]"r"((int)kiStride)
+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16"
+  );
+}
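+
+/* For reference: a scalar sketch of the transform above -- the H.264 4x4
+ * inverse transform (row butterflies, then column butterflies, with the
+ * transpose implied by the indexing), (x + 32) >> 6 rounding, then add to
+ * the prediction and clip.  Hypothetical reference code, not the MMI path. */
+static void IdctResAddPredRef (uint8_t *pPred, const int32_t kiStride,
+                               int16_t *pRs) {
+  int32_t t[16], i, j;
+  for (i = 0; i < 4; i++) {                    /* row pass */
+    int32_t s0 = pRs[i * 4], s1 = pRs[i * 4 + 1];
+    int32_t s2 = pRs[i * 4 + 2], s3 = pRs[i * 4 + 3];
+    int32_t e0 = s0 + s2, e1 = s0 - s2;
+    int32_t o0 = (s1 >> 1) - s3, o1 = s1 + (s3 >> 1);
+    t[i * 4] = e0 + o1; t[i * 4 + 1] = e1 + o0;
+    t[i * 4 + 2] = e1 - o0; t[i * 4 + 3] = e0 - o1;
+  }
+  for (i = 0; i < 4; i++) {                    /* column pass + reconstruct */
+    int32_t s0 = t[i], s1 = t[4 + i], s2 = t[8 + i], s3 = t[12 + i];
+    int32_t e0 = s0 + s2, e1 = s0 - s2;
+    int32_t o0 = (s1 >> 1) - s3, o1 = s1 + (s3 >> 1);
+    int32_t r[4];
+    r[0] = e0 + o1; r[1] = e1 + o0; r[2] = e1 - o0; r[3] = e0 - o1;
+    for (j = 0; j < 4; j++) {
+      int32_t v = pPred[j * kiStride + i] + ((r[j] + 32) >> 6);
+      pPred[j * kiStride + i] = (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
+    }
+  }
+}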
+
+void WelsDecoderI16x16LumaPredDc_mmi(uint8_t *pPred, const int32_t kiStride) {
+  __asm__ volatile(
+    ".set       arch=loongson3a                           \n\t"
+    "dli        $8, 0x5                                   \n\t"
+    "gsldxc1    $f10, 0x0(%[mmi_01bytes], $0)             \n\t"
+    "dmtc1      $8, $f8                                   \n\t"
+
+    "move       $10, %[pPred]                             \n\t"
+    PTR_SUBU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gslqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    "xor        $f4, $f4, $f4                             \n\t"
+    "pasubub    $f0, $f0, $f4                             \n\t"
+    "pasubub    $f2, $f2, $f4                             \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f0, $f0, $f2                             \n\t"
+
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "lbu        $8, -0x1(%[pPred])                        \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "lbu        $9, -0x1(%[pPred])                        \n\t"
+    PTR_ADDU   "$8, $8, $9                                \n\t"
+
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+
+    PTR_ADDIU  "$8, $8, 0x10                              \n\t"
+    "dmtc1      $8, $f4                                   \n\t"
+    "paddh      $f0, $f0, $f4                             \n\t"
+    "psrlw      $f0, $f0, $f8                             \n\t"
+    "pmuluw     $f0, $f0, $f10                            \n\t"
+    "punpcklwd  $f0, $f0, $f0                             \n\t"
+    "mov.d      $f2, $f0                                  \n\t"
+
+    "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
+
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
+
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
+
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
+
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
+
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
+
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
+
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssqc1     $f2, $f0, 0x0($10)                        \n\t"
+    : [pPred] "+&r"((unsigned char *)pPred)
+    : [kiStride] "r"((int)kiStride),
+      [mmi_01bytes] "r"((unsigned char *)mmi_01bytes)
+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10"
+  );
+}
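+
+/* For reference: 16x16 DC prediction is the rounded mean of the 16 samples
+ * above and the 16 samples to the left, broadcast over the block -- the code
+ * above accumulates the halves with pasubub/biadd and broadcasts the DC byte
+ * with pmuluw against mmi_01bytes.  Scalar sketch (hypothetical): */
+static void I16x16LumaPredDcRef (uint8_t *pPred, const int32_t kiStride) {
+  int32_t iSum = 16, i, x, y;                  /* +16 is the rounding term */
+  for (i = 0; i < 16; i++)
+    iSum += pPred[-kiStride + i] + pPred[i * kiStride - 1];
+  for (y = 0; y < 16; y++)
+    for (x = 0; x < 16; x++)
+      pPred[y * kiStride + x] = (uint8_t) (iSum >> 5);
+}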
+
+void WelsDecoderI16x16LumaPredPlane_mmi(uint8_t *pPred, const int32_t kiStride) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "move       $10, %[pPred]                             \n\t"
+    PTR_ADDIU  "%[pPred], %[pPred], -0x1                  \n\t"
+    PTR_SUBU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pPred])                        \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"
+    "gsldrc1    $f0, 0x0(%[pPred])                        \n\t"
+    "gslqc1     $f22, $f20, 0x0(%[mmi_plane_dec])         \n\t"
+    "punpckhbh  $f2, $f0, $f28                            \n\t"
+    "punpcklbh  $f0, $f0, $f28                            \n\t"
+    "pmullh     $f0, $f0, $f20                            \n\t"
+    "gsldlc1    $f4, 0x10(%[pPred])                       \n\t"
+    "pmullh     $f2, $f2, $f22                            \n\t"
+    "gsldrc1    $f4, 0x9(%[pPred])                        \n\t"
+    "gslqc1     $f26, $f24, 0x0(%[mmi_plane_inc])         \n\t"
+    "punpckhbh  $f6, $f4, $f28                            \n\t"
+    "punpcklbh  $f4, $f4, $f28                            \n\t"
+    "pmullh     $f4, $f4, $f24                            \n\t"
+    "pmullh     $f6, $f6, $f26                            \n\t"
+    "psubh      $f4, $f4, $f0                             \n\t"
+    "psubh      $f6, $f6, $f2                             \n\t"
+
+    SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
+    "dmfc1      $8, $f4                                   \n\t"
+    "seh        $8, $8                                    \n\t"
+    "mul        $8, $8, 0x5                               \n\t"
+    PTR_ADDIU  "$8, $8, 0x20                              \n\t"
+    "sra        $8, $8, 0x6                               \n\t"
+    MMI_Copy8Times($f4, $f6, $f28, $8)
+
+    "lbu        $9, 0x10(%[pPred])                        \n\t"
+    PTR_ADDIU  "%[pPred], %[pPred], -0x3                  \n\t"
+    LOAD_COLUMN($f0, $f2, $f8, $f10, $f12, $f14, $f16, $f18, %[pPred],
+                %[kiStride], $11)
+
+    PTR_ADDIU  "%[pPred], %[pPred], 0x3                   \n\t"
+    "dsll       $11, %[kiStride], 0x3                     \n\t"
+    PTR_ADDU   "$11, $11, %[pPred]                        \n\t"
+    "lbu        $8, 0x0($11)                              \n\t"
+    PTR_ADDU   "$9, $9, $8                                \n\t"
+    "dsll       $9, $9, 0x4                               \n\t"
+
+    PTR_ADDIU  "%[pPred], %[pPred], -0x3                  \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    LOAD_COLUMN($f28, $f30, $f8, $f10, $f12, $f14, $f16, $f18, %[pPred],
+                %[kiStride], $11)
+
+    "xor        $f16, $f16, $f16                          \n\t"
+    "punpcklbh  $f0, $f2, $f16                            \n\t"
+    "punpckhbh  $f2, $f2, $f16                            \n\t"
+    "pmullh     $f0, $f0, $f20                            \n\t"
+    "pmullh     $f2, $f2, $f22                            \n\t"
+    "punpcklbh  $f28, $f30, $f16                          \n\t"
+    "punpckhbh  $f30, $f30, $f16                          \n\t"
+    "pmullh     $f28, $f28, $f24                          \n\t"
+    "pmullh     $f30, $f30, $f26                          \n\t"
+    "psubh      $f28, $f28, $f0                           \n\t"
+    "psubh      $f30, $f30, $f2                           \n\t"
+
+    "xor        $f8, $f8, $f8                             \n\t"
+
+    SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
+    "dmfc1      $8, $f28                                  \n\t"
+    "seh        $8, $8                                    \n\t"
+
+    "mul        $8, $8, 0x5                               \n\t"
+    PTR_ADDIU  "$8, $8, 0x20                              \n\t"
+    "sra        $8, $8, 0x6                               \n\t"
+    MMI_Copy8Times($f16, $f18, $f8, $8)
+
+    "move       %[pPred], $10                             \n\t"
+    PTR_ADDIU  "$9, $9, 0x10                              \n\t"
+    "mul        $8, $8, -0x7                              \n\t"
+    PTR_ADDU   "$9, $9, $8                                \n\t"
+    MMI_Copy8Times($f0, $f2, $f8, $9)
+
+    "xor        $8, $8, $8                                \n\t"
+    "gslqc1     $f22, $f20, 0x0(%[mmi_plane_inc_minus])   \n\t"
+
+    "dli        $11, 0x5                                  \n\t"
+    "dmtc1      $11, $f30                                 \n\t"
+    "1:                                                   \n\t"
+    "pmullh     $f8, $f4, $f20                            \n\t"
+    "pmullh     $f10, $f6, $f22                           \n\t"
+    "paddh      $f8, $f8, $f0                             \n\t"
+    "paddh      $f10, $f10, $f2                           \n\t"
+    "psrah      $f8, $f8, $f30                            \n\t"
+    "psrah      $f10, $f10, $f30                          \n\t"
+    "pmullh     $f12, $f4, $f24                           \n\t"
+    "pmullh     $f14, $f6, $f26                           \n\t"
+    "paddh      $f12, $f12, $f0                           \n\t"
+    "paddh      $f14, $f14, $f2                           \n\t"
+    "psrah      $f12, $f12, $f30                          \n\t"
+    "psrah      $f14, $f14, $f30                          \n\t"
+    "packushb   $f8, $f8, $f10                            \n\t"
+    "packushb   $f10, $f12, $f14                          \n\t"
+    "gssqc1     $f10, $f8, 0x0(%[pPred])                  \n\t"
+    "paddh      $f0, $f0, $f16                            \n\t"
+    "paddh      $f2, $f2, $f18                            \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    PTR_ADDIU  "$8, $8, 0x1                               \n\t"
+    PTR_ADDIU  "$11, $8, -0x10                            \n\t"
+    "bnez       $11, 1b                                   \n\t"
+    "nop                                                  \n\t"
+    : [pPred]"+&r"((unsigned char *)pPred)
+    : [kiStride]"r"((int)kiStride), [mmi_plane_inc_minus]"r"(mmi_plane_inc_minus),
+      [mmi_plane_inc]"r"(mmi_plane_inc), [mmi_plane_dec]"r"(mmi_plane_dec)
+    : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
+      "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
+      "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
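+
+/* For reference: plane prediction fits pred(x,y) = (a + b*(x-7) + c*(y-7)
+ * + 16) >> 5 through the top row and left column; the mul-by-5, +0x20, >>6
+ * sequence above derives the b and c slopes from the boundary differences.
+ * Scalar sketch (hypothetical reference code): */
+static void I16x16LumaPredPlaneRef (uint8_t *pPred, const int32_t kiStride) {
+  const uint8_t *pTop = pPred - kiStride;
+  int32_t iH = 0, iV = 0, i, x, y;
+  for (i = 1; i <= 8; i++) {
+    iH += i * (pTop[7 + i] - pTop[7 - i]);
+    iV += i * (pPred[(7 + i) * kiStride - 1] - pPred[(7 - i) * kiStride - 1]);
+  }
+  {
+    const int32_t iB = (5 * iH + 32) >> 6;
+    const int32_t iC = (5 * iV + 32) >> 6;
+    const int32_t iA = 16 * (pTop[15] + pPred[15 * kiStride - 1]);
+    for (y = 0; y < 16; y++)
+      for (x = 0; x < 16; x++) {
+        int32_t v = (iA + iB * (x - 7) + iC * (y - 7) + 16) >> 5;
+        pPred[y * kiStride + x] = (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
+      }
+  }
+}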
+
+#define COPY_16_TIMES(r0, f0, f2, f4, f6, f8)                 \
+  "gslqc1     "#f2", "#f0", -0x10("#r0")                \n\t" \
+  "dsrl       "#f0", "#f2", "#f4"                       \n\t" \
+  "pmuluw     "#f0", "#f0", "#f6"                       \n\t" \
+  "punpcklwd  "#f0", "#f0", "#f0"                       \n\t" \
+  "mov.d      "#f2", "#f0"                              \n\t"
+
+#define MMI_PRED_H_16X16_TWO_LINE_DEC                         \
+  PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t" \
+  COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8)            \
+  "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t" \
+  PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t" \
+  COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8)            \
+  "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+
+void WelsDecoderI16x16LumaPredH_mmi(uint8_t *pPred, const int32_t kiStride) {
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "dli        $8, 56                                    \n\t"
+    "dmtc1      $8, $f4                                   \n\t"
+    "gsldxc1    $f6, 0x0(%[mmi_01bytes], $0)              \n\t"
+    "xor        $f8, $f8, $f8                             \n\t"
+
+    COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8)
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8)
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+
+    MMI_PRED_H_16X16_TWO_LINE_DEC
+    MMI_PRED_H_16X16_TWO_LINE_DEC
+    MMI_PRED_H_16X16_TWO_LINE_DEC
+    MMI_PRED_H_16X16_TWO_LINE_DEC
+    MMI_PRED_H_16X16_TWO_LINE_DEC
+    MMI_PRED_H_16X16_TWO_LINE_DEC
+    MMI_PRED_H_16X16_TWO_LINE_DEC
+    : [pPred]"+&r"((unsigned char *)pPred)
+    : [kiStride]"r"((int)kiStride),
+      [mmi_01bytes]"r"((unsigned char *)mmi_01bytes)
+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8"
+  );
+}
+
+void WelsDecoderI16x16LumaPredV_mmi(uint8_t *pPred, const int32_t kiStride) {
+  __asm__ volatile(
+    ".set       arch=loongson3a                           \n\t"
+    PTR_SUBU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gslqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    : [pPred] "+&r"((unsigned char *)pPred)
+    : [kiStride] "r"((int)kiStride)
+    : "memory", "$f0", "$f2"
+  );
+}
+
+void WelsDecoderI16x16LumaPredDcTop_mmi(uint8_t *pPred, const int32_t kiStride) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    PTR_SUBU   "$8, %[pPred], %[kiStride]                 \n\t"
+    "gslqc1     $f2, $f0, 0x0($8)                         \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"
+    "pasubub    $f0, $f0, $f28                            \n\t"
+    "pasubub    $f2, $f2, $f28                            \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f0, $f0, $f2                             \n\t"
+    "dmfc1      $8, $f0                                   \n\t"
+
+    PTR_ADDIU  "$8, $8, 0x8                               \n\t"
+    "dsra       $8, $8, 0x4                               \n\t"
+    MMI_Copy16Times($f4, $f6, $f28, $8)
+    "mov.d      $f0, $f4                                  \n\t"
+    "mov.d      $f2, $f6                                  \n\t"
+
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
+
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
+
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
+
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
+    : [pPred]"+&r"((unsigned char *)pPred)
+    : [kiStride]"r"((int)kiStride)
+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6"
+  );
+  RECOVER_REG;
+}
+
+void WelsDecoderI16x16LumaPredDcNA_mmi(uint8_t *pPred, const int32_t kiStride) {
+  __asm__ volatile(
+    ".set       arch=loongson3a                           \n\t"
+    "gslqc1     $f2, $f0, 0x0(%[mmi_dc_0x80])             \n\t"
+    "mov.d      $f4, $f0                                  \n\t"
+    "mov.d      $f6, $f2                                  \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
+
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
+
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
+
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssqc1     $f6, $f4, 0x0(%[pPred])                   \n\t"
+    : [pPred] "+&r"((unsigned char *)pPred)
+    : [kiStride] "r"((int)kiStride), [mmi_dc_0x80] "r"(mmi_dc_0x80)
+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6"
+  );
+}
+
+void WelsDecoderIChromaPredPlane_mmi(uint8_t *pPred, const int32_t kiStride) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "move       $10, %[pPred]                             \n\t"
+    PTR_ADDIU  "%[pPred], %[pPred], -0x1                  \n\t"
+    PTR_SUBU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pPred])                        \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"
+    "gsldrc1    $f0, 0x0(%[pPred])                        \n\t"
+    "gsldxc1    $f20, 0x0(%[mmi_plane_dec_c], $0)         \n\t"
+    "punpcklbh  $f0, $f0, $f28                            \n\t"
+    "gsldlc1    $f4, 0xc(%[pPred])                        \n\t"
+    "pmullh     $f0, $f0, $f20                            \n\t"
+    "gsldrc1    $f4, 0x5(%[pPred])                        \n\t"
+    "gsldxc1    $f24, 0x0(%[mmi_plane_inc_c], $0)         \n\t"
+    "punpcklbh  $f4, $f4, $f28                            \n\t"
+    "pmullh     $f4, $f4, $f24                            \n\t"
+    "psubh      $f4, $f4, $f0                             \n\t"
+
+    "xor        $f6, $f6, $f6                             \n\t"
+    "xor        $f8, $f8, $f8                             \n\t"
+    SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
+    "dmfc1      $8, $f4                                   \n\t"
+    "seh        $8, $8                                    \n\t"
+    "mul        $8, $8, 0x11                              \n\t"
+    PTR_ADDIU  "$8, $8, 0x10                              \n\t"
+    "sra        $8, $8, 0x5                               \n\t"
+    MMI_Copy8Times($f4, $f6, $f8, $8)
+
+    "lbu        $9, 0x8(%[pPred])                         \n\t"
+    PTR_ADDIU  "%[pPred], %[pPred], -0x3                  \n\t"
+    LOAD_COLUMN_C($f0, $f8, $f12, $f16, %[pPred], %[kiStride], $11)
+
+    PTR_ADDIU  "%[pPred], %[pPred], 0x3                   \n\t"
+    "dsll       $11, %[kiStride], 0x2                     \n\t"
+    PTR_ADDU   "$11, $11, %[pPred]                        \n\t"
+    "lbu        $8, 0x0($11)                              \n\t"
+    PTR_ADDU   "$9, $9, $8                                \n\t"
+    "dsll       $9, $9, 0x4                               \n\t"
+
+    PTR_ADDIU  "%[pPred], %[pPred], -0x3                  \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    LOAD_COLUMN_C($f28, $f8, $f12, $f16, %[pPred], %[kiStride], $11)
+    "xor        $f16, $f16, $f16                          \n\t"
+    "punpckhbh  $f0, $f0, $f16                            \n\t"
+    "pmullh     $f0, $f0, $f20                            \n\t"
+    "punpckhbh  $f28, $f28, $f16                          \n\t"
+    "pmullh     $f28, $f28, $f24                          \n\t"
+    "psubh      $f28, $f28, $f0                           \n\t"
+
+    "xor        $f30, $f30, $f30                          \n\t"
+    "xor        $f8, $f8, $f8                             \n\t"
+    SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
+    "dmfc1      $8, $f28                                  \n\t"
+    "seh        $8, $8                                    \n\t"
+
+    "mul        $8, $8, 0x11                              \n\t"
+    PTR_ADDIU  "$8, $8, 0x10                              \n\t"
+    "sra        $8, $8, 0x5                               \n\t"
+    MMI_Copy8Times($f16, $f18, $f8, $8)
+
+    "move       %[pPred], $10                             \n\t"
+    PTR_ADDIU  "$9, $9, 0x10                              \n\t"
+    "mul        $8, $8, -0x3                              \n\t"
+    PTR_ADDU   "$9, $9, $8                                \n\t"
+    MMI_Copy8Times($f0, $f2, $f8, $9)
+
+    "xor        $8, $8, $8                                \n\t"
+    "gslqc1     $f22, $f20, 0x0(%[mmi_plane_mul_b_c])     \n\t"
+
+    "dli        $11, 0x5                                  \n\t"
+    "dmtc1      $11, $f30                                 \n\t"
+    "1:                                                   \n\t"
+    "pmullh     $f8, $f4, $f20                            \n\t"
+    "pmullh     $f10, $f6, $f22                           \n\t"
+    "paddh      $f8, $f8, $f0                             \n\t"
+    "paddh      $f10, $f10, $f2                           \n\t"
+    "psrah      $f8, $f8, $f30                            \n\t"
+    "psrah      $f10, $f10, $f30                          \n\t"
+    "packushb   $f8, $f8, $f10                            \n\t"
+    "gssdxc1    $f8, 0x0(%[pPred], $0)                    \n\t"
+    "paddh      $f0, $f0, $f16                            \n\t"
+    "paddh      $f2, $f2, $f18                            \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    PTR_ADDIU  "$8, $8, 0x1                               \n\t"
+    PTR_ADDIU  "$11, $8, -0x8                             \n\t"
+    "bnez       $11, 1b                                   \n\t"
+    "nop                                                  \n\t"
+    : [pPred]"+&r"((unsigned char *)pPred)
+    : [kiStride]"r"((int)kiStride), [mmi_plane_mul_b_c]"r"(mmi_plane_mul_b_c),
+      [mmi_plane_inc_c]"r"(mmi_plane_inc_c), [mmi_plane_dec_c]"r"(mmi_plane_dec_c)
+    : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+      "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+void WelsDecoderIChromaPredDc_mmi(uint8_t *pPred, const int32_t kiStride) {
+  __asm__ volatile(
+    ".set       arch=loongson3a                           \n\t"
+    "move       $10, %[pPred]                             \n\t"
+
+    PTR_SUBU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gsldxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
+
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "lbu        $8, -0x1(%[pPred])                        \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "lbu        $9, -0x1(%[pPred])                        \n\t"
+    PTR_ADDU   "$8, $8, $9                                \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "lbu        $9, -0x1(%[pPred])                        \n\t"
+    PTR_ADDU   "$8, $8, $9                                \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "lbu        $9, -0x1(%[pPred])                        \n\t"
+    PTR_ADDU   "$8, $8, $9                                \n\t"
+    "dmtc1      $8, $f2                                   \n\t"
+
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "lbu        $8, -0x1(%[pPred])                        \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "lbu        $9, -0x1(%[pPred])                        \n\t"
+    PTR_ADDU   "$8, $8, $9                                \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "lbu        $9, -0x1(%[pPred])                        \n\t"
+    PTR_ADDU   "$8, $8, $9                                \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "lbu        $9, -0x1(%[pPred])                        \n\t"
+    PTR_ADDU   "$8, $8, $9                                \n\t"
+    "dmtc1      $8, $f4                                   \n\t"
+
+    "xor        $f8, $f8, $f8                             \n\t"
+    "punpcklwd  $f6, $f0, $f8                             \n\t"
+    "punpckhwd  $f0, $f0, $f8                             \n\t"
+    "pasubub    $f0, $f0, $f8                             \n\t"
+    "pasubub    $f6, $f6, $f8                             \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+
+    "paddd      $f6, $f6, $f2                             \n\t"
+    "paddd      $f2, $f4, $f0                             \n\t"
+
+    "dli        $8, 0x2                                   \n\t"
+    "dmtc1      $8, $f8                                   \n\t"
+    "gsldxc1    $f12, 0x0(%[mmi_01bytes], $0)             \n\t"
+    "dli        $8, 0x3                                   \n\t"
+    "dmtc1      $8, $f10                                  \n\t"
+
+    "paddd      $f0, $f0, $f8                             \n\t"
+    "dsrl       $f0, $f0, $f8                             \n\t"
+
+    "paddd      $f4, $f4, $f8                             \n\t"
+    "dsrl       $f4, $f4, $f8                             \n\t"
+
+    "paddd      $f6, $f6, $f8                             \n\t"
+    "paddd      $f6, $f6, $f8                             \n\t"
+    "dsrl       $f6, $f6, $f10                            \n\t"
+
+    "paddd      $f2, $f2, $f8                             \n\t"
+    "paddd      $f2, $f2, $f8                             \n\t"
+    "dsrl       $f2, $f2, $f10                            \n\t"
+
+    "dli        $8, 0x20                                  \n\t"
+    "dmtc1      $8, $f8                                   \n\t"
+    "pmuluw     $f0, $f0, $f12                            \n\t"
+    "pmuluw     $f6, $f6, $f12                            \n\t"
+    "dsll       $f0, $f0, $f8                             \n\t"
+    "xor        $f0, $f0, $f6                             \n\t"
+
+    "pmuluw     $f4, $f4, $f12                            \n\t"
+    "pmuluw     $f2, $f2, $f12                            \n\t"
+    "dsll       $f2, $f2, $f8                             \n\t"
+    "xor        $f2, $f2, $f4                             \n\t"
+
+    "gssdxc1    $f0, 0x0($10, $0)                         \n\t"
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssdxc1    $f0, 0x0($10, $0)                         \n\t"
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssdxc1    $f0, 0x0($10, $0)                         \n\t"
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssdxc1    $f0, 0x0($10, $0)                         \n\t"
+
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssdxc1    $f2, 0x0($10, $0)                         \n\t"
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssdxc1    $f2, 0x0($10, $0)                         \n\t"
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssdxc1    $f2, 0x0($10, $0)                         \n\t"
+    PTR_ADDU   "$10, $10, %[kiStride]                     \n\t"
+    "gssdxc1    $f2, 0x0($10, $0)                         \n\t"
+    : [pPred] "+&r"((unsigned char *)pPred)
+    : [kiStride] "r"((int)kiStride),
+      [mmi_01bytes] "r"((unsigned char *)mmi_01bytes)
+    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+      "$f12"
+  );
+}
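+
+/* For reference: a scalar sketch (illustrative only) of the 8x8 chroma DC
+ * rule the routine above implements.  With sumT0/sumT1 the sums of the
+ * left/right groups of four top neighbours and sumL0/sumL1 the sums of the
+ * upper/lower groups of four left neighbours, each 4x4 quadrant is filled
+ * with its rounded average:
+ *
+ *   uint8_t dc00 = (uint8_t)((sumT0 + sumL0 + 4) >> 3);  // top-left
+ *   uint8_t dc01 = (uint8_t)((sumT1 + 2) >> 2);          // top-right
+ *   uint8_t dc10 = (uint8_t)((sumL1 + 2) >> 2);          // bottom-left
+ *   uint8_t dc11 = (uint8_t)((sumT1 + sumL1 + 4) >> 3);  // bottom-right
+ *
+ * pmuluw against mmi_01bytes (eight bytes of 0x01) splats a DC byte across
+ * a 32-bit lane, so one 8-byte store covers a full row of two quadrants.
+ */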
+
+void WelsDecoderIChromaPredDcTop_mmi(uint8_t *pPred, const int32_t kiStride) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "dli        $8, 0x4e                                  \n\t"
+    "dmtc1      $8, $f16                                  \n\t"
+    "dli        $8, 0xb1                                  \n\t"
+    "dmtc1      $8, $f18                                  \n\t"
+    "dli        $8, 0x2                                   \n\t"
+    "dmtc1      $8, $f20                                  \n\t"
+    PTR_SUBU   "$8, %[pPred], %[kiStride]                 \n\t"
+    "gsldxc1    $f0, 0x0($8, $0)                          \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"
+    "punpckhbh  $f2, $f0, $f28                            \n\t"
+    "punpcklbh  $f0, $f0, $f28                            \n\t"
+    "pshufh     $f4, $f0, $f16                            \n\t"
+    "pshufh     $f6, $f2, $f16                            \n\t"
+    "paddh      $f0, $f0, $f4                             \n\t"
+    "paddh      $f2, $f2, $f6                             \n\t"
+
+    "pshufh     $f8, $f0, $f18                            \n\t"
+    "pshufh     $f14, $f2, $f18                           \n\t"
+    "paddh      $f2, $f2, $f14                            \n\t"
+    "paddh      $f0, $f0, $f8                             \n\t"
+
+    "gslqc1     $f26, $f24, 0x0(%[mmi_wd_0x02])           \n\t"
+    "paddh      $f0, $f0, $f24                            \n\t"
+    "paddh      $f2, $f2, $f26                            \n\t"
+    "psrah      $f0, $f0, $f20                            \n\t"
+    "psrah      $f2, $f2, $f20                            \n\t"
+    "packushb   $f0, $f0, $f2                             \n\t"
+
+    "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
+
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
+    PTR_ADDU   "%[pPred], %[pPred], %[kiStride]           \n\t"
+    "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
+    : [pPred] "+&r"((unsigned char *)pPred)
+    : [kiStride] "r"((int)kiStride), [mmi_wd_0x02] "r"((short *)mmi_wd_0x02)
+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6"
+  );
+  RECOVER_REG;
+}
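+
+/* For reference: a scalar sketch (illustrative only) of the DC-top rule
+ * above.  Each group of four top neighbours t[0..7] yields one DC byte
+ * that fills its 4-column half of all eight rows:
+ *
+ *   uint8_t dcL = (uint8_t)((t[0] + t[1] + t[2] + t[3] + 2) >> 2);
+ *   uint8_t dcR = (uint8_t)((t[4] + t[5] + t[6] + t[7] + 2) >> 2);
+ *
+ * The pshufh pair with controls 0x4e and 0xb1 performs the horizontal add
+ * (swap 32-bit halves, then adjacent halfwords) without a scalar loop.
+ */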
+
+void WelsDecoderI4x4LumaPredH_mmi(uint8_t *pPred, const int32_t kiStride) {
+  __asm__ volatile(
+    ".set       arch=loongson3a                           \n\t"
+    "gsldxc1    $f8, 0x0(%[mmi_01bytes], $0)              \n\t"
+    "lbu        $8, -0x1(%[pPred])                        \n\t"
+    "dmtc1      $8, $f0                                   \n\t"
+    "pmuluw     $f0, $f0, $f8                             \n\t"
+
+    PTR_ADDU   "$9, %[pPred], %[kiStride]                 \n\t"
+    "lbu        $8, -0x1($9)                              \n\t"
+    "dmtc1      $8, $f2                                   \n\t"
+    "pmuluw     $f2, $f2, $f8                             \n\t"
+
+    PTR_ADDU   "$10, $9, %[kiStride]                      \n\t"
+    "lbu        $8, -0x1($10)                             \n\t"
+    "dmtc1      $8, $f4                                   \n\t"
+    "pmuluw     $f4, $f4, $f8                             \n\t"
+
+    PTR_ADDU   "$11, $10, %[kiStride]                     \n\t"
+    "lbu        $8, -0x1($11)                             \n\t"
+    "dmtc1      $8, $f6                                   \n\t"
+    "pmuluw     $f6, $f6, $f8                             \n\t"
+
+    "gsswxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
+    "gsswxc1    $f2, 0x0($9, $0)                          \n\t"
+    "gsswxc1    $f4, 0x0($10, $0)                         \n\t"
+    "gsswxc1    $f6, 0x0($11, $0)                         \n\t"
+    : [pPred] "+&r"((unsigned char *)pPred)
+    : [kiStride] "r"((int)kiStride),
+      [mmi_01bytes] "r"((unsigned char *)mmi_01bytes)
+    : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8"
+  );
+}
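+
+/* For reference: a scalar equivalent (illustrative only) of the 4x4
+ * horizontal prediction above -- every row is filled with its left
+ * neighbour:
+ *
+ *   for (int i = 0; i < 4; i++, pPred += kiStride)
+ *     memset (pPred, pPred[-1], 4);
+ *
+ * The splat is done arithmetically: v * 0x01010101 (pmuluw against
+ * mmi_01bytes) replicates byte v across a 32-bit lane, so each row is a
+ * single gsswxc1 store.
+ */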
--- a/codec/decoder/core/src/decoder.cpp
+++ b/codec/decoder/core/src/decoder.cpp
@@ -1023,6 +1023,23 @@
 #endif
 
 #endif
+
+#if defined(HAVE_MMI)
+  if (uiCpuFlag & WELS_CPU_MMI) {
+    pCtx->pIdctResAddPredFunc   = IdctResAddPred_mmi;
+
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_mmi;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_P]  = WelsDecoderI16x16LumaPredPlane_mmi;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_H]  = WelsDecoderI16x16LumaPredH_mmi;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_V]  = WelsDecoderI16x16LumaPredV_mmi;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_T  ] = WelsDecoderI16x16LumaPredDcTop_mmi;
+    pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_128] = WelsDecoderI16x16LumaPredDcNA_mmi;
+    pCtx->pGetIChromaPredFunc[C_PRED_P ]      = WelsDecoderIChromaPredPlane_mmi;
+    pCtx->pGetIChromaPredFunc[C_PRED_DC]      = WelsDecoderIChromaPredDc_mmi;
+    pCtx->pGetIChromaPredFunc[C_PRED_DC_T]    = WelsDecoderIChromaPredDcTop_mmi;
+    pCtx->pGetI4x4LumaPredFunc[I4_PRED_H]     = WelsDecoderI4x4LumaPredH_mmi;
+  }
+#endif//HAVE_MMI
 }
 
 //reset decoder number related statistics info
--- a/codec/decoder/targets.mk
+++ b/codec/decoder/targets.mk
@@ -53,10 +53,22 @@
 endif
 OBJS += $(DECODER_OBJSARM64)
 
+DECODER_ASM_MIPS_SRCS=\
+	$(DECODER_SRCDIR)/core/mips/dct_mmi.c\
+
+DECODER_OBJSMIPS += $(DECODER_ASM_MIPS_SRCS:.c=.$(OBJ))
+ifeq ($(ASM_ARCH), mips)
+DECODER_OBJS += $(DECODER_OBJSMIPS)
+endif
+OBJS += $(DECODER_OBJSMIPS)
+
 OBJS += $(DECODER_OBJS)
 
 $(DECODER_SRCDIR)/%.$(OBJ): $(DECODER_SRCDIR)/%.cpp
 	$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(DECODER_CFLAGS) $(DECODER_INCLUDES) -c $(CXX_O) $<
+
+$(DECODER_SRCDIR)/%.$(OBJ): $(DECODER_SRCDIR)/%.c
+	$(QUIET_CC)$(CC) $(CFLAGS) $(INCLUDES) $(DECODER_CFLAGS) $(DECODER_INCLUDES) -c $(CXX_O) $<
 
 $(DECODER_SRCDIR)/%.$(OBJ): $(DECODER_SRCDIR)/%.asm
 	$(QUIET_ASM)$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $@ $<
--- /dev/null
+++ b/codec/processing/src/mips/vaa_mmi.c
@@ -1,0 +1,892 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2018, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file    vaa_mmi.c
+ *
+ * \brief   Loongson optimization
+ *
+ * \date    23/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+// macro arguments f4 and f6 must hold the constants 0x1 and 0x8; $f4/$f6 are scratch
+#define WELS_MAX_REG_MMI(f0, f2, f4, f6) \
+  "punpckhwd  $f4, "#f0", "#f0"    \n\t" \
+  "punpckhwd  $f6, "#f2", "#f2"    \n\t" \
+  "pmaxub     "#f0", "#f0", $f4    \n\t" \
+  "pmaxub     "#f2", "#f2", $f6    \n\t" \
+  "pshufh     $f4, "#f0", "#f4"    \n\t" \
+  "pshufh     $f6, "#f2", "#f4"    \n\t" \
+  "pmaxub     "#f0", "#f0", $f4    \n\t" \
+  "pmaxub     "#f2", "#f2", $f6    \n\t" \
+  "dsrl       $f4, "#f0", "#f6"    \n\t" \
+  "dsrl       $f6, "#f2", "#f6"    \n\t" \
+  "pmaxub     "#f0", "#f0", $f4    \n\t" \
+  "pmaxub     "#f2", "#f2", $f6    \n\t"
+
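+/* For reference: WELS_MAX_REG_MMI is a log2 horizontal max over 8 bytes --
+ * a scalar sketch (illustrative only) of what ends up in the low byte:
+ *
+ *   uint8_t m = v[0];
+ *   for (int i = 1; i < 8; i++) if (v[i] > m) m = v[i];
+ *
+ * punpckhwd, pshufh and dsrl fold the vector in half three times (4, 2,
+ * then 1 byte apart) with pmaxub at each step, which is why f4 and f6 must
+ * carry the 0x1 (pshufh control) and 0x8 (shift count) constants.
+ */
+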
+#define WELS_SAD_SD_MAD_16x1_MMI(f0, f2, f4, f6, f8, f10, f12, f14, r0, r1, r2) \
+  "gslqc1     $f6, $f4, 0x0("#r0")                \n\t" \
+  "gslqc1     $f10, $f8, 0x0("#r1")               \n\t" \
+  "pasubub    $f12, $f4, $f0                      \n\t" \
+  "pasubub    $f14, $f6, $f2                      \n\t" \
+  "biadd      $f12, $f12                          \n\t" \
+  "biadd      $f14, $f14                          \n\t" \
+  "paddw      "#f4", "#f4", $f12                  \n\t" \
+  "paddw      "#f6", "#f6", $f14                  \n\t" \
+  "pasubub    $f12, $f8, $f0                      \n\t" \
+  "pasubub    $f14, $f10, $f2                     \n\t" \
+  "biadd      $f12, $f12                          \n\t" \
+  "biadd      $f14, $f14                          \n\t" \
+  "paddw      "#f8", "#f8", $f12                  \n\t" \
+  "paddw      "#f10", "#f10", $f14                \n\t" \
+  "pasubub    $f12, $f4, $f8                      \n\t" \
+  "pasubub    $f14, $f6, $f10                     \n\t" \
+  "pmaxub     "#f12", "#f12", $f12                \n\t" \
+  "pmaxub     "#f14", "#f14", $f14                \n\t" \
+  "pasubub    $f12, $f12, $f0                     \n\t" \
+  "pasubub    $f14, $f14, $f2                     \n\t" \
+  "biadd      $f12, $f12                          \n\t" \
+  "biadd      $f14, $f14                          \n\t" \
+  "paddw      "#f0", "#f0", $f12                  \n\t" \
+  "paddw      "#f2", "#f2", $f14                  \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r2"                 \n\t" \
+  PTR_ADDU   ""#r1", "#r1", "#r2"                 \n\t"
+
+#define WELS_SAD_16x2_MMI(f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, r1, r2, r3) \
+  "gslqc1     "#f1",  "#f2",  0x00("#r1")         \n\t" \
+  "gslqc1     "#f3",  "#f4",  0x00("#r2")         \n\t" \
+  PTR_ADDU    ""#r1", "#r1",  "#r3"               \n\t" \
+  "gslqc1     "#f5",  "#f6",  0x00("#r1")         \n\t" \
+  PTR_ADDU    ""#r2", "#r2",  "#r3"               \n\t" \
+  "gslqc1     "#f7",  "#f8",  0x00("#r2")         \n\t" \
+  "pasubub    "#f1",  "#f1",  "#f3"               \n\t" \
+  "pasubub    "#f2",  "#f2",  "#f4"               \n\t" \
+  "biadd      "#f1",  "#f1"                       \n\t" \
+  "biadd      "#f2",  "#f2"                       \n\t" \
+  "pasubub    "#f5",  "#f5",  "#f7"               \n\t" \
+  "pasubub    "#f6",  "#f6",  "#f8"               \n\t" \
+  "biadd      "#f5",  "#f5"                       \n\t" \
+  "biadd      "#f6",  "#f6"                       \n\t" \
+  "paddw      "#f9",  "#f9",  "#f1"               \n\t" \
+  "paddw      "#f9",  "#f9",  "#f5"               \n\t" \
+  "paddw      "#f10", "#f10", "#f2"               \n\t" \
+  "paddw      "#f10", "#f10", "#f6"               \n\t" \
+  PTR_ADDU    ""#r1", "#r1",  "#r3"               \n\t" \
+  PTR_ADDU    ""#r2", "#r2",  "#r3"               \n\t"
+
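+/* For reference: a scalar sketch (illustrative only) of one WELS_SAD_16x2
+ * step.  Two 16-pixel rows are compared; the per-8-column sums land in the
+ * f9 (columns 8..15) and f10 (columns 0..7) accumulators:
+ *
+ *   for (int y = 0; y < 2; y++, cur += stride, ref += stride)
+ *     for (int x = 0; x < 16; x++)
+ *       sad[x >> 3] += abs (cur[x] - ref[x]);
+ *
+ * pasubub + biadd give each 8-byte absolute-difference sum in two
+ * instructions.
+ */
+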
+#define WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI(r0, r1, r2) \
+  "gslqc1     $f6, $f4, 0x0("#r0")                \n\t" \
+  "gslqc1     $f10, $f8, 0x0("#r1")               \n\t" \
+  "pasubub    $f12, $f4, $f8                      \n\t" \
+  "pasubub    $f14, $f6, $f10                     \n\t" \
+  "biadd      $f12, $f12                          \n\t" \
+  "biadd      $f14, $f14                          \n\t" \
+  "paddw      $f28, $f28, $f12                    \n\t" \
+  "paddw      $f30, $f30, $f14                    \n\t" \
+  "pasubub    $f12, $f4, $f8                      \n\t" \
+  "pasubub    $f14, $f6, $f10                     \n\t" \
+  "pasubub    $f8, $f4, $f0                       \n\t" \
+  "pasubub    $f10, $f6, $f2                      \n\t" \
+  "biadd      $f8, $f8                            \n\t" \
+  "biadd      $f10, $f10                          \n\t" \
+  "paddw      $f24, $f24, $f8                     \n\t" \
+  "paddw      $f26, $f26, $f10                    \n\t" \
+  "punpcklbh  $f8, $f6, $f2                       \n\t" \
+  "punpckhbh  $f10, $f6, $f2                      \n\t" \
+  "punpckhbh  $f6, $f4, $f0                       \n\t" \
+  "punpcklbh  $f4, $f4, $f0                       \n\t" \
+  "pmaddhw    $f4, $f4, $f4                       \n\t" \
+  "pmaddhw    $f6, $f6, $f6                       \n\t" \
+  "pmaddhw    $f8, $f8, $f8                       \n\t" \
+  "pmaddhw    $f10, $f10, $f10                    \n\t" \
+  "paddw      $f20, $f20, $f4                     \n\t" \
+  "paddw      $f22, $f22, $f6                     \n\t" \
+  "paddw      $f20, $f20, $f8                     \n\t" \
+  "paddw      $f22, $f22, $f10                    \n\t" \
+  "punpcklbh  $f4, $f12, $f0                      \n\t" \
+  "punpckhbh  $f6, $f12, $f0                      \n\t" \
+  "punpcklbh  $f12, $f14, $f2                     \n\t" \
+  "punpckhbh  $f14, $f14, $f2                     \n\t" \
+  "pmaddhw    $f4, $f4, $f4                       \n\t" \
+  "pmaddhw    $f6, $f6, $f6                       \n\t" \
+  "pmaddhw    $f12, $f12, $f12                    \n\t" \
+  "pmaddhw    $f14, $f14, $f14                    \n\t" \
+  "paddw      $f16, $f16, $f4                     \n\t" \
+  "paddw      $f18, $f18, $f6                     \n\t" \
+  "paddw      $f16, $f16, $f12                    \n\t" \
+  "paddw      $f18, $f18, $f14                    \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r2"                 \n\t" \
+  PTR_ADDU   ""#r1", "#r1", "#r2"                 \n\t"
+
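+/* For reference: a scalar sketch (illustrative only) of the four per-row
+ * statistics accumulated above for one 16-pixel row ($f28/$f30 sad,
+ * $f24/$f26 sum, $f20/$f22 sqsum, $f16/$f18 sqdiff):
+ *
+ *   for (int x = 0; x < 16; x++) {
+ *     int d   = cur[x] - ref[x];
+ *     sad    += abs (d);
+ *     sum    += cur[x];
+ *     sqsum  += cur[x] * cur[x];
+ *     sqdiff += d * d;
+ *   }
+ *
+ * The squares come from widening bytes to halfwords (punpck*bh) and
+ * pmaddhw, which multiplies and pairwise-adds into 32-bit lanes.
+ */
+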
+#define WELS_SAD_BGD_SQDIFF_16x1_MMI(f0, f2, f4, f6, f8, f10, f12, f14, r0, r1, r2) \
+  "gslqc1     $f6, $f4, 0x0("#r0")                \n\t" \
+  "punpcklbh  $f8, $f4, $f0                       \n\t" \
+  "punpckhbh  $f10, $f4, $f0                      \n\t" \
+  "punpcklbh  $f12, $f6, $f2                      \n\t" \
+  "punpckhbh  $f14, $f6, $f2                      \n\t" \
+  "pmaddhw    $f8, $f8, $f8                       \n\t" \
+  "pmaddhw    $f10, $f10, $f10                    \n\t" \
+  "pmaddhw    $f12, $f12, $f12                    \n\t" \
+  "pmaddhw    $f14, $f14, $f14                    \n\t" \
+  "paddw      $f8, $f8, $f12                      \n\t" \
+  "paddw      $f10, $f10, $f14                    \n\t" \
+  "punpckhwd  $f12, $f0, $f8                      \n\t" \
+  "punpckhwd  $f14, $f0, $f10                     \n\t" \
+  "punpcklwd  $f8, $f0, $f8                       \n\t" \
+  "punpcklwd  $f10, $f0, $f10                     \n\t" \
+  "paddw      $f8, $f8, $f12                      \n\t" \
+  "paddw      $f10, $f10, $f14                    \n\t" \
+  "paddw      "#f0", "#f0", $f8                   \n\t" \
+  "paddw      "#f2", "#f2", $f10                  \n\t" \
+  "gslqc1     $f10, $f8, 0x0("#r1")               \n\t" \
+  "pasubub    $f12, $f4, $f0                      \n\t" \
+  "pasubub    $f14, $f6, $f2                      \n\t" \
+  "biadd      $f12, $f12                          \n\t" \
+  "biadd      $f14, $f14                          \n\t" \
+  "paddw      "#f4", "#f4", $f12                  \n\t" \
+  "paddw      "#f6", "#f6", $f14                  \n\t" \
+  "pasubub    $f12, $f8, $f0                      \n\t" \
+  "pasubub    $f14, $f10, $f2                     \n\t" \
+  "biadd      $f12, $f12                          \n\t" \
+  "biadd      $f14, $f14                          \n\t" \
+  "punpcklwd  $f14, $f14, $f14                    \n\t" \
+  "punpckhwd  $f14, $f12, $f14                    \n\t" \
+  "punpcklwd  $f12, $f0, $f12                     \n\t" \
+  "paddw      "#f4", "#f4", $f12                  \n\t" \
+  "paddw      "#f6", "#f6", $f14                  \n\t" \
+  "pasubub    $f12, $f4, $f8                      \n\t" \
+  "pasubub    $f14, $f6, $f10                     \n\t" \
+  "pmaxub     "#f8", "#f8", $f12                  \n\t" \
+  "pmaxub     "#f10", "#f10", $f14                \n\t" \
+  "paddw      $f4, $f0, $f12                      \n\t" \
+  "paddw      $f6, $f0, $f14                      \n\t" \
+  "pasubub    $f12, $f12, $f0                     \n\t" \
+  "pasubub    $f14, $f14, $f2                     \n\t" \
+  "biadd      $f12, $f12                          \n\t" \
+  "biadd      $f14, $f14                          \n\t" \
+  "paddw      "#f0", "#f0", $f12                  \n\t" \
+  "paddw      "#f2", "#f2", $f14                  \n\t" \
+  "paddw      $f12, $f0, $f4                      \n\t" \
+  "paddw      $f14, $f0, $f6                      \n\t" \
+  "punpcklbh  $f4, $f12, $f0                      \n\t" \
+  "punpckhbh  $f6, $f12, $f0                      \n\t" \
+  "punpcklbh  $f12, $f14, $f2                     \n\t" \
+  "punpckhbh  $f14, $f14, $f2                     \n\t" \
+  "pmaddhw    $f4, $f4, $f4                       \n\t" \
+  "pmaddhw    $f6, $f6, $f6                       \n\t" \
+  "pmaddhw    $f12, $f12, $f12                    \n\t" \
+  "pmaddhw    $f14, $f14, $f14                    \n\t" \
+  "paddw      "#f12", "#f12", $f4                 \n\t" \
+  "paddw      "#f14", "#f14", $f6                 \n\t" \
+  "paddw      "#f12", "#f12", $f12                \n\t" \
+  "paddw      "#f14", "#f14", $f14                \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r2"                 \n\t" \
+  PTR_ADDU   ""#r1", "#r1", "#r2"                 \n\t"
+
+#define WELS_SAD_SUM_SQSUM_16x1_MMI(r0, r1, r2) \
+  "gslqc1     $f6, $f4, 0x0("#r0")                \n\t" \
+  "gslqc1     $f10, $f8, 0x0("#r1")               \n\t" \
+  "pasubub    $f12, $f4, $f8                      \n\t" \
+  "pasubub    $f14, $f6, $f10                     \n\t" \
+  "biadd      $f12, $f12                          \n\t" \
+  "biadd      $f14, $f14                          \n\t" \
+  "paddw      $f24, $f24, $f12                    \n\t" \
+  "paddw      $f26, $f26, $f14                    \n\t" \
+  "pasubub    $f12, $f4, $f0                      \n\t" \
+  "pasubub    $f14, $f6, $f2                      \n\t" \
+  "biadd      $f12, $f12                          \n\t" \
+  "biadd      $f14, $f14                          \n\t" \
+  "paddw      $f20, $f20, $f12                    \n\t" \
+  "paddw      $f22, $f22, $f14                    \n\t" \
+  "punpcklbh  $f8, $f6, $f2                       \n\t" \
+  "punpckhbh  $f10, $f6, $f2                      \n\t" \
+  "punpckhbh  $f6, $f4, $f0                       \n\t" \
+  "punpcklbh  $f4, $f4, $f0                       \n\t" \
+  "pmaddhw    $f4, $f4, $f4                       \n\t" \
+  "pmaddhw    $f6, $f6, $f6                       \n\t" \
+  "pmaddhw    $f8, $f8, $f8                       \n\t" \
+  "pmaddhw    $f10, $f10, $f10                    \n\t" \
+  "paddw      $f16, $f16, $f4                     \n\t" \
+  "paddw      $f18, $f18, $f6                     \n\t" \
+  "paddw      $f16, $f16, $f8                     \n\t" \
+  "paddw      $f18, $f18, $f10                    \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r2"                 \n\t" \
+  PTR_ADDU   ""#r1", "#r1", "#r2"                 \n\t"
+
+void VAACalcSad_mmi(const uint8_t* pCurData, const uint8_t* pRefData,
+                    int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+                    int32_t* pFrameSad, int32_t* pSad8x8) {
+  double ftmp[13];
+  uint64_t tmp[2];
+  mips_reg addr[3];
+
+  __asm__ volatile (
+    ".set       arch=loongson3a                                     \n\t"
+    PTR_SRL    "%[iPicWidth],   %[iPicWidth],   0x04                \n\t"
+    PTR_SRL    "%[iPicHeight],  %[iPicHeight],  0x04                \n\t"
+    "move       %[addr2],       %[iPicStride]                       \n\t"
+    PTR_SLL    "%[iPicStride],  %[iPicStride],  0x04                \n\t"
+    "xor        %[ftmp0],       %[ftmp0],       %[ftmp0]            \n\t"
+    "xor        %[ftmp11],      %[ftmp11],      %[ftmp11]           \n\t"
+    "xor        %[ftmp12],      %[ftmp12],      %[ftmp12]           \n\t"
+    "1:                                                             \n\t"
+    "move       %[addr0],       %[pCurData]                         \n\t"
+    "move       %[addr1],       %[pRefData]                         \n\t"
+    "move       %[tmp0],        %[iPicWidth]                        \n\t"
+    "2:                                                             \n\t"
+    "xor        %[ftmp9],       %[ftmp9],       %[ftmp9]            \n\t"
+    "xor        %[ftmp10],      %[ftmp10],      %[ftmp10]           \n\t"
+    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
+                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
+                      %[addr0], %[addr1], %[addr2])
+    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
+                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
+                      %[addr0], %[addr1], %[addr2])
+    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
+                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
+                      %[addr0], %[addr1], %[addr2])
+    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
+                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
+                      %[addr0], %[addr1], %[addr2])
+    "paddw      %[ftmp11],      %[ftmp11],      %[ftmp9]            \n\t"
+    "paddw      %[ftmp12],      %[ftmp12],      %[ftmp10]           \n\t"
+    "swc1       %[ftmp10],      0x00(%[pSad8x8])                    \n\t"
+    "swc1       %[ftmp9],       0x04(%[pSad8x8])                    \n\t"
+
+    "xor        %[ftmp9],       %[ftmp9],       %[ftmp9]            \n\t"
+    "xor        %[ftmp10],      %[ftmp10],      %[ftmp10]           \n\t"
+    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
+                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
+                      %[addr0], %[addr1], %[addr2])
+    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
+                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
+                      %[addr0], %[addr1], %[addr2])
+    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
+                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
+                      %[addr0], %[addr1], %[addr2])
+    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
+                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
+                      %[addr0], %[addr1], %[addr2])
+    "paddw      %[ftmp11],      %[ftmp11],      %[ftmp9]            \n\t"
+    "paddw      %[ftmp12],      %[ftmp12],      %[ftmp10]           \n\t"
+    "swc1       %[ftmp10],      0x08(%[pSad8x8])                    \n\t"
+    "swc1       %[ftmp9],       0x0c(%[pSad8x8])                    \n\t"
+
+    PTR_ADDU   "%[pSad8x8],     %[pSad8x8],     0x10                \n\t"
+    PTR_SUBU   "%[addr0],       %[addr0],       %[iPicStride]       \n\t"
+    PTR_SUBU   "%[addr1],       %[addr1],       %[iPicStride]       \n\t"
+    PTR_ADDI   "%[tmp0],        %[tmp0],        -0x01               \n\t"
+    PTR_ADDU   "%[addr0],       %[addr0],       0x10                \n\t"
+    PTR_ADDU   "%[addr1],       %[addr1],       0x10                \n\t"
+    "bnez       %[tmp0],        2b                                  \n\t"
+
+    PTR_ADDI   "%[iPicHeight],  %[iPicHeight],  -0x01               \n\t"
+    PTR_ADDU   "%[pCurData],    %[pCurData],    %[iPicStride]       \n\t"
+    PTR_ADDU   "%[pRefData],    %[pRefData],    %[iPicStride]       \n\t"
+    "bnez       %[iPicHeight],  1b                                  \n\t"
+
+    "paddw      %[ftmp11],      %[ftmp11],      %[ftmp12]           \n\t"
+    "swc1       %[ftmp11],      0x00(%[pFrameSad])                  \n\t"
+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+      [ftmp12]"=&f"(ftmp[12]),          [tmp0]"=&r"(tmp[0]),
+      [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+      [pCurData]"+&r"(pCurData),        [pRefData]"+&r"(pRefData),
+      [iPicHeight]"+&r"(iPicHeight),    [iPicWidth]"+&r"(iPicWidth),
+      [pSad8x8]"+&r"(pSad8x8),          [iPicStride]"+&r"(iPicStride),
+      [addr2]"=&r"(addr[2])
+    : [pFrameSad]"r"(pFrameSad)
+    : "memory"
+  );
+}
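+
+/* For reference: a scalar sketch (illustrative only, not the routine the
+ * patch installs) of the VAACalcSad_mmi contract.  The frame is walked in
+ * 16x16 macroblocks; each emits four 8x8 SADs in raster order and the
+ * frame SAD is their total:
+ *
+ *   static void VAACalcSad_ref (const uint8_t* cur, const uint8_t* ref,
+ *                               int32_t w, int32_t h, int32_t stride,
+ *                               int32_t* frame_sad, int32_t* sad8x8) {
+ *     *frame_sad = 0;
+ *     for (int32_t my = 0; my < h; my += 16)
+ *       for (int32_t mx = 0; mx < w; mx += 16)
+ *         for (int32_t b = 0; b < 4; b++) {        // 8x8 blocks, raster order
+ *           const uint8_t* c = cur + (my + (b >> 1) * 8) * stride + mx + (b & 1) * 8;
+ *           const uint8_t* r = ref + (my + (b >> 1) * 8) * stride + mx + (b & 1) * 8;
+ *           int32_t s = 0;
+ *           for (int32_t y = 0; y < 8; y++, c += stride, r += stride)
+ *             for (int32_t x = 0; x < 8; x++)
+ *               s += c[x] > r[x] ? c[x] - r[x] : r[x] - c[x];
+ *           *sad8x8++ = s;
+ *           *frame_sad += s;
+ *         }
+ *   }
+ */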
+
+void VAACalcSadBgd_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
+                       int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+                       int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8,
+                       uint8_t *p_mad8x8) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "move       $15, %[cur_data]                          \n\t"
+    "dsrl       %[iPicWidth], %[iPicWidth], 0x4           \n\t"
+    "dsrl       %[iPicHeight], %[iPicHeight], 0x4         \n\t"
+    "dsll       $13, %[iPicStride], 0x4                   \n\t"
+    "xor        $f0, $f0, $f0                             \n\t"
+    "xor        $f2, $f2, $f2                             \n\t"
+    "xor        $14, $14, $14                             \n\t"
+    "1:                                                   \n\t"
+    "move       $9, %[iPicWidth]                          \n\t"
+    "move       $10, $15                                  \n\t"
+    "move       $11, %[ref_data]                          \n\t"
+    "2:                                                   \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"
+    "xor        $f30, $f30, $f30                          \n\t"
+    "xor        $f24, $f24, $f24                          \n\t"
+    "xor        $f26, $f26, $f26                          \n\t"
+    "xor        $f20, $f20, $f20                          \n\t"
+    "xor        $f22, $f22, $f22                          \n\t"
+    "xor        $f16, $f16, $f16                          \n\t"
+    "xor        $f18, $f18, $f18                          \n\t"
+    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+                             $15, %[ref_data], %[iPicStride])
+    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+                             $15, %[ref_data], %[iPicStride])
+    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+                             $15, %[ref_data], %[iPicStride])
+    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+                             $15, %[ref_data], %[iPicStride])
+    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+                             $15, %[ref_data], %[iPicStride])
+    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+                             $15, %[ref_data], %[iPicStride])
+    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+                             $15, %[ref_data], %[iPicStride])
+    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+                             $15, %[ref_data], %[iPicStride])
+
+    "dli        $8, 0x1                                   \n\t"
+    "dmtc1      $8, $f8                                   \n\t"
+    "dli        $8, 0x8                                   \n\t"
+    "dmtc1      $8, $f10                                  \n\t"
+    WELS_MAX_REG_MMI($f16, $f18, $f8, $f10)
+
+    "dmfc1      $8, $f16                                  \n\t"
+    "sb         $8, 0x0(%[p_mad8x8])                      \n\t"
+    "dmfc1      $8, $f18                                  \n\t"
+    "sb         $8, 0x1(%[p_mad8x8])                      \n\t"
+    PTR_ADDIU  "%[p_mad8x8], %[p_mad8x8], 0x2             \n\t"
+
+    "xor        $f16, $f16, $f16                          \n\t"
+    "xor        $f18, $f18, $f18                          \n\t"
+    "punpcklwd  $f30, $f30, $f30                          \n\t"
+    "punpcklwd  $f26, $f26, $f26                          \n\t"
+    "punpcklwd  $f22, $f22, $f22                          \n\t"
+
+    "punpckhwd  $f30, $f28, $f30                          \n\t"
+    "punpckhwd  $f26, $f24, $f26                          \n\t"
+    "punpckhwd  $f22, $f20, $f22                          \n\t"
+
+    "punpcklwd  $f28, $f16, $f28                          \n\t"
+    "punpcklwd  $f24, $f16, $f24                          \n\t"
+    "punpcklwd  $f20, $f16, $f20                          \n\t"
+
+    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+                             $15, %[ref_data], %[iPicStride])
+    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+                             $15, %[ref_data], %[iPicStride])
+    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+                             $15, %[ref_data], %[iPicStride])
+    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+                             $15, %[ref_data], %[iPicStride])
+    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+                             $15, %[ref_data], %[iPicStride])
+    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+                             $15, %[ref_data], %[iPicStride])
+    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+                             $15, %[ref_data], %[iPicStride])
+    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+                             $15, %[ref_data], %[iPicStride])
+
+    "dli        $8, 0x1                                   \n\t"
+    "dmtc1      $8, $f8                                   \n\t"
+    "dli        $8, 0x8                                   \n\t"
+    "dmtc1      $8, $f10                                  \n\t"
+    WELS_MAX_REG_MMI($f16, $f18, $f8, $f10)
+
+    "dmfc1      $8, $f16                                  \n\t"
+    "sb         $8, 0x0(%[p_mad8x8])                      \n\t"
+    "dmfc1      $8, $f18                                  \n\t"
+    "sb         $8, 0x1(%[p_mad8x8])                      \n\t"
+    "punpckhwd  $f4, $f28, $f30                           \n\t"
+    PTR_ADDIU  "%[p_mad8x8], %[p_mad8x8], 0x2             \n\t"
+
+    "punpcklwd  $f6, $f28, $f30                           \n\t"
+    "gssqc1     $f6, $f4, 0x0(%[psad8x8])                 \n\t"
+    PTR_ADDIU  "%[psad8x8], %[psad8x8], 0x10              \n\t"
+
+    "paddw      $f6, $f6, $f30                            \n\t"
+    "paddw      $f4, $f4, $f28                            \n\t"
+    "punpckhwd  $f8, $f6, $f6                             \n\t"
+    "paddw      $f4, $f4, $f8                             \n\t"
+    "dmtc1      $14, $f6                                  \n\t"
+    "paddw      $f6, $f6, $f4                             \n\t"
+    "dmfc1      $14, $f6                                  \n\t"
+
+    "psubw      $f24, $f24, $f20                          \n\t"
+    "psubw      $f26, $f26, $f22                          \n\t"
+    "punpckhwd  $f4, $f24, $f26                           \n\t"
+    "punpcklwd  $f6, $f24, $f26                           \n\t"
+    "gssqc1     $f6, $f4, 0x0(%[p_sd8x8])                 \n\t"
+    PTR_ADDIU  "%[p_sd8x8], %[p_sd8x8], 0x10              \n\t"
+
+    PTR_SUBU   "$15, $15, $13                             \n\t"
+    PTR_SUBU   "%[ref_data], %[ref_data], $13             \n\t"
+    PTR_ADDIU  "$15, $15, 0x10                            \n\t"
+    PTR_ADDIU  "%[ref_data], %[ref_data], 0x10            \n\t"
+
+    PTR_ADDIU  "%[iPicWidth], %[iPicWidth], -0x1          \n\t"
+    "bnez       %[iPicWidth], 2b                          \n\t"
+    "move       %[iPicWidth], $9                          \n\t"
+    "move       $15, $10                                  \n\t"
+    "move       %[ref_data], $11                          \n\t"
+    PTR_ADDU   "$15, $15, $13                             \n\t"
+    PTR_ADDU   "%[ref_data], %[ref_data], $13             \n\t"
+
+    PTR_ADDIU  "%[iPicHeight], %[iPicHeight], -0x1        \n\t"
+    "bnez       %[iPicHeight], 1b                         \n\t"
+
+    "swl        $14, 0x3(%[psadframe])                    \n\t"
+    "swr        $14, 0x0(%[psadframe])                    \n\t"
+    : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
+      [iPicHeight]"+&r"((int)iPicHeight), [psad8x8]"+&r"((int *)psad8x8),
+      [p_sd8x8]"+&r"((int *)p_sd8x8), [p_mad8x8]"+&r"((unsigned char *)p_mad8x8)
+    : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
+      [psadframe]"r"((int *)psadframe)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
+      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
+      "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
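+
+/* For reference: besides the 8x8 SADs, the Bgd variant fills a signed sum
+ * difference and a max absolute difference per 8x8 block -- a scalar
+ * sketch (illustrative only):
+ *
+ *   int32_t sd  = 0;   // p_sd8x8:  sum(cur) - sum(ref)
+ *   int32_t mad = 0;   // p_mad8x8: max |cur - ref|
+ *   for (int y = 0; y < 8; y++)
+ *     for (int x = 0; x < 8; x++) {
+ *       int d = cur[y * stride + x] - ref[y * stride + x];
+ *       sd += d;
+ *       if (abs (d) > mad) mad = abs (d);
+ *     }
+ *
+ * WELS_MAX_REG_MMI reduces the per-lane maxima to the single byte stored
+ * with sb.
+ */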
+
+void VAACalcSadSsd_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
+                       int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+                       int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
+                       int32_t *psqsum16x16, int32_t *psqdiff16x16) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "move       $15, %[cur_data]                          \n\t"
+    "dsrl       %[iPicWidth], %[iPicWidth], 0x4           \n\t"
+    "dsrl       %[iPicHeight], %[iPicHeight], 0x4         \n\t"
+    "dsll       $13, %[iPicStride], 0x4                   \n\t"
+    "xor        $f0, $f0, $f0                             \n\t"
+    "xor        $f2, $f2, $f2                             \n\t"
+    "xor        $12, $12, $12                             \n\t"
+    "xor        $14, $14, $14                             \n\t"
+    "1:                                                   \n\t"
+    "move       $9, %[iPicWidth]                          \n\t"
+    "move       $10, $15                                  \n\t"
+    "move       $11, %[ref_data]                          \n\t"
+    "2:                                                   \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"
+    "xor        $f30, $f30, $f30                          \n\t"
+    "xor        $f24, $f24, $f24                          \n\t"
+    "xor        $f26, $f26, $f26                          \n\t"
+    "xor        $f20, $f20, $f20                          \n\t"
+    "xor        $f22, $f22, $f22                          \n\t"
+    "xor        $f16, $f16, $f16                          \n\t"
+    "xor        $f18, $f18, $f18                          \n\t"
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+    "dmfc1      $8, $f28                                  \n\t"
+    "sw         $8, 0x0(%[psad8x8])                       \n\t"
+    "dmfc1      $8, $f30                                  \n\t"
+    "sw         $8, 0x4(%[psad8x8])                       \n\t"
+    "paddw      $f4, $f28, $f30                           \n\t"
+    "dmfc1      $12, $f4                                  \n\t"
+    PTR_ADDU   "$14, $14, $12                             \n\t"
+
+    "xor        $f28, $f28, $f28                          \n\t"
+    "xor        $f30, $f30, $f30                          \n\t"
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+    "dmfc1      $8, $f28                                  \n\t"
+    "sw         $8, 0x8(%[psad8x8])                       \n\t"
+    "dmfc1      $8, $f30                                  \n\t"
+    "paddw      $f4, $f28, $f30                           \n\t"
+    "sw         $8, 0xc(%[psad8x8])                       \n\t"
+    "dmfc1      $12, $f4                                  \n\t"
+    PTR_ADDU   "$14, $14, $12                             \n\t"
+    PTR_ADDIU  "%[psad8x8],   %[psad8x8],   0x10          \n\t"
+
+    "paddw      $f24, $f24, $f26                          \n\t"
+    "dmfc1      $8, $f24                                  \n\t"
+    "sw         $8, 0x0(%[psum16x16])                     \n\t"
+    PTR_ADDIU  "%[psum16x16], %[psum16x16], 0x4           \n\t"
+    "paddw      $f24, $f20, $f22                          \n\t"
+    "punpcklwd  $f20, $f24, $f24                          \n\t"
+    "punpckhwd  $f22, $f24, $f24                          \n\t"
+    "paddw      $f20, $f20, $f22                          \n\t"
+    "dmfc1      $8, $f20                                  \n\t"
+    "sw         $8, 0x0(%[psqsum16x16])                   \n\t"
+    PTR_ADDIU  "%[psqsum16x16], %[psqsum16x16], 0x4       \n\t"
+
+    "paddw      $f20, $f16, $f18                          \n\t"
+    "punpcklwd  $f16, $f20, $f20                          \n\t"
+    "punpckhwd  $f18, $f20, $f20                          \n\t"
+    "paddw      $f16, $f16, $f18                          \n\t"
+    "dmfc1      $8, $f16                                  \n\t"
+    "sw         $8, 0x0(%[psqdiff16x16])                  \n\t"
+    PTR_ADDIU  "%[psqdiff16x16], %[psqdiff16x16], 0x4     \n\t"
+
+    PTR_SUBU   "$15, $15, $13                             \n\t"
+    PTR_SUBU   "%[ref_data], %[ref_data], $13             \n\t"
+    PTR_ADDIU  "$15, $15, 0x10                            \n\t"
+    PTR_ADDIU  "%[ref_data], %[ref_data], 0x10            \n\t"
+
+    PTR_ADDIU  "%[iPicWidth], %[iPicWidth], -0x1          \n\t"
+    "bnez       %[iPicWidth], 2b                          \n\t"
+    "nop                                                  \n\t"
+    "move       %[iPicWidth], $9                          \n\t"
+    "move       $15, $10                                  \n\t"
+    "move       %[ref_data], $11                          \n\t"
+    PTR_ADDU   "$15, $15, $13                             \n\t"
+    PTR_ADDU   "%[ref_data], %[ref_data], $13             \n\t"
+
+    PTR_ADDIU  "%[iPicHeight], %[iPicHeight], -0x1        \n\t"
+    "bnez       %[iPicHeight], 1b                         \n\t"
+    "nop                                                  \n\t"
+
+    "sw         $14, 0x0(%[psadframe])                    \n\t"
+    : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
+      [iPicHeight]"+&r"((int)iPicHeight), [psum16x16]"+&r"((int *)psum16x16),
+      [psqsum16x16]"+&r"((int *)psqsum16x16), [psqdiff16x16]"+&r"((int *)psqdiff16x16)
+    : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
+      [psadframe]"r"((int *)psadframe), [psad8x8]"r"((int *)psad8x8)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
+      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
+      "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
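+
+/* For reference: the Ssd variant additionally emits, per 16x16 macroblock,
+ * psum16x16 (sum of cur pixels), psqsum16x16 (sum of their squares) and
+ * psqdiff16x16 (sum of squared differences) -- a scalar sketch
+ * (illustrative only):
+ *
+ *   int32_t sum = 0, sqsum = 0, sqdiff = 0;
+ *   for (int y = 0; y < 16; y++)
+ *     for (int x = 0; x < 16; x++) {
+ *       int c = cur[y * stride + x], d = c - ref[y * stride + x];
+ *       sum += c; sqsum += c * c; sqdiff += d * d;
+ *     }
+ */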
+
+void VAACalcSadSsdBgd_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
+                          int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+                          int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
+                          int32_t *psqsum16x16, int32_t *psqdiff16x16, int32_t *p_sd8x8,
+                          uint8_t *p_mad8x8) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "move       $15, %[cur_data]                          \n\t"
+    "dsrl       %[iPicWidth], %[iPicWidth], 0x4           \n\t"
+    "dsrl       %[iPicHeight], %[iPicHeight], 0x4         \n\t"
+    "dsll       $13, %[iPicStride], 0x4                   \n\t"
+    "xor        $f0, $f0, $f0                             \n\t"
+    "xor        $f2, $f2, $f2                             \n\t"
+    "xor        $12, $12, $12                             \n\t"
+    "xor        $14, $14, $14                             \n\t"
+    "1:                                                   \n\t"
+    "move       $9, %[iPicWidth]                          \n\t"
+    "move       $10, $15                                  \n\t"
+    "move       $11, %[ref_data]                          \n\t"
+    "2:                                                   \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"
+    "xor        $f30, $f30, $f30                          \n\t"
+    "xor        $f24, $f24, $f24                          \n\t"
+    "xor        $f26, $f26, $f26                          \n\t"
+    "xor        $f20, $f20, $f20                          \n\t"
+    "xor        $f22, $f22, $f22                          \n\t"
+    "xor        $f16, $f16, $f16                          \n\t"
+    "xor        $f18, $f18, $f18                          \n\t"
+    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+                                 $f18, $15, %[ref_data], %[iPicStride])
+    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+                                 $f18, $15, %[ref_data], %[iPicStride])
+    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+                                 $f18, $15, %[ref_data], %[iPicStride])
+    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+                                 $f18, $15, %[ref_data], %[iPicStride])
+    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+                                 $f18, $15, %[ref_data], %[iPicStride])
+    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+                                 $f18, $15, %[ref_data], %[iPicStride])
+    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+                                 $f18, $15, %[ref_data], %[iPicStride])
+    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+                                 $f18, $15, %[ref_data], %[iPicStride])
+
+    "dmfc1      $8, $f28                                  \n\t"
+    "sw         $8, 0x0(%[psad8x8])                       \n\t"
+    "dmfc1      $8, $f30                                  \n\t"
+    "sw         $8, 0x4(%[psad8x8])                       \n\t"
+    PTR_ADDIU  "%[psad8x8], %[psad8x8], 0x8               \n\t"
+
+    "paddw      $f4, $f28, $f30                           \n\t"
+    "dmfc1      $12, $f4                                  \n\t"
+    PTR_ADDU   "$14, $14,  $12                            \n\t"
+
+    "paddw      $f4, $f24, $f26                           \n\t"
+    "dmfc1      $8, $f4                                   \n\t"
+    "sw         $8, 0x0(%[psum16x16])                     \n\t"
+
+    "punpckhwd  $f4, $f24, $f26                           \n\t"
+    "punpcklwd  $f6, $f24, $f26                           \n\t"
+    "psubw      $f6, $f6, $f4                             \n\t"
+    "dmfc1      $8, $f6                                   \n\t"
+    PTR_S      "$8, 0x0(%[p_sd8x8])                       \n\t"
+    PTR_ADDIU  "%[p_sd8x8], %[p_sd8x8], 0x8               \n\t"
+
+    "dli        $8, 0x1                                   \n\t"
+    "dmtc1      $8, $f8                                   \n\t"
+    "dli        $8, 0x8                                   \n\t"
+    "dmtc1      $8, $f10                                  \n\t"
+    WELS_MAX_REG_MMI($f20, $f22, $f8, $f10)
+
+    "dmfc1      $8, $f20                                  \n\t"
+    "sb         $8, 0x0(%[p_mad8x8])                      \n\t"
+    "dmfc1      $8, $f22                                  \n\t"
+    "sb         $8, 0x1(%[p_mad8x8])                      \n\t"
+    PTR_ADDIU  "%[p_mad8x8], %[p_mad8x8], 0x2             \n\t"
+
+    "xor        $f20, $f20, $f20                          \n\t"
+    "xor        $f22, $f22, $f22                          \n\t"
+    "punpckhwd  $f28, $f20, $f28                          \n\t"
+    "xor        $f24, $f24, $f24                          \n\t"
+    "xor        $f26, $f26, $f26                          \n\t"
+    "punpckhwd  $f30, $f20, $f30                          \n\t"
+    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+                                 $f18, $15, %[ref_data], %[iPicStride])
+    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+                                 $f18, $15, %[ref_data], %[iPicStride])
+    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+                                 $f18, $15, %[ref_data], %[iPicStride])
+    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+                                 $f18, $15, %[ref_data], %[iPicStride])
+    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+                                 $f18, $15, %[ref_data], %[iPicStride])
+    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+                                 $f18, $15, %[ref_data], %[iPicStride])
+    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+                                 $f18, $15, %[ref_data], %[iPicStride])
+    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+                                 $f18, $15, %[ref_data], %[iPicStride])
+
+    "dmfc1      $8, $f28                                  \n\t"
+    "sw         $8, 0x0(%[psad8x8])                       \n\t"
+    "dmfc1      $8, $f30                                  \n\t"
+    "sw         $8, 0x4(%[psad8x8])                       \n\t"
+    PTR_ADDIU  "%[psad8x8], %[psad8x8], 0x8               \n\t"
+
+    "paddw      $f4, $f28, $f30                           \n\t"
+    "dmfc1      $12, $f4                                  \n\t"
+    PTR_ADDU   "$14, $14, $12                             \n\t"
+
+    "paddw      $f4, $f24, $f26                           \n\t"
+    "dmfc1      $8, $f4                                   \n\t"
+    "lw         $12, 0x0(%[psum16x16])                    \n\t"
+    PTR_ADDU   "$8, $8, $12                               \n\t"
+    "sw         $8, 0x0(%[psum16x16])                     \n\t"
+    "xor        $f8, $f8, $f8                             \n\t"
+    PTR_ADDIU  "%[psum16x16], %[psum16x16], 0x4           \n\t"
+
+    "punpckhwd  $f30, $f30, $f8                           \n\t"
+    "punpckhwd  $f28, $f28, $f8                           \n\t"
+    "paddw      $f8, $f28, $f30                           \n\t"
+    "dmfc1      $8, $f8                                   \n\t"
+    "sw         $8, 0x0(%[psqsum16x16])                   \n\t"
+    PTR_ADDIU  "%[psqsum16x16], %[psqsum16x16], 0x4       \n\t"
+
+    "punpckhwd  $f4, $f24, $f26                           \n\t"
+    "punpcklwd  $f6, $f24, $f26                           \n\t"
+    "psubw      $f6, $f6, $f4                             \n\t"
+    "dmfc1      $8, $f6                                   \n\t"
+    PTR_S      "$8, 0x0(%[p_sd8x8])                       \n\t"
+    PTR_ADDIU  "%[p_sd8x8], %[p_sd8x8], 0x8               \n\t"
+
+    "dli        $8, 0x1                                   \n\t"
+    "dmtc1      $8, $f8                                   \n\t"
+    "dli        $8, 0x8                                   \n\t"
+    "dmtc1      $8, $f10                                  \n\t"
+    WELS_MAX_REG_MMI($f20, $f22, $f8, $f10)
+
+    "dmfc1      $8, $f20                                  \n\t"
+    "sb         $8, 0x0(%[p_mad8x8])                      \n\t"
+    "dmfc1      $8, $f22                                  \n\t"
+    "sb         $8, 0x1(%[p_mad8x8])                      \n\t"
+    PTR_ADDIU  "%[p_mad8x8], %[p_mad8x8], 0x2             \n\t"
+
+    "paddw      $f20, $f16, $f18                          \n\t"
+    "punpcklwd  $f16, $f20, $f20                          \n\t"
+    "punpckhwd  $f18, $f20, $f20                          \n\t"
+    "paddw      $f16, $f16, $f18                          \n\t"
+    "dmfc1      $8, $f16                                  \n\t"
+    "sw         $8, 0x0(%[psqdiff16x16])                  \n\t"
+    PTR_ADDIU  "%[psqdiff16x16], %[psqdiff16x16], 0x4     \n\t"
+
+    PTR_SUBU   "$15, $15, $13                             \n\t"
+    PTR_SUBU   "%[ref_data], %[ref_data], $13             \n\t"
+    PTR_ADDIU  "$15, $15, 0x10                            \n\t"
+    PTR_ADDIU  "%[ref_data], %[ref_data], 0x10            \n\t"
+
+    PTR_ADDIU  "%[iPicWidth], %[iPicWidth], -0x1          \n\t"
+    "bnez       %[iPicWidth], 2b                          \n\t"
+    "nop                                                  \n\t"
+    "move       %[iPicWidth], $9                          \n\t"
+    "move       $15, $10                                  \n\t"
+    "move       %[ref_data], $11                          \n\t"
+    PTR_ADDU   "$15, $15, $13                             \n\t"
+    PTR_ADDU   "%[ref_data], %[ref_data], $13             \n\t"
+
+    PTR_ADDIU  "%[iPicHeight], %[iPicHeight], -0x1        \n\t"
+    "bnez       %[iPicHeight], 1b                         \n\t"
+    "nop                                                  \n\t"
+
+    "sw         $14, 0x0(%[psadframe])                    \n\t"
+    : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
+      [iPicHeight]"+&r"((int)iPicHeight), [psad8x8]"+&r"((int *)psad8x8),
+      [psum16x16]"+&r"((int *)psum16x16), [psqsum16x16]"+&r"((int *)psqsum16x16),
+      [psqdiff16x16]"+&r"((int *)psqdiff16x16), [p_sd8x8]"+&r"((int *)p_sd8x8),
+      [p_mad8x8]"+&r"((unsigned char *)p_mad8x8)
+    : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
+      [psadframe]"r"((int *)psadframe)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
+      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
+      "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+void VAACalcSadVar_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
+                       int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+                       int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
+                       int32_t *psqsum16x16) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "move       $15, %[cur_data]                          \n\t"
+    "dsrl       %[iPicWidth], %[iPicWidth], 0x4           \n\t"
+    "dsrl       %[iPicHeight], %[iPicHeight], 0x4         \n\t"
+    "dsll       $13, %[iPicStride], 0x4                   \n\t"
+    "xor        $f0, $f0, $f0                             \n\t"
+    "xor        $f2, $f2, $f2                             \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"
+    "xor        $f30, $f30, $f30                          \n\t"
+    "xor        $14, $14, $14                             \n\t"
+    "1:                                                   \n\t"
+    "move       $9, %[iPicWidth]                          \n\t"
+    "move       $10, $15                                  \n\t"
+    "move       $11, %[ref_data]                          \n\t"
+    "2:                                                   \n\t"
+    "xor        $f24, $f24, $f24                          \n\t"
+    "xor        $f26, $f26, $f26                          \n\t"
+    "xor        $f20, $f20, $f20                          \n\t"
+    "xor        $f22, $f22, $f22                          \n\t"
+    "xor        $f16, $f16, $f16                          \n\t"
+    "xor        $f18, $f18, $f18                          \n\t"
+    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+    "paddw      $f28, $f24, $f28                          \n\t"
+    "paddw      $f30, $f26, $f30                          \n\t"
+    "dmfc1      $8, $f24                                  \n\t"
+    "sw         $8, 0x0(%[psad8x8])                       \n\t"
+    "dmfc1      $8, $f26                                  \n\t"
+    "sw         $8, 0x4(%[psad8x8])                       \n\t"
+
+    "xor        $f24, $f24, $f24                          \n\t"
+    "xor        $f26, $f26, $f26                          \n\t"
+    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+    "paddw      $f28, $f24, $f28                          \n\t"
+    "paddw      $f30, $f26, $f30                          \n\t"
+    "dmfc1      $8, $f24                                  \n\t"
+    "sw         $8, 0x8(%[psad8x8])                       \n\t"
+    "dmfc1      $8, $f26                                  \n\t"
+    "sw         $8, 0xc(%[psad8x8])                       \n\t"
+    PTR_ADDIU  "%[psad8x8],   %[psad8x8],   0x10          \n\t"
+
+    "paddw      $f20, $f20, $f22                          \n\t"
+    "dmfc1      $8, $f20                                  \n\t"
+    "sw         $8, 0x0(%[psum16x16])                     \n\t"
+    PTR_ADDIU  "%[psum16x16], %[psum16x16], 0x4           \n\t"
+
+    "paddw      $f20, $f16, $f18                          \n\t"
+    "punpcklwd  $f16, $f20, $f20                          \n\t"
+    "punpckhwd  $f18, $f20, $f20                          \n\t"
+    "paddw      $f16, $f16, $f18                          \n\t"
+    "dmfc1      $8, $f16                                  \n\t"
+    "sw         $8, 0x0(%[psqsum16x16])                   \n\t"
+    PTR_ADDIU  "%[psqsum16x16], %[psqsum16x16], 0x4       \n\t"
+
+    PTR_SUBU   "$15, $15, $13                             \n\t"
+    PTR_SUBU   "%[ref_data], %[ref_data], $13             \n\t"
+    PTR_ADDIU  "$15, $15, 0x10                            \n\t"
+    PTR_ADDIU  "%[ref_data], %[ref_data], 0x10            \n\t"
+
+    PTR_ADDIU  "%[iPicWidth], %[iPicWidth], -0x1          \n\t"
+    "bnez       %[iPicWidth], 2b                          \n\t"
+    "nop                                                  \n\t"
+    "move       %[iPicWidth], $9                          \n\t"
+    "move       $15, $10                                  \n\t"
+    "move       %[ref_data], $11                          \n\t"
+    PTR_ADDU   "$15, $15, $13                             \n\t"
+    PTR_ADDU   "%[ref_data], %[ref_data], $13             \n\t"
+
+    PTR_ADDIU  "%[iPicHeight], %[iPicHeight], -0x1        \n\t"
+    "bnez       %[iPicHeight], 1b                         \n\t"
+    "nop                                                  \n\t"
+
+    "paddw      $f28, $f28, $f30                          \n\t"
+    "dmfc1      $8, $f28                                  \n\t"
+    "sw         $8, 0x0(%[psadframe])                     \n\t"
+    : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
+      [iPicHeight]"+&r"((int)iPicHeight), [psum16x16]"+&r"((int *)psum16x16),
+      [psqsum16x16]"+&r"((int *)psqsum16x16)
+    : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
+      [psadframe]"r"((int *)psadframe), [psad8x8]"r"((int *)psad8x8)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
+      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
+      "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
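
Note on the routine that ends here: judging by its outputs (a frame SAD, four 8x8 SADs per macroblock, and per-16x16 pixel sums and squared sums), this is the VAACalcSadVar-style path that vaacalculation.cpp wires up below. A minimal scalar model of the same accumulation, assuming iPicWidth/iPicHeight arrive in 16-pixel macroblock units (which is how the assembly decrements them, once per block) and that the sum/sqsum statistics are taken over the current frame's pixels:

#include <cstdint>
#include <cstdlib>

// Illustrative scalar model of the MMI loop above -- not the project's
// reference implementation. One iteration of the inner loop corresponds to
// one 16x16 macroblock; quadrants are visited top-left, top-right,
// bottom-left, bottom-right, matching the four sw's per block.
static void VaaCalcSadVarScalarSketch (const uint8_t* pCur, const uint8_t* pRef,
                                       int32_t iMbWidth, int32_t iMbHeight,
                                       int32_t iPicStride, int32_t* pFrameSad,
                                       int32_t* pSad8x8, int32_t* pSum16x16,
                                       int32_t* pSqsum16x16) {
  int32_t iFrameSad = 0;
  for (int32_t iMbY = 0; iMbY < iMbHeight; ++iMbY) {
    for (int32_t iMbX = 0; iMbX < iMbWidth; ++iMbX) {
      const uint8_t* pCurMb = pCur + (iMbY * iPicStride + iMbX) * 16;
      const uint8_t* pRefMb = pRef + (iMbY * iPicStride + iMbX) * 16;
      const int32_t iMbIdx = iMbY * iMbWidth + iMbX;
      int32_t iSum = 0, iSqsum = 0;
      for (int32_t iQuad = 0; iQuad < 4; ++iQuad) {
        const int32_t iOffX = (iQuad & 1) * 8;
        const int32_t iOffY = (iQuad >> 1) * 8;
        int32_t iSad = 0;
        for (int32_t y = 0; y < 8; ++y) {
          for (int32_t x = 0; x < 8; ++x) {
            const int32_t iIdx = (iOffY + y) * iPicStride + iOffX + x;
            iSad   += abs (pCurMb[iIdx] - pRefMb[iIdx]);
            iSum   += pCurMb[iIdx];
            iSqsum += pCurMb[iIdx] * pCurMb[iIdx];
          }
        }
        pSad8x8[iMbIdx * 4 + iQuad] = iSad;
        iFrameSad += iSad;
      }
      pSum16x16[iMbIdx]   = iSum;
      pSqsum16x16[iMbIdx] = iSqsum;
    }
  }
  *pFrameSad = iFrameSad;
}

The assembly keeps the running frame SAD packed in $f28/$f30 across both 8-row passes and only spills it to psadframe once, after the outer loop exits.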
--- a/codec/processing/src/vaacalc/vaacalculation.cpp
+++ b/codec/processing/src/vaacalc/vaacalculation.cpp
@@ -93,6 +93,16 @@
     sVaaFuncs.pfVAACalcSadVar    = VAACalcSadVar_AArch64_neon;
   }
 #endif//HAVE_NEON_AARCH64
+
+#ifdef HAVE_MMI
+  if ((iCpuFlag & WELS_CPU_MMI) == WELS_CPU_MMI) {
+    sVaaFuncs.pfVAACalcSad       = VAACalcSad_mmi;
+    sVaaFuncs.pfVAACalcSadBgd    = VAACalcSadBgd_mmi;
+    sVaaFuncs.pfVAACalcSadSsd    = VAACalcSadSsd_mmi;
+    sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_mmi;
+    sVaaFuncs.pfVAACalcSadVar    = VAACalcSadVar_mmi;
+  }
+#endif//HAVE_MMI
 }
 
 EResult CVAACalculation::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
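
The runtime gate here mirrors the NEON branches above it: the C implementations are installed first, then overridden when the CPU flag is present. A minimal sketch of the pattern (struct, constant, and function names below are illustrative stand-ins, not the project's definitions; only WELS_CPU_MMI and the pfVAACalc* slots come from the patch):

#include <cstdint>

// Hypothetical skeleton of flag-gated dispatch through a function table.
typedef void (VAACalcSadFuncSketch) (const uint8_t* pCur, const uint8_t* pRef,
                                     int32_t iWidth, int32_t iHeight,
                                     int32_t iStride, int32_t* pFrameSad,
                                     int32_t* pSad8x8);

struct SVaaFuncsSketch {
  VAACalcSadFuncSketch* pfVAACalcSad;
};

static void InitVaaFuncsSketch (SVaaFuncsSketch& sVaaFuncs, uint32_t iCpuFlag,
                                uint32_t kuiMmiBit,
                                VAACalcSadFuncSketch* pCRef,
                                VAACalcSadFuncSketch* pMmi) {
  sVaaFuncs.pfVAACalcSad = pCRef;           // portable C default first
  if ((iCpuFlag & kuiMmiBit) == kuiMmiBit)  // same test shape as the hunk above
    sVaaFuncs.pfVAACalcSad = pMmi;          // MMI override when available
}

Callers always go through the table, so the MMI kernels are picked up without touching any call sites.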
--- a/codec/processing/src/vaacalc/vaacalculation.h
+++ b/codec/processing/src/vaacalc/vaacalculation.h
@@ -132,6 +132,16 @@
 WELSVP_EXTERN_C_END
 #endif
 
+#ifdef HAVE_MMI
+WELSVP_EXTERN_C_BEGIN
+VAACalcSadBgdFunc       VAACalcSadBgd_mmi;
+VAACalcSadSsdBgdFunc    VAACalcSadSsdBgd_mmi;
+VAACalcSadFunc          VAACalcSad_mmi;
+VAACalcSadVarFunc       VAACalcSadVar_mmi;
+VAACalcSadSsdFunc       VAACalcSadSsd_mmi;
+WELSVP_EXTERN_C_END
+#endif
+
 class CVAACalculation : public IStrategy {
  public:
   CVAACalculation (int32_t iCpuFlag);
--- a/codec/processing/targets.mk
+++ b/codec/processing/targets.mk
@@ -55,10 +55,22 @@
 endif
 OBJS += $(PROCESSING_OBJSARM64)
 
+PROCESSING_ASM_MIPS_SRCS=\
+	$(PROCESSING_SRCDIR)/src/mips/vaa_mmi.c\
+
+PROCESSING_OBJSMIPS += $(PROCESSING_ASM_MIPS_SRCS:.c=.$(OBJ))
+ifeq ($(ASM_ARCH), mips)
+PROCESSING_OBJS += $(PROCESSING_OBJSMIPS)
+endif
+OBJS += $(PROCESSING_OBJSMIPS)
+
 OBJS += $(PROCESSING_OBJS)
 
 $(PROCESSING_SRCDIR)/%.$(OBJ): $(PROCESSING_SRCDIR)/%.cpp
 	$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c $(CXX_O) $<
+
+$(PROCESSING_SRCDIR)/%.$(OBJ): $(PROCESSING_SRCDIR)/%.c
+	$(QUIET_CC)$(CC) $(CFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c $(CXX_O) $<
 
 $(PROCESSING_SRCDIR)/%.$(OBJ): $(PROCESSING_SRCDIR)/%.asm
 	$(QUIET_ASM)$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $@ $<
--- a/test/decoder/DecUT_IdctResAddPred.cpp
+++ b/test/decoder/DecUT_IdctResAddPred.cpp
@@ -154,6 +154,10 @@
 GENERATE_IDCTRESADDPRED (IdctResAddPred_AArch64_neon, WELS_CPU_NEON)
 #endif
 
+#if defined(HAVE_MMI)
+GENERATE_IDCTRESADDPRED (IdctResAddPred_mmi, WELS_CPU_MMI)
+#endif
+
 #define GENERATE_SETNONZEROCOUNT(method, flag) \
 TEST(DecoderDecodeMbAux, method) \
 {\
--- a/test/decoder/DecUT_IntraPrediction.cpp
+++ b/test/decoder/DecUT_IntraPrediction.cpp
@@ -649,3 +649,16 @@
 GENERATE_8x8_UT (WelsDecoderIChromaPredPlane_AArch64_neon, WelsIChromaPredPlane_ref, 1, WELS_CPU_NEON)
 GENERATE_8x8_UT (WelsDecoderIChromaPredDcTop_AArch64_neon, WelsIChromaPredDcTop_ref, 1, WELS_CPU_NEON)
 #endif
+
+#if defined(HAVE_MMI)
+GENERATE_4x4_UT (WelsDecoderI4x4LumaPredH_mmi, LumaI4x4PredH, 1, WELS_CPU_MMI)
+GENERATE_8x8_UT (WelsDecoderIChromaPredDcTop_mmi, WelsIChromaPredDcTop_ref, 1, WELS_CPU_MMI)
+GENERATE_8x8_UT (WelsDecoderIChromaPredDc_mmi, WelsIChromaPredDc_ref, 1, WELS_CPU_MMI)
+GENERATE_8x8_UT (WelsDecoderIChromaPredPlane_mmi, WelsIChromaPredPlane_ref, 1, WELS_CPU_MMI)
+GENERATE_16x16_UT (WelsDecoderI16x16LumaPredPlane_mmi, WelsI16x16LumaPredPlane_ref, 1, WELS_CPU_MMI)
+GENERATE_16x16_UT (WelsDecoderI16x16LumaPredH_mmi, LumaI16x16PredH, 1, WELS_CPU_MMI)
+GENERATE_16x16_UT (WelsDecoderI16x16LumaPredV_mmi, LumaI16x16PredV, 1, WELS_CPU_MMI)
+GENERATE_16x16_UT (WelsDecoderI16x16LumaPredDc_mmi, LumaI16x16PredDC, 1, WELS_CPU_MMI)
+GENERATE_16x16_UT (WelsDecoderI16x16LumaPredDcTop_mmi, LumaI16x16PredDCTop, 1, WELS_CPU_MMI)
+GENERATE_16x16_UT (WelsDecoderI16x16LumaPredDcNA_mmi, LumaI16x16PredDCNone, 1, WELS_CPU_MMI)
+#endif
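
The GENERATE_*_UT macros used here are defined earlier in the test file; each expands to a gtest case that runs the routine under test and the C reference on identical randomized inputs and requires bit-exact output. A hand-written sketch of that shape (the helper name, buffer sizes, and the (pPred, stride) signature are simplifications, not the macros' literal expansion):

#include <gtest/gtest.h>
#include <cstdint>
#include <cstdlib>
#include <cstring>

// Illustrative shape of one generated prediction test: both routines write
// into identically seeded buffers, and the whole buffer is compared so that
// stray out-of-bounds writes are caught too.
typedef void (PredFuncSketch) (uint8_t* pPred, const int32_t kiStride);

static void RunPredPairSketch (PredFuncSketch* pTest, PredFuncSketch* pRef) {
  const int32_t kiStride = 32;
  uint8_t aBufTest[32 * 32], aBufRef[32 * 32];
  for (size_t i = 0; i < sizeof aBufTest; ++i)   // random neighboring pixels
    aBufTest[i] = aBufRef[i] = (uint8_t) (rand() & 0xff);
  pTest (aBufTest + kiStride + 1, kiStride);
  pRef  (aBufRef + kiStride + 1, kiStride);
  EXPECT_EQ (0, memcmp (aBufTest, aBufRef, sizeof aBufTest));
}

A generated case would then amount to RunPredPairSketch (WelsDecoderI16x16LumaPredH_mmi, LumaI16x16PredH) wrapped in a TEST body and gated on the WELS_CPU_MMI flag.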
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -74,6 +74,39 @@
   FREE_MEMORY (iDct);
 }
 #endif
+#ifdef HAVE_MMI
+TEST (EncodeMbAuxTest, WelsScan4x4Ac_mmi) {
+  CMemoryAlign cMemoryAlign (0);
+  ALLOC_MEMORY (int16_t, iLevelA, 16);
+  ALLOC_MEMORY (int16_t, iLevelB, 16);
+  ALLOC_MEMORY (int16_t, iDct, 16);
+  for (int i = 0; i < 16; i++) {
+    iDct[i] = rand() % 256 + 1;
+  }
+  WelsScan4x4Ac_c (iLevelA, iDct);
+  WelsScan4x4Ac_mmi (iLevelB, iDct);
+  for (int j = 0; j < 16; j++)
+    EXPECT_EQ (iLevelA[j], iLevelB[j]);
+  FREE_MEMORY (iLevelA);
+  FREE_MEMORY (iLevelB);
+  FREE_MEMORY (iDct);
+}
+TEST (EncodeMbAuxTest, WelsScan4x4DcAc_mmi) {
+  CMemoryAlign cMemoryAlign (0);
+  ALLOC_MEMORY (int16_t, iLevelA, 32);
+  ALLOC_MEMORY (int16_t, iLevelB, 32);
+  ALLOC_MEMORY (int16_t, iDct, 32);
+  for (int i = 0; i < 32; i++)
+    iDct[i] = (rand() & 32767) - 16384;
+  WelsScan4x4DcAc_mmi (iLevelA, iDct);
+  WelsScan4x4DcAc_c (iLevelB, iDct);
+  for (int i = 0; i < 16; i++)
+    EXPECT_EQ (iLevelA[i], iLevelB[i]);
+  FREE_MEMORY (iLevelA);
+  FREE_MEMORY (iLevelB);
+  FREE_MEMORY (iDct);
+}
+#endif
 TEST (EncodeMbAuxTest, TestScan_4x4_dcc) {
   CMemoryAlign cMemoryAlign (0);
   ALLOC_MEMORY (int16_t, iLevel, 16);
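
For context on the two scan tests added above: the Wels scan kernels reorder a 4x4 block of transform coefficients into the H.264 zig-zag sequence (the Ac variant additionally treats the DC position specially). The reorder itself, using the standard 4x4 zig-zag table, looks like this illustrative sketch, not the project's implementation:

#include <cstdint>

// Standard H.264 zig-zag order for a 4x4 block, expressed as raster indices.
static const int32_t kiZigzag4x4[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

static void Scan4x4Sketch (int16_t* pLevel, const int16_t* pDct) {
  for (int32_t i = 0; i < 16; ++i)
    pLevel[i] = pDct[kiZigzag4x4[i]];   // gather coefficients in scan order
}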
@@ -230,6 +263,29 @@
     iDctC[i] = iDctS[i] = (rand() & 65535) - 32768;
   WelsCalculateSingleCtr4x4_c (iDctC);
   WelsCalculateSingleCtr4x4_sse2 (iDctS);
+  for (int i = 0; i < 16; i++)
+    EXPECT_EQ (iDctC[i], iDctS[i]);
+  FREE_MEMORY (iDctC);
+  FREE_MEMORY (iDctS);
+}
+#endif
+#ifdef HAVE_MMI
+TEST (EncodeMbAuxTest, WelsDctT4_mmi) {
+  TestDctT4 (WelsDctT4_mmi);
+}
+
+TEST (EncodeMbAuxTest, WelsDctFourT4_mmi) {
+  TestDctFourT4 (WelsDctFourT4_mmi);
+}
+
+TEST (EncodeMbAuxTest, WelsCalculateSingleCtr4x4_mmi) {
+  CMemoryAlign cMemoryAlign (0);
+  ALLOC_MEMORY (int16_t, iDctC, 16);
+  ALLOC_MEMORY (int16_t, iDctS, 16);
+  for (int i = 0; i < 16; i++)
+    iDctC[i] = iDctS[i] = (rand() & 65535) - 32768;
+  WelsCalculateSingleCtr4x4_c (iDctC);
+  WelsCalculateSingleCtr4x4_mmi (iDctS);
   for (int i = 0; i < 16; i++)
     EXPECT_EQ (iDctC[i], iDctS[i]);
   FREE_MEMORY (iDctC);
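
TestDctT4 and TestDctFourT4 are helpers defined earlier in this file; the pattern matches the single-ctr test above them: run the C kernel and the optimized kernel on the same random inputs and demand identical coefficients. A condensed sketch, assuming the encoder's usual (pDct, pPixel1, iStride1, pPixel2, iStride2) DCT-of-difference shape (paraphrased, so treat the signature as an assumption):

#include <gtest/gtest.h>
#include <cstdint>
#include <cstdlib>

typedef void (DctFuncSketch) (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1,
                              uint8_t* pPixel2, int32_t iStride2);

// Feed identical random 4x4 pixel blocks to both kernels; the transform of
// the difference must match coefficient-for-coefficient.
static void TestDctPairSketch (DctFuncSketch* pTest, DctFuncSketch* pRef) {
  int16_t aDctTest[16], aDctRef[16];
  uint8_t aPix1[4 * 16], aPix2[4 * 16];          // 4 rows at stride 16
  for (int32_t i = 0; i < 4 * 16; ++i) {
    aPix1[i] = (uint8_t) (rand() & 0xff);
    aPix2[i] = (uint8_t) (rand() & 0xff);
  }
  pTest (aDctTest, aPix1, 16, aPix2, 16);
  pRef  (aDctRef,  aPix1, 16, aPix2, 16);
  for (int32_t i = 0; i < 16; ++i)
    EXPECT_EQ (aDctRef[i], aDctTest[i]);
}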
--- a/test/processing/ProcessUT_VaaCalc.cpp
+++ b/test/processing/ProcessUT_VaaCalc.cpp
@@ -863,3 +863,11 @@
 GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_AArch64_neon, 1, WELS_CPU_NEON)
 GENERATE_VAACalcSadVar_UT (VAACalcSadVar_AArch64_neon, 1, WELS_CPU_NEON)
 #endif
+
+#if defined(HAVE_MMI)
+GENERATE_VAACalcSad_UT (VAACalcSad_mmi, 1, WELS_CPU_MMI)
+GENERATE_VAACalcSadBgd_UT (VAACalcSadBgd_mmi, 1, WELS_CPU_MMI)
+GENERATE_VAACalcSadSsdBgd_UT (VAACalcSadSsdBgd_mmi, 1, WELS_CPU_MMI)
+GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_mmi, 1, WELS_CPU_MMI)
+GENERATE_VAACalcSadVar_UT (VAACalcSadVar_mmi, 1, WELS_CPU_MMI)
+#endif