shithub: openh264

--- a/codec/common/targets.mk

+++ b/codec/common/targets.mk

@@ -64,7 +64,11 @@

 OBJS += $(COMMON_OBJSARM64)

 COMMON_ASM_MIPS_SRCS=\

+	$(COMMON_SRCDIR)/mips/copy_mb_mmi.c\

 	$(COMMON_SRCDIR)/mips/deblock_mmi.c\

+	$(COMMON_SRCDIR)/mips/expand_picture_mmi.c\

+	$(COMMON_SRCDIR)/mips/intra_pred_com_mmi.c\

+	$(COMMON_SRCDIR)/mips/satd_sad_mmi.c\

 COMMON_OBJSMIPS += $(COMMON_ASM_MIPS_SRCS:.c=.$(OBJ))

 ifeq ($(ASM_ARCH), mips)

--- a/codec/encoder/core/inc/decode_mb_aux.h

+++ b/codec/encoder/core/inc/decode_mb_aux.h

@@ -95,6 +95,11 @@

                                  int16_t* pDctDc);

 #endif

+#if defined(HAVE_MMI) /* Loongson MMI reconstruction routines; the definitions are in codec/encoder/core/mips/dct_mmi.c */

+void WelsIDctT4Rec_mmi (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct); /* reads 16 coefficients from pDct, writes 4 rows to pRec */

+void WelsIDctFourT4Rec_mmi (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct); /* reads 64 coefficients from pDct, writes 8 rows of 8 bytes to pRec */

+void WelsIDctRecI16x16Dc_mmi (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDctDc); /* reads 16 values from pDctDc, writes 16 rows of 16 bytes to pRec */

+#endif//HAVE_MMI

 #if defined(__cplusplus)

 #endif//__cplusplus

--- a/codec/encoder/core/inc/encode_mb_aux.h

+++ b/codec/encoder/core/inc/encode_mb_aux.h

@@ -147,6 +147,33 @@

 void WelsQuantFour4x4_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);

 void WelsQuantFour4x4Max_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);

 #endif

+#ifdef HAVE_MMI /* Loongson MMI variants of the scan, DCT, Hadamard and quant helpers declared in this header */

+int32_t WelsGetNoneZeroCount_mmi (int16_t* pLevel);

+/****************************************************************************

+ * Scan and Score functions

+ ****************************************************************************/

+void WelsScan4x4Ac_mmi (int16_t* zig_value, int16_t* pDct);

+void WelsScan4x4DcAc_mmi (int16_t* pLevel, int16_t* pDct);

+int32_t WelsCalculateSingleCtr4x4_mmi (int16_t* pDct);

+/****************************************************************************

+ * DCT functions (defined in codec/encoder/core/mips/dct_mmi.c)

+ ****************************************************************************/

+void WelsDctT4_mmi (int16_t* pDct,  uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2); /* writes 16 coefficients to pDct from 4 rows of pPixel1 - pPixel2 */

+void WelsDctFourT4_mmi (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2); /* writes 64 coefficients to pDct from 8 rows of pPixel1 - pPixel2 */

+/****************************************************************************

+ * Hadamard (HDM) and Quant functions

+ ****************************************************************************/

+void WelsHadamardT4Dc_mmi (int16_t* pLumaDc, int16_t* pDct); /* defined in dct_mmi.c: reads one halfword every 0x20 bytes of pDct, writes 16 halfwords to pLumaDc */

+void WelsQuant4x4_mmi (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);

+void WelsQuant4x4Dc_mmi (int16_t* pDct, int16_t iFF, int16_t iMF);

+void WelsQuantFour4x4_mmi (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);

+void WelsQuantFour4x4Max_mmi (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);

+#endif//HAVE_MMI

 #if defined(__cplusplus)

 #endif//__cplusplus

--- /dev/null

+++ b/codec/encoder/core/mips/dct_mmi.c

@@ -1,0 +1,529 @@

+/*!

+ * \copy

+ *     Copyright (c)  2009-2018, Cisco Systems

+ *     All rights reserved.

+ *

+ *     Redistribution and use in source and binary forms, with or without

+ *     modification, are permitted provided that the following conditions

+ *     are met:

+ *

+ *        * Redistributions of source code must retain the above copyright

+ *          notice, this list of conditions and the following disclaimer.

+ *

+ *        * Redistributions in binary form must reproduce the above copyright

+ *          notice, this list of conditions and the following disclaimer in

+ *          the documentation and/or other materials provided with the

+ *          distribution.

+ *

+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS

+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER

+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN

+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

+ *     POSSIBILITY OF SUCH DAMAGE.

+ *

+ *

+ * \file    dct_mmi.c

+ *

+ * \brief   Loongson MMI implementations of the encoder's 4x4 DCT, IDCT reconstruction and luma DC Hadamard routines

+ *

+ * \date    20/07/2018 Created

+ *

+ *************************************************************************************

+ */

+#include <stdint.h>

+#include "asmdefs_mmi.h"

+#define MMI_Load4x8p(r0, f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) /* Load 0x40 bytes at r0 as four quadwords, then regroup them with MMI_XSawp_DQ. */ \

+  "gslqc1     "#f2", "#f0", 0x0("#r0")        \n\t" /* each gslqc1 fills one register pair: (f0,f2), (f8,f10), (f16,f18), (f4,f6) */ \

+  "gslqc1     "#f10", "#f8", 0x10("#r0")      \n\t" \

+  "gslqc1     "#f18", "#f16", 0x20("#r0")     \n\t" \

+  "gslqc1     "#f6", "#f4", 0x30("#r0")       \n\t" \

+  MMI_XSawp_DQ(f8, f10, f4, f6, f12, f14)  /* MMI_XSawp_DQ is defined outside this file (presumably asmdefs_mmi.h) */ \

+  MMI_XSawp_DQ(f0, f2, f16, f18, f4, f6)

+#define MMI_SumSubDiv2(f0, f2, f4, f6, f8, f10, f12, f14, f16) /* Per halfword, with a=(f0,f2), b=(f4,f6), s=f16: (f0,f2) = a + (b >> s) and (f12,f14) = (a >> s) - b. */ \

+  "mov.d      "#f8", "#f4"                    \n\t" /* keep the unshifted b in (f8,f10) */ \

+  "mov.d      "#f10", "#f6"                   \n\t" \

+  "psrah      "#f4", "#f4", "#f16"            \n\t" /* (f4,f6) = b >> s, arithmetic shift */ \

+  "psrah      "#f6", "#f6", "#f16"            \n\t" \

+  "psrah      "#f12", "#f0", "#f16"           \n\t" /* (f12,f14) = a >> s */ \

+  "psrah      "#f14", "#f2", "#f16"           \n\t" \

+  "paddh      "#f0", "#f0", "#f4"             \n\t" \

+  "paddh      "#f2", "#f2", "#f6"             \n\t" \

+  "psubh      "#f12", "#f12", "#f8"           \n\t" \

+  "psubh      "#f14", "#f14", "#f10"          \n\t"

+#define MMI_IDCT(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22, f24, f26, f28) /* Three MMI_SumSub steps (macro defined outside this file) around one MMI_SumSubDiv2; f28 is the shift count. */ \

+  MMI_SumSub(f24, f26, f4, f6, f20, f22)                        \

+  MMI_SumSubDiv2(f0, f2, f8, f10, f16, f18, f12, f14, f28)  /* (f0,f2) += (f8,f10) >> f28; (f12,f14) = ((f0,f2) >> f28) - (f8,f10) */ \

+  MMI_SumSub(f4, f6, f0, f2, f16, f18)                          \

+  MMI_SumSub(f24, f26, f12, f14, f16, f18)

+#define MMI_StoreDiff8p_6(f0, f2, f4, f6, f8, f12, r0, r1, f14) /* Rebuild 8 bytes: ((f0,f2) + f8) >> f14 per halfword, plus the 8 bytes at r1, saturated to unsigned bytes and stored at r0. f4/f6 are scratch. */ \

+  "paddh      "#f0", "#f0", "#f8"             \n\t" /* add f8 (the caller passes 0x0020 in every halfword) */ \

+  "paddh      "#f2", "#f2", "#f8"             \n\t" \

+  "psrah      "#f0", "#f0", "#f14"            \n\t" /* arithmetic shift by f14 (the caller passes 6) */ \

+  "psrah      "#f2", "#f2", "#f14"            \n\t" \

+  "gsldlc1    "#f4", 0x7("#r1")               \n\t" /* gsldlc1 + gsldrc1 together: unaligned 8-byte load from r1 */ \

+  "gsldrc1    "#f4", 0x0("#r1")               \n\t" \

+  "punpckhbh  "#f6", "#f4", "#f12"            \n\t" /* widen bytes to halfwords by interleaving with f12 (the caller passes zero) */ \

+  "punpcklbh  "#f4", "#f4", "#f12"            \n\t" \

+  "paddsh     "#f4", "#f4", "#f0"             \n\t" \

+  "paddsh     "#f6", "#f6", "#f2"             \n\t" \

+  "packushb   "#f4", "#f4", "#f6"             \n\t" /* pack back to 8 bytes with unsigned saturation */ \

+  "gssdlc1    "#f4", 0x7("#r0")               \n\t" /* gssdlc1 + gssdrc1 together: unaligned 8-byte store to r0 */ \

+  "gssdrc1    "#f4", 0x0("#r0")               \n\t"

+#define MMI_StoreDiff8p_5(f0, f2, f4, f6, f8, r0, r1, offset) /* Add the halfwords in (f0,f2) to the 8 bytes at offset(r1), saturate to unsigned bytes and store at offset(r0). There is no add/shift step here. f4/f6 are scratch. */ \

+  "gsldlc1    "#f4", "#offset"+0x7("#r1")     \n\t" /* unaligned 8-byte load */ \

+  "gsldrc1    "#f4", "#offset"+0x0("#r1")     \n\t" \

+  "punpckhbh  "#f6", "#f4", "#f8"             \n\t" /* widen bytes to halfwords by interleaving with f8 (the caller passes zero) */ \

+  "punpcklbh  "#f4", "#f4", "#f8"             \n\t" \

+  "paddsh     "#f4", "#f4", "#f0"             \n\t" \

+  "paddsh     "#f6", "#f6", "#f2"             \n\t" \

+  "packushb   "#f4", "#f4", "#f6"             \n\t" \

+  "gssdlc1    "#f4", "#offset"+0x7("#r0")     \n\t" /* unaligned 8-byte store */ \

+  "gssdrc1    "#f4", "#offset"+0x0("#r0")     \n\t"

+#define MMI_Load8DC(f0, f2, f4, f6, f8, f10, f12, f14, f16, r0, offset, f20) /* Load 8 halfwords v0..v7 from offset(r0), replace each with (v + f16) >> f20, then broadcast: f0,f2,f4,f6,f8,f10,f12,f14 hold v0..v7 in that order, each copied into all four halfwords. */ \

+  "gslqc1     "#f2", "#f0", "#offset"+0x0("#r0") \n\t" /* assumes gslqc1 puts the low 8 bytes in its second operand (f0) - TODO confirm */ \

+  "paddh      "#f0", "#f0", "#f16"               \n\t" \

+  "paddh      "#f2", "#f2", "#f16"               \n\t" \

+  "psrah      "#f0", "#f0", "#f20"               \n\t" \

+  "psrah      "#f2", "#f2", "#f20"               \n\t" \

+  "punpckhhw  "#f4", "#f0", "#f0"                \n\t" /* f4 = {v2,v2,v3,v3}, split into f4 and f6 below */ \

+  "punpckhwd  "#f6", "#f4", "#f4"                \n\t" \

+  "punpcklwd  "#f4", "#f4", "#f4"                \n\t" \

+  "punpcklhw  "#f8", "#f2", "#f2"                \n\t" /* f8 = {v4,v4,v5,v5}, split into f8 and f10 below */ \

+  "punpckhwd  "#f10", "#f8", "#f8"               \n\t" \

+  "punpcklwd  "#f8", "#f8", "#f8"                \n\t" \

+  "punpckhhw  "#f12", "#f2", "#f2"               \n\t" /* f12 = {v6,v6,v7,v7}, split into f12 and f14 below */ \

+  "punpckhwd  "#f14", "#f12", "#f12"             \n\t" \

+  "punpcklwd  "#f12", "#f12", "#f12"             \n\t" \

+  "punpcklhw  "#f0", "#f0", "#f0"                \n\t" /* done last because f0/f2 are still the inputs above */ \

+  "punpckhwd  "#f2", "#f0", "#f0"                \n\t" \

+  "punpcklwd  "#f0", "#f0", "#f0"                \n\t"

+#define MMI_StoreDiff4x8p(f0, f2, f4, f6, f8, f10, f12, r0, r1, r2, r3) /* Two rows of 16 bytes: (f0,f2) is applied to bytes 0-7 and (f4,f6) to bytes 8-15 of each row. f8/f10 are scratch, f12 is the zero register. */ \

+  MMI_StoreDiff8p_5(f0, f2, f8, f10, f12, r0, r1, 0x0)         \

+  MMI_StoreDiff8p_5(f4, f6, f8, f10, f12, r0, r1, 0x8)         \

+  PTR_ADDU   ""#r0", "#r0", "#r2"                        \n\t" /* r0 += r2 and r1 += r3 once only, so both end on the second row */ \

+  PTR_ADDU   ""#r1", "#r1", "#r3"                        \n\t" \

+  MMI_StoreDiff8p_5(f0, f2, f8, f10, f12, r0, r1, 0x0)         \

+  MMI_StoreDiff8p_5(f4, f6, f8, f10, f12, r0, r1, 0x8)

+#define MMI_Load4Col(f0, f2, f4, f6, f8, r0, offset) /* Gather four halfwords at byte offsets offset+{0x0,0x20,0x80,0xa0} from r0 as 32-bit words: f0 = {1st,2nd}, f2 = {3rd,4th}. Uses GPR $8; f4/f6/f8 are scratch. */ \

+  "lh         $8, "#offset"("#r0")        \n\t" /* lh sign-extends the halfword */ \

+  "dmtc1      $8, "#f0"                   \n\t" \

+  "lh         $8, "#offset"+0x20("#r0")   \n\t" \

+  "dmtc1      $8, "#f4"                   \n\t" \

+  "punpcklwd  "#f0", "#f0", "#f4"         \n\t" /* combine the low words of f0 and f4 */ \

+  "lh         $8, "#offset"+0x80("#r0")   \n\t" \

+  "dmtc1      $8, "#f6"                   \n\t" \

+  "lh         $8, "#offset"+0xa0("#r0")   \n\t" \

+  "dmtc1      $8, "#f8"                   \n\t" \

+  "punpcklwd  "#f2", "#f6", "#f8"         \n\t"

+#define MMI_SumSubD(f0, f2, f4, f6, f8, f10) /* 32-bit butterfly with a=(f0,f2), b=(f4,f6): (f4,f6) = a + b and (f0,f2) = a - b. (f8,f10) is scratch. */ \

+  "mov.d      "#f8", "#f4"                \n\t" /* save b before it is overwritten by the sum */ \

+  "mov.d      "#f10", "#f6"               \n\t" \

+  "paddw      "#f4", "#f4", "#f0"         \n\t" \

+  "paddw      "#f6", "#f6", "#f2"         \n\t" \

+  "psubw      "#f0", "#f0", "#f8"         \n\t" \

+  "psubw      "#f2", "#f2", "#f10"        \n\t"

+#define WELS_DD1(f0, f2, f_val_31) /* Fill f0 and f2 with all-ones, then shift each 32-bit word right (logical) by f_val_31; with a count of 31 every word becomes 1. */ \

+  "pcmpeqh    "#f0", "#f0", "#f0"         \n\t" /* comparing a register with itself sets every bit */ \

+  "pcmpeqh    "#f2", "#f2", "#f2"         \n\t" \

+  "psrlw      "#f0", "#f0", "#f_val_31"   \n\t" \

+  "psrlw      "#f2", "#f2", "#f_val_31"   \n\t"

+#define MMI_SumSubDiv2D(f0, f2, f4, f6, f8, f10, f12, f14, f_val_1) /* Per 32-bit word: (f0,f2) = ((f0,f2) + (f4,f6) + (f8,f10)) >> f_val_1, then (f12,f14) = that result - (f4,f6). */ \

+  "paddw      "#f0", "#f0", "#f4"         \n\t" \

+  "paddw      "#f2", "#f2", "#f6"         \n\t" \

+  "paddw      "#f0", "#f0", "#f8"         \n\t" /* (f8,f10) is the extra addend; the caller passes 1 in every word */ \

+  "paddw      "#f2", "#f2", "#f10"        \n\t" \

+  "psraw      "#f0", "#f0", "#f_val_1"    \n\t" /* arithmetic shift */ \

+  "psraw      "#f2", "#f2", "#f_val_1"    \n\t" \

+  "mov.d      "#f12", "#f0"               \n\t" \

+  "mov.d      "#f14", "#f2"               \n\t" \

+  "psubw      "#f12", "#f12", "#f4"       \n\t" \

+  "psubw      "#f14", "#f14", "#f6"       \n\t"

+#define MMI_Trans4x4W(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) /* Built from MMI_XSawp_WD and MMI_XSawp_DQ (both defined outside this file); presumably a 4x4 transpose of 32-bit words - confirm against those macros. (f16,f18) is an extra working pair. */ \

+  MMI_XSawp_WD(f0, f2, f4, f6, f16, f18)  \

+  MMI_XSawp_WD(f8, f10, f12, f14, f4, f6) \

+  MMI_XSawp_DQ(f0, f2, f8, f10, f12, f14) \

+  MMI_XSawp_DQ(f16, f18, f4, f6, f8, f10)

+#define MMI_SumSubMul2(f0, f2, f4, f6, f8, f10) /* Per halfword with a=(f0,f2), b=(f4,f6): (f0,f2) = 2a + b and (f8,f10) = a - 2b. */ \

+  "mov.d      "#f8", "#f0"                    \n\t" /* keep a copy of a for the difference */ \

+  "mov.d      "#f10", "#f2"                   \n\t" \

+  "paddh      "#f0", "#f0", "#f0"             \n\t" /* a + a */ \

+  "paddh      "#f2", "#f2", "#f2"             \n\t" \

+  "paddh      "#f0", "#f0", "#f4"             \n\t" \

+  "paddh      "#f2", "#f2", "#f6"             \n\t" \

+  "psubh      "#f8", "#f8", "#f4"             \n\t" /* b is subtracted twice to form a - 2b */ \

+  "psubh      "#f10", "#f10", "#f6"           \n\t" \

+  "psubh      "#f8", "#f8", "#f4"             \n\t" \

+  "psubh      "#f10", "#f10", "#f6"           \n\t"

+#define MMI_DCT(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22) /* Three MMI_SumSub steps (macro defined outside this file) followed by MMI_SumSubMul2; (f16,f18) is passed as the last argument pair of every MMI_SumSub. */ \

+  MMI_SumSub(f20, f22, f8, f10, f16, f18)   \

+  MMI_SumSub(f0, f2, f4, f6, f16, f18)      \

+  MMI_SumSub(f8, f10, f4, f6, f16, f18)     \

+  MMI_SumSubMul2(f20, f22, f0, f2, f12, f14)  /* (f20,f22) = 2*(f20,f22) + (f0,f2); (f12,f14) = (f20,f22) - 2*(f0,f2) */

+#define MMI_Store4x8p(r0, f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) /* Regroup with two MMI_XSawp_DQ steps (macro defined outside this file), then store 0x40 bytes at r0 as four quadwords. */ \

+  MMI_XSawp_DQ(f0, f2, f4, f6, f16, f18)            \

+  MMI_XSawp_DQ(f8, f10, f12, f14, f4, f6)           \

+  "gssqc1     "#f2", "#f0", 0x0("#r0")        \n\t" /* pairs stored in order: (f0,f2), (f8,f10), (f16,f18), (f4,f6) */ \

+  "gssqc1     "#f10", "#f8", 0x10("#r0")      \n\t" \

+  "gssqc1     "#f18", "#f16", 0x20("#r0")     \n\t" \

+  "gssqc1     "#f6", "#f4", 0x30("#r0")       \n\t"

+#define MMI_LoadDiff4P_SINGLE(f0, f2, r0, r1, f4) /* f0 = (low 4 bytes at r0) - (low 4 bytes at r1), as four halfwords; bytes are widened by interleaving with f4 (the caller passes zero). f2 is scratch. */ \

+  "gsldlc1    "#f0", 0x7("#r0")               \n\t" /* NOTE(review): 8 bytes are read from each pointer, only the low 4 are used */ \

+  "gsldlc1    "#f2", 0x7("#r1")               \n\t" \

+  "gsldrc1    "#f0", 0x0("#r0")               \n\t" \

+  "gsldrc1    "#f2", 0x0("#r1")               \n\t" \

+  "punpcklbh  "#f0", "#f0", "#f4"             \n\t" \

+  "punpcklbh  "#f2", "#f2", "#f4"             \n\t" \

+  "psubh      "#f0", "#f0", "#f2"             \n\t"

+#define MMI_LoadDiff4x4P_SINGLE(f0, f2, f4, f6, r0, r1, r2, r3, f8, f10) /* Four rows of MMI_LoadDiff4P_SINGLE into f0, f2, f4, f6. f8 is scratch, f10 is the zero register. */ \

+  MMI_LoadDiff4P_SINGLE(f0, f8, r0, r2, f10)        \

+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" /* r0 += r1 and r2 += r3 between rows (three times in total) */ \

+  PTR_ADDU   ""#r2", "#r2", "#r3"             \n\t" \

+  MMI_LoadDiff4P_SINGLE(f2, f8, r0, r2, f10)        \

+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \

+  PTR_ADDU   ""#r2", "#r2", "#r3"             \n\t" \

+  MMI_LoadDiff4P_SINGLE(f4, f8, r0, r2, f10)        \

+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \

+  PTR_ADDU   ""#r2", "#r2", "#r3"             \n\t" \

+  MMI_LoadDiff4P_SINGLE(f6, f8, r0, r2, f10)

+#define MMI_DCT_SINGLE(f0, f2, f4, f6, f8, f10, f12) /* Single-register form with the same step order as MMI_DCT: three MMI_SumSub_SINGLE then MMI_SumSubMul2_SINGLE (both defined outside this file). */ \

+  MMI_SumSub_SINGLE(f6, f0, f10)     \

+  MMI_SumSub_SINGLE(f4, f2, f10)     \

+  MMI_SumSub_SINGLE(f4, f6, f10)     \

+  MMI_SumSubMul2_SINGLE(f0, f2, f8, f12)

+void WelsIDctT4Rec_mmi(uint8_t* pRec, int32_t iStride, uint8_t* pPred,

+                       int32_t iPredStride, int16_t* pDct) { /* Runs the 16 coefficients at pDct through two transpose + MMI_IDCT_SINGLE passes, then combines each of the 4 result rows with a row of pPred and stores it to pRec. */

+  __asm__ volatile ( /* MMI_Trans4x4H_SINGLE, MMI_IDCT_SINGLE and MMI_StoreDiff4P_SINGLE are defined outside this file */

+    ".set       arch=loongson3a                    \n\t"

+    "gsldlc1    $f0, 0x7(%[pDct])                  \n\t" /* $f0,$f2,$f4,$f6 = the four 8-byte coefficient rows (unaligned loads) */

+    "gsldrc1    $f0, 0x0(%[pDct])                  \n\t"

+    "gsldlc1    $f2, 0xF(%[pDct])                  \n\t"

+    "gsldrc1    $f2, 0x8(%[pDct])                  \n\t"

+    "gsldlc1    $f4, 0x17(%[pDct])                 \n\t"

+    "gsldrc1    $f4, 0x10(%[pDct])                 \n\t"

+    "gsldlc1    $f6, 0x1F(%[pDct])                 \n\t"

+    "gsldrc1    $f6, 0x18(%[pDct])                 \n\t"

+    "dli        $8, 0x1                            \n\t" /* $f16 = 1, passed to MMI_IDCT_SINGLE */

+    "dmtc1      $8, $f16                           \n\t"

+    "dli        $8, 0x6                            \n\t" /* $f18 = 6, passed to MMI_StoreDiff4P_SINGLE */

+    "dmtc1      $8, $f18                           \n\t"

+    MMI_Trans4x4H_SINGLE($f0, $f2, $f4, $f6, $f8)

+    MMI_IDCT_SINGLE($f2, $f4, $f6, $f8, $f0, $f12, $f16)

+    MMI_Trans4x4H_SINGLE($f2, $f6, $f0, $f8, $f4)

+    MMI_IDCT_SINGLE($f6, $f0, $f8, $f4, $f2, $f12, $f16)

+    "xor        $f14, $f14, $f14                   \n\t" /* $f14 = 0 */

+    "dli        $8, 0x0020                         \n\t" /* $f12 = 0x0020 replicated into all four halfwords by the two unpacks below */

+    "dmtc1      $8, $f12                           \n\t"

+    "punpcklhw  $f12, $f12, $f12                   \n\t"

+    "punpcklwd  $f12, $f12, $f12                   \n\t"

+    MMI_StoreDiff4P_SINGLE($f6, $f0, $f12, $f14, %[pRec], %[pPred], $f18) /* first row; result rows are in $f6, $f8, $f2, $f4 */

+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t" /* step both pointers to the next row */

+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"

+    MMI_StoreDiff4P_SINGLE($f8, $f0, $f12, $f14, %[pRec], %[pPred], $f18)

+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"

+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"

+    MMI_StoreDiff4P_SINGLE($f2, $f0, $f12, $f14, %[pRec], %[pPred], $f18)

+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"

+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"

+    MMI_StoreDiff4P_SINGLE($f4, $f0, $f12, $f14, %[pRec], %[pPred], $f18)

+    : [pRec]"+&r"((uint8_t *)pRec), [pPred]"+&r"((uint8_t *)pPred) /* read-write: both pointers are advanced inside the block */

+    : [iStride]"r"((int)iStride), [iPredStride]"r"((int)iPredStride),

+      [pDct]"r"((short *)pDct)

+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",

+      "$f14", "$f16", "$f18"

+  );

+}

+void WelsIDctFourT4Rec_mmi(uint8_t* pRec, int32_t iStride, uint8_t* pPred,

+                           int32_t iPredStride, int16_t* pDct) { /* Processes 64 coefficients from pDct in two groups of 0x40 bytes; each group yields 4 rows of 8 bytes (residual + pPred, saturated) written to pRec. */

+  BACKUP_REG; /* BACKUP_REG / RECOVER_REG are defined outside this file (presumably asmdefs_mmi.h) */

+  __asm__ volatile ( /* MMI_TransTwo4x4H and MMI_SumSub are also defined outside this file */

+    ".set       arch=loongson3a                    \n\t"

+    MMI_Load4x8p(%[pDct], $f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f20, $f22)

+    MMI_TransTwo4x4H($f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f12, $f14)

+    "dli        $8, 0x1                            \n\t" /* $f30 = 1: shift count for MMI_IDCT, kept for both groups */

+    "dmtc1      $8, $f30                           \n\t"

+    MMI_IDCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26,

+             $f0, $f2, $f30)

+    MMI_TransTwo4x4H($f4, $f6, $f16, $f18, $f0, $f2, $f8, $f10, $f12, $f14)

+    MMI_IDCT($f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22, $f24, $f26,

+             $f4, $f6, $f30)

+    "xor        $f28, $f28, $f28                   \n\t" /* $f28 = 0, used to widen bytes in MMI_StoreDiff8p_6 */

+    "dli        $8, 0x6                            \n\t" /* $f26 = 6: final shift count */

+    "dmtc1      $8, $f26                           \n\t"

+    "dli        $8, 0x0020                         \n\t" /* $f24 = 0x0020 in every halfword, added before the shift */

+    "dmtc1      $8, $f24                           \n\t"

+    "punpcklhw  $f24, $f24, $f24                   \n\t"

+    "punpcklwd  $f24, $f24, $f24                   \n\t"

+    MMI_StoreDiff8p_6($f16, $f18, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26) /* first row; the rows are in ($f16,$f18), ($f0,$f2), ($f4,$f6), ($f8,$f10) */

+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"

+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"

+    MMI_StoreDiff8p_6($f0, $f2, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)

+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"

+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"

+    MMI_StoreDiff8p_6($f4, $f6, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)

+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"

+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"

+    MMI_StoreDiff8p_6($f8, $f10, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)

+    PTR_ADDIU  "%[pDct], %[pDct], 0x40             \n\t" /* move on to the second 0x40-byte group of coefficients */

+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"

+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"

+    MMI_Load4x8p(%[pDct], $f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f20, $f22)

+    MMI_TransTwo4x4H($f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f12, $f14)

+    MMI_IDCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26,

+             $f0, $f2, $f30)

+    MMI_TransTwo4x4H($f4, $f6, $f16, $f18, $f0, $f2, $f8, $f10, $f12, $f14)

+    MMI_IDCT($f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22, $f24, $f26,

+             $f4, $f6, $f30)

+    "dli        $8, 0x6                            \n\t" /* reload $f26 and $f24: both were handed to MMI_IDCT as working registers above */

+    "dmtc1      $8, $f26                           \n\t"

+    "dli        $8, 0x0020                         \n\t"

+    "dmtc1      $8, $f24                           \n\t"

+    "punpcklhw  $f24, $f24, $f24                   \n\t"

+    "punpcklwd  $f24, $f24, $f24                   \n\t"

+    MMI_StoreDiff8p_6($f16, $f18, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)

+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"

+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"

+    MMI_StoreDiff8p_6($f0, $f2, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)

+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"

+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"

+    MMI_StoreDiff8p_6($f4, $f6, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)

+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"

+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"

+    MMI_StoreDiff8p_6($f8, $f10, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)

+    : [pRec]"+&r"((uint8_t *)pRec), [pPred]"+&r"((uint8_t *)pPred),

+      [pDct]"+&r"((short *)pDct) /* read-write: pDct is advanced by 0x40 inside the block */

+    : [iStride]"r"((int)iStride), [iPredStride]"r"((int)iPredStride)

+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",

+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"

+  );

+  RECOVER_REG;

+}

+void WelsIDctRecI16x16Dc_mmi(uint8_t* pRec, int32_t iStride, uint8_t* pPred,

+                             int32_t iPredStride, int16_t* pDct) { /* For each of the 16 values v at pDct, adds (v + 32) >> 6 to a 4x4 area of pPred and stores the saturated bytes to pRec; covers 16 rows of 16 bytes. */

+  BACKUP_REG; /* BACKUP_REG / RECOVER_REG are defined outside this file (presumably asmdefs_mmi.h) */

+  __asm__ volatile (

+    ".set       arch=loongson3a                    \n\t"

+    "xor        $f28, $f28, $f28                   \n\t" /* $f28 = 0, used to widen bytes */

+    "dli        $8, 0x0020                         \n\t" /* $f24 = 0x0020 in every halfword */

+    "dmtc1      $8, $f24                           \n\t"

+    "punpcklhw  $f24, $f24, $f24                   \n\t"

+    "punpcklwd  $f24, $f24, $f24                   \n\t"

+    "dli        $8, 0x6                            \n\t" /* $f30 = 6: shift count for MMI_Load8DC */

+    "dmtc1      $8, $f30                           \n\t"

+    MMI_Load8DC($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f24,

+                %[pDct], 0x0, $f30) /* values 0..7, broadcast into $f0..$f14 */

+    MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],

+                      %[pPred], %[iStride], %[iPredStride]) /* rows 0-1, values 0..3 */

+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t" /* the macro stops on its second row, so step once more to the next pair */

+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"

+    MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],

+                      %[pPred], %[iStride], %[iPredStride]) /* rows 2-3, values 0..3 */

+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"

+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"

+    MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],

+                      %[pPred], %[iStride], %[iPredStride]) /* rows 4-5, values 4..7 */

+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"

+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"

+    MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],

+                      %[pPred], %[iStride], %[iPredStride]) /* rows 6-7, values 4..7 */

+    MMI_Load8DC($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f24, %[pDct], 0x10, $f30) /* values 8..15 */

+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"

+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"

+    MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],

+                      %[pPred], %[iStride], %[iPredStride]) /* rows 8-9, values 8..11 */

+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"

+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"

+    MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],

+                      %[pPred], %[iStride], %[iPredStride]) /* rows 10-11, values 8..11 */

+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"

+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"

+    MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],

+                      %[pPred], %[iStride], %[iPredStride]) /* rows 12-13, values 12..15 */

+    PTR_ADDU   "%[pRec], %[pRec], %[iStride]       \n\t"

+    PTR_ADDU   "%[pPred], %[pPred], %[iPredStride] \n\t"

+    MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],

+                      %[pPred], %[iStride], %[iPredStride]) /* rows 14-15, values 12..15 */

+    : [pRec]"+&r"((uint8_t *)pRec), [pPred]"+&r"((uint8_t *)pPred),

+      [pDct]"+&r"((short *)pDct) /* NOTE(review): declared read-write, but this block never writes pDct */

+    : [iStride]"r"((int)iStride), [iPredStride]"r"((int)iPredStride)

+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",

+      "$f14", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30" /* NOTE(review): $f26 is listed but not referenced in this block */

+  );

+  RECOVER_REG;

+}

+void WelsHadamardT4Dc_mmi( int16_t *luma_dc, int16_t *pDct) { /* Gathers 16 halfwords from pDct (one every 0x20 bytes), runs two butterfly passes in 32-bit words with a rounded halving step, and stores 16 saturated halfwords to luma_dc. */

+  BACKUP_REG; /* BACKUP_REG / RECOVER_REG are defined outside this file (presumably asmdefs_mmi.h) */

+  __asm__ volatile (

+    ".set       arch=loongson3a                 \n\t"

+    MMI_Load4Col($f4, $f6, $f20, $f24, $f0, %[pDct], 0x0) /* the four calls together read byte offsets 0x0, 0x20, ..., 0x1e0 */

+    MMI_Load4Col($f8, $f10, $f20, $f24, $f0, %[pDct], 0x40)

+    MMI_Load4Col($f12, $f14, $f20, $f24, $f0, %[pDct], 0x100)

+    MMI_Load4Col($f16, $f18, $f20, $f24, $f0, %[pDct], 0x140)

+    MMI_SumSubD($f4, $f6, $f8, $f10, $f28, $f30) /* first pass: sum/difference butterflies; ($f28,$f30) is scratch */

+    MMI_SumSubD($f12, $f14, $f16, $f18, $f28, $f30)

+    MMI_SumSubD($f8, $f10, $f16, $f18, $f28, $f30)

+    MMI_SumSubD($f4, $f6, $f12, $f14, $f28, $f30)

+    MMI_Trans4x4W($f16, $f18, $f8, $f10, $f4, $f6, $f12, $f14, $f20, $f22)

+    MMI_SumSubD($f16, $f18, $f12, $f14, $f28, $f30) /* second pass */

+    MMI_SumSubD($f20, $f22, $f4, $f6, $f28, $f30)

+    "dli        $8, 0x1F                        \n\t" /* shift count 31, so WELS_DD1 leaves 1 in every word of $f24/$f26 */

+    "dmtc1      $8, $f30                        \n\t"

+    WELS_DD1($f24, $f26, $f30)

+    "dli        $8, 0x1                         \n\t" /* $f30 = 1: shift count for MMI_SumSubDiv2D, i.e. (a + b + 1) >> 1 */

+    "dmtc1      $8, $f30                        \n\t"

+    MMI_SumSubDiv2D($f12, $f14, $f4, $f6, $f24, $f26, $f0, $f2, $f30)

+    MMI_SumSubDiv2D($f16, $f18, $f20, $f22, $f24, $f26, $f4, $f6, $f30)

+    MMI_Trans4x4W($f12, $f14, $f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10)

+    "packsswh   $f12, $f12, $f14                \n\t" /* narrow 32-bit words to halfwords with signed saturation */

+    "packsswh   $f14, $f16, $f18                \n\t"

+    "packsswh   $f8, $f8, $f10                  \n\t"

+    "packsswh   $f10, $f4, $f6                  \n\t"

+    "gssqc1     $f14, $f12, 0x0(%[luma_dc])     \n\t" /* two quadword stores: 0x20 bytes written to luma_dc */

+    "gssqc1     $f10, $f8, 0x10(%[luma_dc])     \n\t"

+   :

+   : [luma_dc]"r"((short *)luma_dc), [pDct]"r"((short *)pDct)

+   : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",

+     "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"

+  );

+  RECOVER_REG;

+}

+void WelsDctT4_mmi(int16_t *pDct, uint8_t *pix1, int32_t i_pix1,

+                   uint8_t *pix2, int32_t i_pix2 ) { /* Takes the byte-wise difference pix1 - pix2 over 4 rows (4 bytes used per row), applies two MMI_DCT_SINGLE passes with transposes, and stores 16 halfwords to pDct. */

+  __asm__ volatile ( /* MMI_Trans4x4H_SINGLE, MMI_SumSub_SINGLE and MMI_SumSubMul2_SINGLE are defined outside this file */

+    ".set       arch=loongson3a                 \n\t"

+    "xor        $f14, $f14, $f14                \n\t" /* $f14 = 0, used to widen bytes in the load macro */

+    "dli        $8, 0x1                         \n\t" /* $f16 = 1, passed to MMI_DCT_SINGLE */

+    "dmtc1      $8, $f16                        \n\t"

+    MMI_LoadDiff4x4P_SINGLE($f2, $f4, $f6, $f8, %[pix1], %[i_pix1],

+                            %[pix2], %[i_pix2], $f0, $f14) /* row differences land in $f2, $f4, $f6, $f8; both pointers are advanced by their strides */

+    MMI_DCT_SINGLE($f2, $f4, $f6, $f8, $f10, $f12, $f16)

+    MMI_Trans4x4H_SINGLE($f6, $f2, $f8, $f10, $f4)

+    MMI_DCT_SINGLE($f6, $f10, $f4, $f8, $f2, $f12, $f16)

+    MMI_Trans4x4H_SINGLE($f4, $f6, $f8, $f2, $f10)

+    "gssdlc1    $f4, 0x7(%[pDct])               \n\t" /* gssdlc1 + gssdrc1 pairs: four unaligned 8-byte stores, output order $f4, $f2, $f10, $f8 */

+    "gssdlc1    $f2, 0xF(%[pDct])               \n\t"

+    "gssdlc1    $f10, 0x17(%[pDct])             \n\t"

+    "gssdlc1    $f8, 0x1F(%[pDct])              \n\t"

+    "gssdrc1    $f4, 0x0(%[pDct])               \n\t"

+    "gssdrc1    $f2, 0x8(%[pDct])               \n\t"

+    "gssdrc1    $f10, 0x10(%[pDct])             \n\t"

+    "gssdrc1    $f8, 0x18(%[pDct])              \n\t"

+   : [pDct]"+&r"((short *)pDct), [pix1]"+&r"(pix1), [pix2]"+&r"(pix2)

+   : [i_pix1]"r"(i_pix1), [i_pix2]"r"(i_pix2)

+   : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",

+     "$f14", "$f16"

+  );

+}

+void WelsDctFourT4_mmi(int16_t *pDct, uint8_t *pix1, int32_t i_pix1,

+                       uint8_t *pix2, int32_t i_pix2 ) { /* Processes 8 rows in two groups of 4: each group is differenced with MMI_LoadDiff8P, run through two MMI_DCT passes with transposes, and stored as 0x40 bytes at pDct. */

+  BACKUP_REG; /* BACKUP_REG / RECOVER_REG are defined outside this file (presumably asmdefs_mmi.h) */

+  __asm__ volatile ( /* MMI_LoadDiff8P, MMI_TransTwo4x4H and MMI_SumSub are also defined outside this file */

+    ".set       arch=loongson3a                 \n\t"

+    "xor        $f28, $f28, $f28                \n\t" /* $f28 = 0, passed to every MMI_LoadDiff8P */

+    MMI_LoadDiff8P($f0, $f2, $f24, $f26, $f28, %[pix1], %[pix2])

+    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t" /* step both source pointers to the next row */

+    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"

+    MMI_LoadDiff8P($f4, $f6, $f24, $f26, $f28, %[pix1], %[pix2])

+    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t"

+    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"

+    MMI_LoadDiff8P($f8, $f10, $f24, $f26, $f28, %[pix1], %[pix2])

+    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t"

+    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"

+    MMI_LoadDiff8P($f12, $f14, $f24, $f26, $f28, %[pix1], %[pix2])

+    MMI_DCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f0, $f2)

+    MMI_TransTwo4x4H($f8, $f10, $f0, $f2, $f12, $f14, $f16, $f18, $f4, $f6)

+    MMI_DCT($f0, $f2, $f16, $f18, $f4, $f6, $f12, $f14, $f20, $f22, $f8, $f10)

+    MMI_TransTwo4x4H($f16, $f18, $f8, $f10, $f4, $f6, $f12, $f14, $f0, $f2)

+    MMI_Store4x8p(%[pDct], $f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22) /* first 0x40 bytes of output */

+    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t" /* second group of 4 rows starts here */

+    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"

+    MMI_LoadDiff8P($f0, $f2, $f24, $f26, $f28, %[pix1], %[pix2])

+    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t"

+    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"

+    MMI_LoadDiff8P($f4, $f6, $f24, $f26, $f28, %[pix1], %[pix2])

+    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t"

+    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"

+    MMI_LoadDiff8P($f8, $f10, $f24, $f26, $f28, %[pix1], %[pix2])

+    PTR_ADDU   "%[pix1], %[pix1], %[i_pix1]     \n\t"

+    PTR_ADDU   "%[pix2], %[pix2], %[i_pix2]     \n\t"

+    MMI_LoadDiff8P($f12, $f14, $f24, $f26, $f28, %[pix1], %[pix2])

+    MMI_DCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f0, $f2)

+    MMI_TransTwo4x4H($f8, $f10, $f0, $f2, $f12, $f14, $f16, $f18, $f4, $f6)

+    MMI_DCT($f0, $f2, $f16, $f18, $f4, $f6, $f12, $f14, $f20, $f22, $f8, $f10)

+    MMI_TransTwo4x4H($f16, $f18, $f8, $f10, $f4, $f6, $f12, $f14, $f0, $f2)

+    PTR_ADDIU  "%[pDct], %[pDct], 0x40          \n\t" /* second 0x40 bytes of output */

+    MMI_Store4x8p(%[pDct], $f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22)

+   : [pDct]"+&r"((short *)pDct), [pix1]"+&r"(pix1), [pix2]"+&r"(pix2) /* read-write: all three pointers are advanced inside the block */

+   : [i_pix1]"r"(i_pix1), [i_pix2]"r"(i_pix2)

+   : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",

+     "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28"

+  );

+  RECOVER_REG;

+}

--- /dev/null

+++ b/codec/encoder/core/mips/quant_mmi.c

@@ -1,0 +1,553 @@

+/*!

+ * \copy

+ *     Copyright (c)  2009-2018, Cisco Systems

+ *     All rights reserved.

+ *

+ *     Redistribution and use in source and binary forms, with or without

+ *     modification, are permitted provided that the following conditions

+ *     are met:

+ *

+ *        * Redistributions of source code must retain the above copyright

+ *          notice, this list of conditions and the following disclaimer.

+ *

+ *        * Redistributions in binary form must reproduce the above copyright

+ *          notice, this list of conditions and the following disclaimer in

+ *          the documentation and/or other materials provided with the

+ *          distribution.

+ *

+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS

+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER

+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN

+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

+ *     POSSIBILITY OF SUCH DAMAGE.

+ *

+ *

+ * \file    quant_mmi.c

+ *

+ * \brief   Loongson optimization

+ *

+ * \date    20/07/2018 Created

+ *

+ *************************************************************************************

+ */

+#include <stdint.h>

+#include "asmdefs_mmi.h"

+void WelsQuant4x4_mmi(int16_t *pDct, const int16_t* ff, const int16_t *mf) { /* Quantize the 32 bytes (16 int16 coefficients) at pDct in place, 8 lanes per step. */

+  __asm__ volatile ( /* Per 16-bit lane: |x| + ff (unsigned saturating), times mf keeping the high 16 bits, then the sign of x is reapplied. */

+    ".set       arch=loongson3a                 \n\t" /* let the assembler accept the Loongson-3A instructions used below */

+    "xor        $f10, $f10, $f10                \n\t" /* NOTE(review): $f10 is overwritten by the next load, so this clear looks redundant */

+    "gslqc1     $f10, $f8, 0x0(%[ff])           \n\t" /* $f8/$f10 = 16 bytes (8 lanes) read from ff, added to |x| below */

+    "gslqc1     $f14, $f12, 0x0(%[mf])          \n\t" /* $f12/$f14 = 16 bytes (8 lanes) read from mf, the multipliers */

+    "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* first 8 coefficients (bytes 0x00-0x0f) */

+    "xor        $f4, $f4, $f4                   \n\t" /* zero both mask registers */

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t" /* mask = all-ones in lanes where 0 > x, i.e. x is negative */

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t" /* (x ^ mask) - mask gives |x| in two's complement */

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t" /* add ff with unsigned saturation */

+    "paddush    $f2, $f2, $f10                  \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t" /* keep the high 16 bits of the unsigned product with mf */

+    "pmulhuh    $f2, $f2, $f14                  \n\t"

+    "xor        $f0, $f0, $f4                   \n\t" /* (q ^ mask) - mask reapplies the original sign */

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* write the quantized lanes back over the input */

+    "gslqc1     $f2, $f0, 0x10(%[pDct])         \n\t" /* second 8 coefficients (bytes 0x10-0x1f): same sequence, same ff/mf registers */

+    "xor        $f4, $f4, $f4                   \n\t"

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t"

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t"

+    "paddush    $f2, $f2, $f10                  \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t"

+    "pmulhuh    $f2, $f2, $f14                  \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x10(%[pDct])         \n\t"

+   : /* no output operands: results are written through pDct and covered by the memory clobber */

+   : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf)

+   : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"

+  );

+}

+void WelsQuant4x4Dc_mmi(int16_t *pDct, const int16_t ff, int16_t mf) { /* Quantize the 32 bytes (16 int16 coefficients) at pDct in place with one scalar ff and one scalar mf shared by every lane. */

+  __asm__ volatile ( /* Per 16-bit lane: |x| + ff (unsigned saturating), times mf keeping the high 16 bits, then the sign of x is reapplied. */

+    ".set       arch=loongson3a                 \n\t" /* let the assembler accept the Loongson-3A instructions used below */

+    "xor        $f10, $f10, $f10                \n\t" /* $f10 = 0, used as the pshufh selector below */

+    "dmtc1      %[mf], $f12                     \n\t" /* move the scalar mf into $f12 */

+    "pshufh     $f12, $f12, $f10                \n\t" /* zero selector: every lane takes halfword 0, so mf is replicated across $f12 */

+    "dmtc1      %[ff], $f8                      \n\t" /* move the scalar ff into $f8 */

+    "pshufh     $f8, $f8, $f10                  \n\t" /* replicate ff across $f8 the same way */

+    "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* first 8 coefficients (bytes 0x00-0x0f) */

+    "xor        $f4, $f4, $f4                   \n\t" /* zero both mask registers */

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t" /* mask = all-ones in lanes where 0 > x, i.e. x is negative */

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t" /* (x ^ mask) - mask gives |x| in two's complement */

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t" /* add the replicated ff with unsigned saturation (same $f8 for both halves) */

+    "paddush    $f2, $f2, $f8                   \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t" /* keep the high 16 bits of the unsigned product with the replicated mf */

+    "pmulhuh    $f2, $f2, $f12                  \n\t"

+    "xor        $f0, $f0, $f4                   \n\t" /* (q ^ mask) - mask reapplies the original sign */

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* write the quantized lanes back over the input */

+    "gslqc1     $f2, $f0, 0x10(%[pDct])         \n\t" /* second 8 coefficients (bytes 0x10-0x1f): same sequence */

+    "xor        $f4, $f4, $f4                   \n\t"

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t"

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t"

+    "paddush    $f2, $f2, $f8                   \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t"

+    "pmulhuh    $f2, $f2, $f12                  \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x10(%[pDct])         \n\t"

+   : /* no output operands: results are written through pDct and covered by the memory clobber */

+   : [pDct]"r"((short *)pDct), [ff]"r"((short)ff), [mf]"r"((short)mf)

+   : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12"

+  );

+}

+void WelsQuantFour4x4_mmi(int16_t *pDct, const int16_t* ff, const int16_t *mf) { /* Quantize the 128 bytes (64 int16 coefficients) at pDct in place, as eight 16-byte chunks at offsets 0x00-0x70. */

+  __asm__ volatile ( /* Per 16-bit lane: |x| + ff (unsigned saturating), times mf keeping the high 16 bits, then the sign of x is reapplied. */

+    ".set       arch=loongson3a                 \n\t" /* let the assembler accept the Loongson-3A instructions used below */

+    "gslqc1     $f10, $f8, 0x0(%[ff])           \n\t" /* $f8/$f10 = 16 bytes (8 lanes) read from ff; reused for every chunk */

+    "gslqc1     $f14, $f12, 0x0(%[mf])          \n\t" /* $f12/$f14 = 16 bytes (8 lanes) read from mf; reused for every chunk */

+    "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* chunk 0 (bytes 0x00-0x0f) */

+    "xor        $f4, $f4, $f4                   \n\t" /* zero both mask registers */

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t" /* mask = all-ones in lanes where 0 > x, i.e. x is negative */

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t" /* (x ^ mask) - mask gives |x| in two's complement */

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t" /* add ff with unsigned saturation */

+    "paddush    $f2, $f2, $f10                  \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t" /* keep the high 16 bits of the unsigned product with mf */

+    "pmulhuh    $f2, $f2, $f14                  \n\t"

+    "xor        $f0, $f0, $f4                   \n\t" /* (q ^ mask) - mask reapplies the original sign */

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* write the quantized lanes back over the input */

+    "gslqc1     $f2, $f0, 0x10(%[pDct])         \n\t" /* chunk 1 (bytes 0x10-0x1f): same sequence */

+    "xor        $f4, $f4, $f4                   \n\t"

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t"

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t"

+    "paddush    $f2, $f2, $f10                  \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t"

+    "pmulhuh    $f2, $f2, $f14                  \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x10(%[pDct])         \n\t"

+    "gslqc1     $f2, $f0, 0x20(%[pDct])         \n\t" /* chunk 2 (bytes 0x20-0x2f) */

+    "xor        $f4, $f4, $f4                   \n\t"

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t"

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t"

+    "paddush    $f2, $f2, $f10                  \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t"

+    "pmulhuh    $f2, $f2, $f14                  \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x20(%[pDct])         \n\t"

+    "gslqc1     $f2, $f0, 0x30(%[pDct])         \n\t" /* chunk 3 (bytes 0x30-0x3f) */

+    "xor        $f4, $f4, $f4                   \n\t"

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t"

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t"

+    "paddush    $f2, $f2, $f10                  \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t"

+    "pmulhuh    $f2, $f2, $f14                  \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x30(%[pDct])         \n\t"

+    "gslqc1     $f2, $f0, 0x40(%[pDct])         \n\t" /* chunk 4 (bytes 0x40-0x4f) */

+    "xor        $f4, $f4, $f4                   \n\t"

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t"

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t"

+    "paddush    $f2, $f2, $f10                  \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t"

+    "pmulhuh    $f2, $f2, $f14                  \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x40(%[pDct])         \n\t"

+    "gslqc1     $f2, $f0, 0x50(%[pDct])         \n\t" /* chunk 5 (bytes 0x50-0x5f) */

+    "xor        $f4, $f4, $f4                   \n\t"

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t"

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t"

+    "paddush    $f2, $f2, $f10                  \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t"

+    "pmulhuh    $f2, $f2, $f14                  \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x50(%[pDct])         \n\t"

+    "gslqc1     $f2, $f0, 0x60(%[pDct])         \n\t" /* chunk 6 (bytes 0x60-0x6f) */

+    "xor        $f4, $f4, $f4                   \n\t"

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t"

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t"

+    "paddush    $f2, $f2, $f10                  \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t"

+    "pmulhuh    $f2, $f2, $f14                  \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x60(%[pDct])         \n\t"

+    "gslqc1     $f2, $f0, 0x70(%[pDct])         \n\t" /* chunk 7 (bytes 0x70-0x7f), the last one */

+    "xor        $f4, $f4, $f4                   \n\t"

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t"

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t"

+    "paddush    $f2, $f2, $f10                  \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t"

+    "pmulhuh    $f2, $f2, $f14                  \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x70(%[pDct])         \n\t"

+   : /* no output operands: results are written through pDct and covered by the memory clobber */

+   : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf)

+   : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"

+  );

+}

+void WelsQuantFour4x4Max_mmi(int16_t *pDct, const int16_t*ff, /* Quantize the 128 bytes (64 int16 coefficients) at pDct in place and report maxima of the quantized magnitudes. */

+                             const int16_t *mf, int16_t *max) { /* 8 bytes (four int16) are written to max: one maximum per 32-byte group of pDct; lane order presumably follows group order - confirm against the C reference. */

+  BACKUP_REG; /* macro defined outside this chunk; presumably saves the FP registers that the asm below overwrites - confirm in asmdefs_mmi.h */

+  __asm__ volatile ( /* Per 16-bit lane: |x| + ff (unsigned saturating), times mf keeping the high 16 bits; the maximum is tracked before the sign of x is reapplied. */

+    ".set       arch=loongson3a                 \n\t" /* let the assembler accept the Loongson-3A instructions used below */

+    "gslqc1     $f10, $f8, 0x0(%[ff])           \n\t" /* $f8/$f10 = 16 bytes (8 lanes) read from ff; reused for every chunk */

+    "gslqc1     $f14, $f12, 0x0(%[mf])          \n\t" /* $f12/$f14 = 16 bytes (8 lanes) read from mf; reused for every chunk */

+    "xor        $f16, $f16, $f16                \n\t" /* clear the four maximum accumulator pairs: $f16/$f18, $f20/$f22, $f24/$f26, $f28/$f30 */

+    "xor        $f18, $f18, $f18                \n\t"

+    "xor        $f20, $f20, $f20                \n\t"

+    "xor        $f22, $f22, $f22                \n\t"

+    "xor        $f24, $f24, $f24                \n\t"

+    "xor        $f26, $f26, $f26                \n\t"

+    "xor        $f28, $f28, $f28                \n\t"

+    "xor        $f30, $f30, $f30                \n\t"

+    "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* chunk 0 (bytes 0x00-0x0f) */

+    "xor        $f4, $f4, $f4                   \n\t" /* zero both mask registers */

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t" /* mask = all-ones in lanes where 0 > x, i.e. x is negative */

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t" /* (x ^ mask) - mask gives |x| in two's complement */

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t" /* add ff with unsigned saturation */

+    "paddush    $f2, $f2, $f10                  \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t" /* keep the high 16 bits of the unsigned product with mf */

+    "pmulhuh    $f2, $f2, $f14                  \n\t"

+    "pmaxsh     $f16, $f16, $f0                 \n\t" /* fold the still-unsigned quantized magnitudes into accumulator $f16/$f18 (chunks 0 and 1) */

+    "pmaxsh     $f18, $f18, $f2                 \n\t"

+    "xor        $f0, $f0, $f4                   \n\t" /* (q ^ mask) - mask reapplies the original sign */

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* write the quantized lanes back over the input */

+    "gslqc1     $f2, $f0, 0x10(%[pDct])         \n\t" /* chunk 1 (bytes 0x10-0x1f): same sequence, same accumulator */

+    "xor        $f4, $f4, $f4                   \n\t"

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t"

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t"

+    "paddush    $f2, $f2, $f10                  \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t"

+    "pmulhuh    $f2, $f2, $f14                  \n\t"

+    "pmaxsh     $f16, $f16, $f0                 \n\t"

+    "pmaxsh     $f18, $f18, $f2                 \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x10(%[pDct])         \n\t"

+    "gslqc1     $f2, $f0, 0x20(%[pDct])         \n\t" /* chunk 2 (bytes 0x20-0x2f) */

+    "xor        $f4, $f4, $f4                   \n\t"

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t"

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t"

+    "paddush    $f2, $f2, $f10                  \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t"

+    "pmulhuh    $f2, $f2, $f14                  \n\t"

+    "pmaxsh     $f20, $f20, $f0                 \n\t" /* chunks 2 and 3 accumulate into $f20/$f22 */

+    "pmaxsh     $f22, $f22, $f2                 \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x20(%[pDct])         \n\t"

+    "gslqc1     $f2, $f0, 0x30(%[pDct])         \n\t" /* chunk 3 (bytes 0x30-0x3f) */

+    "xor        $f4, $f4, $f4                   \n\t"

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t"

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t"

+    "paddush    $f2, $f2, $f10                  \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t"

+    "pmulhuh    $f2, $f2, $f14                  \n\t"

+    "pmaxsh     $f20, $f20, $f0                 \n\t"

+    "pmaxsh     $f22, $f22, $f2                 \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x30(%[pDct])         \n\t"

+    "gslqc1     $f2, $f0, 0x40(%[pDct])         \n\t" /* chunk 4 (bytes 0x40-0x4f) */

+    "xor        $f4, $f4, $f4                   \n\t"

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t"

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t"

+    "paddush    $f2, $f2, $f10                  \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t"

+    "pmulhuh    $f2, $f2, $f14                  \n\t"

+    "pmaxsh     $f24, $f24, $f0                 \n\t" /* chunks 4 and 5 accumulate into $f24/$f26 */

+    "pmaxsh     $f26, $f26, $f2                 \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x40(%[pDct])         \n\t"

+    "gslqc1     $f2, $f0, 0x50(%[pDct])         \n\t" /* chunk 5 (bytes 0x50-0x5f) */

+    "xor        $f4, $f4, $f4                   \n\t"

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t"

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t"

+    "paddush    $f2, $f2, $f10                  \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t"

+    "pmulhuh    $f2, $f2, $f14                  \n\t"

+    "pmaxsh     $f24, $f24, $f0                 \n\t"

+    "pmaxsh     $f26, $f26, $f2                 \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x50(%[pDct])         \n\t"

+    "gslqc1     $f2, $f0, 0x60(%[pDct])         \n\t" /* chunk 6 (bytes 0x60-0x6f) */

+    "xor        $f4, $f4, $f4                   \n\t"

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t"

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t"

+    "paddush    $f2, $f2, $f10                  \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t"

+    "pmulhuh    $f2, $f2, $f14                  \n\t"

+    "pmaxsh     $f28, $f28, $f0                 \n\t" /* chunks 6 and 7 accumulate into $f28/$f30 */

+    "pmaxsh     $f30, $f30, $f2                 \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x60(%[pDct])         \n\t"

+    "gslqc1     $f2, $f0, 0x70(%[pDct])         \n\t" /* chunk 7 (bytes 0x70-0x7f), the last one */

+    "xor        $f4, $f4, $f4                   \n\t"

+    "xor        $f6, $f6, $f6                   \n\t"

+    "pcmpgth    $f4, $f4, $f0                   \n\t"

+    "pcmpgth    $f6, $f6, $f2                   \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "paddush    $f0, $f0, $f8                   \n\t"

+    "paddush    $f2, $f2, $f10                  \n\t"

+    "pmulhuh    $f0, $f0, $f12                  \n\t"

+    "pmulhuh    $f2, $f2, $f14                  \n\t"

+    "pmaxsh     $f28, $f28, $f0                 \n\t"

+    "pmaxsh     $f30, $f30, $f2                 \n\t"

+    "xor        $f0, $f0, $f4                   \n\t"

+    "xor        $f2, $f2, $f6                   \n\t"

+    "psubh      $f0, $f0, $f4                   \n\t"

+    "psubh      $f2, $f2, $f6                   \n\t"

+    "gssqc1     $f2, $f0, 0x70(%[pDct])         \n\t"

+    "mov.d      $f0, $f18                       \n\t" /* reduction starts: halfword interleave of accumulators 0 and 1 (results in $f16/$f18 and $f0/$f2) */

+    "punpckhhw  $f18, $f16, $f20                \n\t"

+    "punpcklhw  $f16, $f16, $f20                \n\t"

+    "punpckhhw  $f2, $f0, $f22                  \n\t"

+    "punpcklhw  $f0, $f0, $f22                  \n\t"

+    "mov.d      $f20, $f26                      \n\t" /* same halfword interleave for accumulators 2 and 3 (results in $f24/$f26 and $f20/$f22) */

+    "punpckhhw  $f26, $f24, $f28                \n\t"

+    "punpcklhw  $f24, $f24, $f28                \n\t"

+    "punpckhhw  $f22, $f20, $f30                \n\t"

+    "punpcklhw  $f20, $f20, $f30                \n\t"

+    "mov.d      $f28, $f18                      \n\t" /* word interleave: afterwards each 64-bit register appears to hold one lane from each of the four accumulators */

+    "punpckhwd  $f18, $f16, $f24                \n\t"

+    "punpcklwd  $f16, $f16, $f24                \n\t"

+    "punpckhwd  $f30, $f28, $f26                \n\t"

+    "punpcklwd  $f28, $f28, $f26                \n\t"

+    "mov.d      $f24, $f2                       \n\t"

+    "punpckhwd  $f2, $f0, $f20                  \n\t"

+    "punpcklwd  $f0, $f0, $f20                  \n\t"

+    "punpckhwd  $f26, $f24, $f22                \n\t"

+    "punpcklwd  $f24, $f24, $f22                \n\t"

+    "mov.d      $f20, $f18                      \n\t" /* shuffle the eight transposed rows into the register pairs that the pmaxsh chain below reads */

+    "mov.d      $f18, $f0                       \n\t"

+    "mov.d      $f22, $f2                       \n\t"

+    "mov.d      $f0, $f30                       \n\t"

+    "mov.d      $f30, $f24                      \n\t"

+    "mov.d      $f2, $f26                       \n\t"

+    "pmaxsh     $f0, $f0, $f16                  \n\t" /* signed max across rows: $f0 and $f2 each fold four of the eight rows */

+    "pmaxsh     $f2, $f2, $f18                  \n\t"

+    "pmaxsh     $f0, $f0, $f20                  \n\t"

+    "pmaxsh     $f2, $f2, $f22                  \n\t"

+    "pmaxsh     $f0, $f0, $f28                  \n\t"

+    "pmaxsh     $f2, $f2, $f30                  \n\t"

+    "mov.d      $f4, $f0                        \n\t" /* combine the two halves: after the pmaxsh below $f0 = max(old $f0, old $f2) */

+    "mov.d      $f6, $f2                        \n\t"

+    "mov.d      $f0, $f2                        \n\t"

+    "mov.d      $f2, $f6                        \n\t"

+    "pmaxsh     $f0, $f0, $f4                   \n\t"

+    "pmaxsh     $f2, $f2, $f6                   \n\t"

+    "gssdlc1    $f0, 0x7(%[max])                \n\t" /* store-left/store-right pair writes the 8 bytes of $f0 to max; presumably tolerant of unaligned max - confirm */

+    "gssdrc1    $f0, 0x0(%[max])                \n\t"

+   : /* no output operands: results are written through pDct and max and covered by the memory clobber */

+   : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf),

+     [max]"r"((short *)max)

+   : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14",

+     "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"

+  );

+  RECOVER_REG; /* counterpart of BACKUP_REG above; presumably restores the saved FP registers - confirm in asmdefs_mmi.h */

+}

--- /dev/null

+++ b/codec/encoder/core/mips/score_mmi.c

@@ -1,0 +1,324 @@

+/*!

+ * \copy

+ *     Copyright (c)  2009-2018, Cisco Systems

+ *     All rights reserved.

+ *

+ *     Redistribution and use in source and binary forms, with or without

+ *     modification, are permitted provided that the following conditions

+ *     are met:

+ *

+ *        * Redistributions of source code must retain the above copyright

+ *          notice, this list of conditions and the following disclaimer.

+ *

+ *        * Redistributions in binary form must reproduce the above copyright

+ *          notice, this list of conditions and the following disclaimer in

+ *          the documentation and/or other materials provided with the

+ *          distribution.

+ *

+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS

+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER

+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN

+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

+ *     POSSIBILITY OF SUCH DAMAGE.

+ *

+ *

+ * \file    score_mmi.c

+ *

+ * \brief   Loongson optimization

+ *

+ * \date    21/07/2018 Created

+ *

+ *************************************************************************************

+ */

+#include <stdint.h>

+#include "asmdefs_mmi.h"

+unsigned char nozero_count_table[] __attribute__((aligned(16))) = { /* Bit-count lookup table:
+ * entry i holds the number of 1 bits in the 8-bit value i (256 entries, values 0..8).
+ * WelsGetNoneZeroCount_mmi indexes it with an 8-bit mask, one lookup per mask byte. */

+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4,

+    2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4,

+    2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,

+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,

+    4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,

+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,

+    3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,

+    4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,

+    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};

+int32_t WelsGetNoneZeroCount_mmi(int16_t *level) { /* Counts the non-zero values among the
+ * 16 int16_t coefficients read from level (two 16-byte loads at offsets 0x0 and 0x10).
+ * Returns the sum of two nozero_count_table lookups, one per group of 8 coefficients.
+ * NOTE(review): the per-instruction comments assume the Loongson MMI ops behave like the
+ * like-named x86 MMX/SSE instructions -- confirm against the Loongson manual.
+ * NOTE(review): gslqc1 presumably needs level to be 16-byte aligned -- confirm. */

+  int ret_val = 0;

+  __asm__ volatile(

+    ".set       arch=loongson3a                 \n\t"

+    "gslqc1     $f2, $f0, 0x0(%[level])         \n\t" /* $f0,$f2 <- level[0..7] */

+    "gslqc1     $f6, $f4, 0x10(%[level])        \n\t" /* $f4,$f6 <- level[8..15] */

+    "xor        $f8, $f8, $f8                   \n\t" /* $f8 = 0, the comparison operand */

+    "pcmpeqh    $f0, $f0, $f8                   \n\t" /* each 16-bit lane: all ones if it was zero */

+    "pcmpeqh    $f2, $f2, $f8                   \n\t"

+    "pcmpeqh    $f4, $f4, $f8                   \n\t"

+    "pcmpeqh    $f6, $f6, $f8                   \n\t"

+    "packsshb   $f4, $f4, $f6                   \n\t" /* narrow the lane flags of level[8..15] to bytes */

+    "packsshb   $f6, $f0, $f2                   \n\t" /* narrow the lane flags of level[0..7] to bytes */

+    "pmovmskb   $f0, $f4                        \n\t" /* one bit per byte: "is zero" mask */

+    "pmovmskb   $f2, $f6                        \n\t"

+    "dmfc1      $8, $f0                         \n\t" /* move both masks to integer registers */

+    "dmfc1      $9, $f2                         \n\t"

+    "xor        $8, 0xFF                        \n\t" /* invert: set bit now means non-zero */

+    "xor        $9, 0xFF                        \n\t"

+    PTR_ADDU   "$10, $8, %[nozero_count_table]  \n\t" /* address of table[mask] */

+    "lbu        $8, 0x0($10)                    \n\t" /* number of set bits in the first mask */

+    PTR_ADDU   "$10, $9, %[nozero_count_table]  \n\t"

+    "lbu        $9, 0x0($10)                    \n\t" /* number of set bits in the second mask */

+    PTR_ADDU   "%[ret_val], $8, $9              \n\t" /* result = sum of both counts */

+    : [ret_val] "=r"((int)ret_val)

+    : [level] "r"((unsigned char *)level),

+      [nozero_count_table] "r"((unsigned char *)nozero_count_table)

+    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8"

+  );

+  return ret_val;

+}

+void WelsScan4x4DcAc_mmi(int16_t level[16], int16_t *pDct) { /* Reorders the 16 int16_t
+ * coefficients read from pDct and stores all 16 of them to level.
+ * Writing c[k] for pDct[k], the stored order is
+ *   c0 c1 c4 c8  c5 c2 c3 c6  c9 c12 c13 c10  c7 c11 c14 c15
+ * i.e. the 4x4 zig-zag scan order.
+ * NOTE(review): this order was derived assuming pextrh/pinsrh_N/punpck?wd/pshufh behave
+ * like the like-named x86 MMX/SSE instructions -- confirm against the Loongson manual.
+ * NOTE(review): BACKUP_REG / RECOVER_REG come from asmdefs_mmi.h; presumably they save
+ * and restore floating-point registers around the asm -- confirm. */

+  BACKUP_REG;

+  __asm__ volatile(

+    ".set       arch=loongson3a                 \n\t"

+    "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* $f0 = c0..c3, $f2 = c4..c7 */

+    "gslqc1     $f6, $f4, 0x10(%[pDct])         \n\t" /* $f4 = c8..c11, $f6 = c12..c15 */

+    "dli        $8, 0x3                         \n\t" /* lane selectors 3, 2, 1, 0 for pextrh */

+    "dmtc1      $8, $f22                        \n\t"

+    "dli        $8, 0x2                         \n\t"

+    "dmtc1      $8, $f24                        \n\t"

+    "dli        $8, 0x1                         \n\t"

+    "dmtc1      $8, $f26                        \n\t"

+    "dmtc1      $0, $f28                        \n\t"

+    "pextrh     $f18, $f2, $f22                 \n\t" /* c7 */

+    "pextrh     $f20, $f4, $f24                 \n\t" /* c10 */

+    "pextrh     $f16, $f2, $f26                 \n\t" /* c5 */

+    "pinsrh_2   $f4, $f4, $f18                  \n\t" /* $f4 = c8 c9 c7 c11 */

+    "pinsrh_3   $f2, $f2, $f16                  \n\t" /* $f2 = c4 c5 c6 c5 */

+    "pextrh     $f18, $f4, $f28                 \n\t" /* c8 */

+    "pinsrh_1   $f2, $f2, $f18                  \n\t" /* $f2 = c4 c8 c6 c5 */

+    "pinsrh_0   $f4, $f4, $f20                  \n\t" /* $f4 = c10 c9 c7 c11 */

+    "dli        $8, 0x93                        \n\t" /* pshufh control: rotate lanes one way */

+    "dmtc1      $8, $f22                        \n\t"

+    "dli        $8, 0x39                        \n\t" /* pshufh control: rotate lanes the other way */

+    "dmtc1      $8, $f24                        \n\t"

+    "punpckhwd  $f10, $f0, $f2                  \n\t" /* $f10 = c2 c3 c6 c5 */

+    "punpcklwd  $f8, $f0, $f2                   \n\t" /* $f8  = c0 c1 c4 c8 */

+    "punpckhwd  $f14, $f4, $f6                  \n\t" /* $f14 = c7 c11 c14 c15 */

+    "punpcklwd  $f12, $f4, $f6                  \n\t" /* $f12 = c10 c9 c12 c13 */

+    "mov.d      $f0, $f8                        \n\t"

+    "pshufh     $f2, $f10, $f22                 \n\t" /* $f2 = c5 c2 c3 c6 */

+    "pshufh     $f4, $f12, $f24                 \n\t" /* $f4 = c9 c12 c13 c10 */

+    "mov.d      $f6, $f14                       \n\t"

+    "gssqc1     $f2, $f0, 0x0(%[level])         \n\t" /* store level[0..7] */

+    "gssqc1     $f6, $f4, 0x10(%[level])        \n\t" /* store level[8..15] */

+    :

+    : [level] "r"((short *)level), [pDct] "r"((short *)pDct)

+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",

+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28"

+  );

+  RECOVER_REG;

+}

+void WelsScan4x4Ac_mmi(int16_t *zig_value, int16_t *pDct) { /* Reorders the coefficients
+ * read from pDct (16 int16_t) and stores 16 int16_t to zig_value, dropping pDct[0].
+ * Writing c[k] for pDct[k], the stored order is
+ *   c1 c4 c8 c5  c2 c3 c6 c9  c12 c13 c10 c7  c11 c14 c15 0
+ * i.e. the 4x4 zig-zag order without its first (DC) entry, padded with a zero.
+ * NOTE(review): this order was derived assuming the MMI ops behave like the like-named
+ * x86 MMX/SSE instructions -- confirm against the Loongson manual.
+ * NOTE(review): BACKUP_REG / RECOVER_REG come from asmdefs_mmi.h; presumably they save
+ * and restore floating-point registers around the asm -- confirm. */

+  BACKUP_REG;

+  __asm__ volatile(

+    ".set       arch=loongson3a                 \n\t"

+    "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* $f0 = c0..c3, $f2 = c4..c7 */

+    "gslqc1     $f6, $f4, 0x10(%[pDct])         \n\t" /* $f4 = c8..c11, $f6 = c12..c15 */

+    "mov.d      $f8, $f2                        \n\t"

+    "mov.d      $f2, $f4                        \n\t"

+    "mov.d      $f10, $f6                       \n\t"

+    "mov.d      $f12, $f2                       \n\t"

+    "punpckhwd  $f2, $f0, $f8                   \n\t" /* $f2  = c2 c3 c6 c7 */

+    "punpcklwd  $f0, $f0, $f8                   \n\t" /* $f0  = c0 c1 c4 c5 */

+    "punpckhwd  $f14, $f12, $f10                \n\t" /* $f14 = c10 c11 c14 c15 */

+    "punpcklwd  $f12, $f12, $f10                \n\t" /* $f12 = c8 c9 c12 c13 */

+    "dmtc1      $0, $f20                        \n\t" /* lane selector 0 */

+    "dli        $8, 0x10                        \n\t" /* shift count 16 = one 16-bit lane */

+    "dmtc1      $8, $f22                        \n\t"

+    "dli        $8, 0x30                        \n\t" /* shift count 48 = three 16-bit lanes */

+    "dmtc1      $8, $f24                        \n\t"

+    "dli        $8, 0x3                         \n\t" /* lane selector 3 */

+    "dmtc1      $8, $f26                        \n\t"

+    "dli        $8, 0x93                        \n\t" /* pshufh controls */

+    "dmtc1      $8, $f28                        \n\t"

+    "dli        $8, 0x39                        \n\t"

+    "dmtc1      $8, $f30                        \n\t"

+    "pextrh     $f16, $f0, $f26                 \n\t" /* c5 */

+    "pextrh     $f18, $f2, $f26                 \n\t" /* c7 */

+    "pinsrh_3   $f2, $f2, $f16                  \n\t" /* $f2  = c2 c3 c6 c5 */

+    "pextrh     $f16, $f14, $f20                \n\t" /* c10 */

+    "pinsrh_0   $f14, $f14, $f18                \n\t" /* $f14 = c7 c11 c14 c15 */

+    "pextrh     $f18, $f12, $f20                \n\t" /* c8 */

+    "pinsrh_0   $f12, $f12, $f16                \n\t" /* $f12 = c10 c9 c12 c13 */

+    "pinsrh_3   $f0, $f0, $f18                  \n\t" /* $f0  = c0 c1 c4 c8 */

+    "mov.d      $f4, $f0                        \n\t"

+    "pshufh     $f6, $f2, $f28                  \n\t" /* $f6 = c5 c2 c3 c6 */

+    "pshufh     $f8, $f12, $f30                 \n\t" /* $f8 = c9 c12 c13 c10 */

+    "mov.d      $f10, $f14                      \n\t"

+    "mov.d      $f12, $f8                       \n\t"

+    "mov.d      $f14, $f10                      \n\t"

+    "dsrl       $f4, $f4, $f22                  \n\t" /* drop c0: $f4 = c1 c4 c8 0 */

+    "pinsrh_3   $f4, $f4, $f6                   \n\t" /* $f4 = c1 c4 c8 c5 */

+    "dsrl       $f6, $f6, $f22                  \n\t" /* $f6 = c2 c3 c6 0 */

+    "dsll       $f14, $f12, $f24                \n\t" /* $f14 = 0 0 0 c9 */

+    "xor        $f12, $f12, $f12                \n\t" /* $f12 = 0 */

+    "or         $f4, $f4, $f12                  \n\t" /* OR with zero: $f4 unchanged */

+    "or         $f6, $f6, $f14                  \n\t" /* $f6 = c2 c3 c6 c9 */

+    "dsrl       $f8, $f8, $f22                  \n\t" /* $f8 = c12 c13 c10 0 */

+    "pinsrh_3   $f8, $f8, $f10                  \n\t" /* $f8 = c12 c13 c10 c7 */

+    "dsrl       $f10, $f10, $f22                \n\t" /* $f10 = c11 c14 c15 0 */

+    "gssqc1     $f6, $f4, 0x0(%[zig_value])     \n\t" /* store zig_value[0..7] */

+    "gssqc1     $f10, $f8, 0x10(%[zig_value])   \n\t" /* store zig_value[8..15] */

+    :

+    : [zig_value] "r"((short *)zig_value), [pDct] "r"((short *)pDct)

+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",

+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"

+  );

+  RECOVER_REG;

+}

+unsigned char i_ds_table[]__attribute__((aligned(16))) = { /* 16-entry score table read by
+ * WelsCalculateSingleCtr4x4_mmi. Its index is the bit-position gap computed there between
+ * the two scans of the non-zero mask (range 0..15); gaps of 6 or more score 0. */

+      3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

+unsigned char high_mask_table[]__attribute__((aligned(16))) = { /* 256-entry score table read
+ * by WelsCalculateSingleCtr4x4_mmi, indexed by the HIGH byte (bits 8..15) of its 16-bit
+ * non-zero-coefficient mask.
+ * NOTE(review): presumably a precomputed per-byte partial score; how the values were
+ * generated is not visible here -- confirm against the C reference implementation. */

+      0, 0, 0, 3, 0, 2, 3, 6, 0, 2,

+      2, 5, 3, 5, 6, 9, 0, 1, 2, 5,

+      2, 4, 5, 8, 3, 5, 5, 8, 6, 8,

+      9,12, 0, 1, 1, 4, 2, 4, 5, 8,

+      2, 4, 4, 7, 5, 7, 8,11, 3, 4,

+      5, 8, 5, 7, 8,11, 6, 8, 8,11,

+      9,11,12,15, 0, 1, 1, 4, 1, 3,

+      4, 7, 2, 4, 4, 7, 5, 7, 8,11,

+      2, 3, 4, 7, 4, 6, 7,10, 5, 7,

+      7,10, 8,10,11,14, 3, 4, 4, 7,

+      5, 7, 8,11, 5, 7, 7,10, 8,10,

+     11,14, 6, 7, 8,11, 8,10,11,14,

+      9,11,11,14,12,14,15,18, 0, 0,

+      1, 4, 1, 3, 4, 7, 1, 3, 3, 6,

+      4, 6, 7,10, 2, 3, 4, 7, 4, 6,

+      7,10, 5, 7, 7,10, 8,10,11,14,

+      2, 3, 3, 6, 4, 6, 7,10, 4, 6,

+      6, 9, 7, 9,10,13, 5, 6, 7,10,

+      7, 9,10,13, 8,10,10,13,11,13,

+     14,17, 3, 4, 4, 7, 4, 6, 7,10,

+      5, 7, 7,10, 8,10,11,14, 5, 6,

+      7,10, 7, 9,10,13, 8,10,10,13,

+     11,13,14,17, 6, 7, 7,10, 8,10,

+     11,14, 8,10,10,13,11,13,14,17,

+      9,10,11,14,11,13,14,17,12,14,

+     14,17,15,17,18,21};

+unsigned char low_mask_table[]__attribute__((aligned(16))) = { /* 256-entry score table read
+ * by WelsCalculateSingleCtr4x4_mmi, indexed by the LOW byte (bits 0..7) of its 16-bit
+ * non-zero-coefficient mask.
+ * NOTE(review): presumably a precomputed per-byte partial score; how the values were
+ * generated is not visible here -- confirm against the C reference implementation. */

+      0, 3, 2, 6, 2, 5, 5, 9, 1, 5,

+      4, 8, 5, 8, 8,12, 1, 4, 4, 8,

+      4, 7, 7,11, 4, 8, 7,11, 8,11,

+     11,15, 1, 4, 3, 7, 4, 7, 7,11,

+      3, 7, 6,10, 7,10,10,14, 4, 7,

+      7,11, 7,10,10,14, 7,11,10,14,

+     11,14,14,18, 0, 4, 3, 7, 3, 6,

+      6,10, 3, 7, 6,10, 7,10,10,14,

+      3, 6, 6,10, 6, 9, 9,13, 6,10,

+      9,13,10,13,13,17, 4, 7, 6,10,

+      7,10,10,14, 6,10, 9,13,10,13,

+     13,17, 7,10,10,14,10,13,13,17,

+     10,14,13,17,14,17,17,21, 0, 3,

+      3, 7, 3, 6, 6,10, 2, 6, 5, 9,

+      6, 9, 9,13, 3, 6, 6,10, 6, 9,

+      9,13, 6,10, 9,13,10,13,13,17,

+      3, 6, 5, 9, 6, 9, 9,13, 5, 9,

+      8,12, 9,12,12,16, 6, 9, 9,13,

+      9,12,12,16, 9,13,12,16,13,16,

+     16,20, 3, 7, 6,10, 6, 9, 9,13,

+      6,10, 9,13,10,13,13,17, 6, 9,

+      9,13, 9,12,12,16, 9,13,12,16,

+     13,16,16,20, 7,10, 9,13,10,13,

+     13,17, 9,13,12,16,13,16,16,20,

+     10,13,13,17,13,16,16,20,13,17,

+     16,20,17,20,20,24};

+/*
+ * Computes a score for the 16 int16_t coefficients at pDct from the pattern of
+ * non-zero entries only (the coefficient magnitudes are not used).
+ *
+ * Steps:
+ *   1. Build a 16-bit mask with one bit per coefficient, set when it is non-zero
+ *      (low byte from pDct[0..7], high byte from pDct[8..15]).
+ *   2. Scan the low byte downwards from bit 7 and the high byte upwards from
+ *      bit 8 for the first set bit, and look up i_ds_table[gap between them].
+ *   3. Add low_mask_table[low byte] and high_mask_table[high byte].
+ *
+ * Returns the sum of the three table values.
+ *
+ * The running sum is kept in scratch register $14 (listed as a clobber), not in
+ * the %[pDct] operand: pDct is an input-only operand, and GCC requires such
+ * operands to hold their original value when the asm statement ends.
+ *
+ * NOTE(review): the mask layout assumes packsshb/pcmpeqb/pmovmskb/punpcklbh
+ * behave like the like-named x86 MMX/SSE instructions -- confirm against the
+ * Loongson manual.
+ */

+int32_t WelsCalculateSingleCtr4x4_mmi(int16_t *pDct) {

+  int32_t iSingleCtr = 0;

+  __asm__ volatile(

+    ".set       arch=loongson3a                 \n\t"

+    "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t" /* pDct[0..7] */

+    "gslqc1     $f6, $f4, 0x10(%[pDct])         \n\t" /* pDct[8..15] */

+    "packsshb   $f0, $f0, $f2                   \n\t" /* saturate to bytes; non-zero stays non-zero */

+    "packsshb   $f2, $f4, $f6                   \n\t"

+    "xor        $f10, $f10, $f10                \n\t"

+    "xor        $f8, $f8, $f8                   \n\t" /* $f8 = 0, the comparison operand */

+    "pcmpeqb    $f0, $f0, $f8                   \n\t" /* byte lanes: all ones where the value is zero */

+    "pcmpeqb    $f2, $f2, $f8                   \n\t"

+    "pmovmskb   $f10, $f0                       \n\t" /* "is zero" bits for pDct[0..7] */

+    "pmovmskb   $f12, $f2                       \n\t" /* "is zero" bits for pDct[8..15] */

+    "punpcklbh  $f10, $f10, $f12                \n\t" /* combine into one 16-bit mask */

+    "dmfc1      $12, $f10                       \n\t"

+    "dli        $8, 0xffff                      \n\t"

+    "xor        $12, $12, $8                    \n\t" /* invert: $12 bit k set => pDct[k] != 0 */

+    "xor        $14, $14, $14                   \n\t" /* $14 = running score = 0 */

+    "dli        $8, 0x80                        \n\t" /* downward scan: test bit, starts at bit 7 */

+    "dli        $9, 0x7                         \n\t" /* downward scan: bit index */

+    "dli        $10, 0x100                      \n\t" /* upward scan: test bit, starts at bit 8 */

+    "dli        $11, 0x8                        \n\t" /* upward scan: bit index */

+    "1:                                         \n\t" /* walk down until a set bit or index 0 */

+    "and        $13, $12, $8                    \n\t"

+    "bnez       $13, 2f                         \n\t"

+    "nop                                        \n\t"

+    "daddiu     $9, -0x1                        \n\t"

+    "dsrl       $8, 1                           \n\t"

+    "bnez       $9, 1b                          \n\t"

+    "nop                                        \n\t"

+    "2:                                         \n\t" /* walk up until a set bit or index 16 */

+    "and        $13, $12, $10                   \n\t"

+    "bnez       $13, 3f                         \n\t"

+    "nop                                        \n\t"

+    "daddiu     $11, 0x1                        \n\t"

+    "dsll       $10, 1                          \n\t"

+    "daddiu     $13, $11, -0x10                 \n\t"

+    "bltz       $13, 2b                         \n\t"

+    "nop                                        \n\t"

+    "3:                                         \n\t"

+    "dsubu      $11, $11, $9                    \n\t" /* gap = upper index - lower index - 1 */

+    "daddiu     $11, -0x1                       \n\t"

+    PTR_ADDU   "$8, %[i_ds_table], $11          \n\t"

+    "lb         $10, 0x0($8)                    \n\t" /* i_ds_table[gap] */

+    PTR_ADDU   "$14, $14, $10                   \n\t"

+    "move       $11, $12                        \n\t"

+    "dli        $10, 0xff                       \n\t"

+    "and        $12, $10                        \n\t" /* $12 = low byte of the mask */

+    "dsrl       $11, 0x8                        \n\t"

+    "and        $11, $10                        \n\t" /* $11 = high byte of the mask */

+    PTR_ADDU   "$8, %[low_mask_table], $12      \n\t"

+    "lb         $10, 0x0($8)                    \n\t" /* low_mask_table[low byte] */

+    PTR_ADDU   "$14, $14, $10                   \n\t"

+    PTR_ADDU   "$8, %[high_mask_table], $11     \n\t"

+    "lb         $10, 0x0($8)                    \n\t" /* high_mask_table[high byte] */

+    PTR_ADDU   "%[iSingleCtr], $14, $10         \n\t" /* result = sum of the three lookups */

+    : [iSingleCtr] "=r"(iSingleCtr)

+    : [pDct] "r"((short *)pDct),

+      [i_ds_table] "r"((unsigned char *)i_ds_table),

+      [high_mask_table] "r"((unsigned char *)high_mask_table),

+      [low_mask_table] "r"((unsigned char *)low_mask_table)

+    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",

+      "$f4", "$f6", "$f8", "$f10", "$f12"

+  );

+  return iSingleCtr;

+}

--- a/codec/encoder/core/src/decode_mb_aux.cpp

+++ b/codec/encoder/core/src/decode_mb_aux.cpp

@@ -302,5 +302,13 @@

     pFuncList->pfIDctI16x16Dc   = WelsIDctRecI16x16Dc_AArch64_neon;

 #endif

+#if defined(HAVE_MMI)

+  if (uiCpuFlag & WELS_CPU_MMI) {

+    pFuncList->pfIDctT4         = WelsIDctT4Rec_mmi;

+    pFuncList->pfIDctFourT4     = WelsIDctFourT4Rec_mmi;

+    pFuncList->pfIDctI16x16Dc   = WelsIDctRecI16x16Dc_mmi;

+  }

+#endif//HAVE_MMI

--- a/codec/encoder/core/src/encode_mb_aux.cpp

+++ b/codec/encoder/core/src/encode_mb_aux.cpp

@@ -592,9 +592,24 @@

     pFuncList->pfCopy8x8Aligned         = WelsCopy8x8_mmi;

     pFuncList->pfCopy8x16Aligned        = WelsCopy8x16_mmi;

+    pFuncList->pfGetNoneZeroCount       = WelsGetNoneZeroCount_mmi;

+    pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_mmi;

+    pFuncList->pfQuantization4x4        = WelsQuant4x4_mmi;

+    pFuncList->pfQuantizationDc4x4      = WelsQuant4x4Dc_mmi;

+    pFuncList->pfQuantizationFour4x4    = WelsQuantFour4x4_mmi;

+    pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_mmi;

     pFuncList->pfCopy16x16Aligned       = WelsCopy16x16_mmi;

     pFuncList->pfCopy16x16NotAligned    = WelsCopy16x16NotAligned_mmi;

     pFuncList->pfCopy16x8NotAligned     = WelsCopy16x8NotAligned_mmi;

+    pFuncList->pfScan4x4                = WelsScan4x4DcAc_mmi;

+    pFuncList->pfScan4x4Ac              = WelsScan4x4Ac_mmi;

+    pFuncList->pfCalculateSingleCtr4x4  = WelsCalculateSingleCtr4x4_mmi;

+    pFuncList->pfDctT4                  = WelsDctT4_mmi;

+    pFuncList->pfDctFourT4              = WelsDctFourT4_mmi;

 #endif//HAVE_MMI

--- a/codec/encoder/targets.mk

+++ b/codec/encoder/targets.mk

@@ -79,10 +79,24 @@

 endif

 OBJS += $(ENCODER_OBJSARM64)

+ENCODER_ASM_MIPS_SRCS=\

+	$(ENCODER_SRCDIR)/core/mips/dct_mmi.c\

+	$(ENCODER_SRCDIR)/core/mips/quant_mmi.c\

+	$(ENCODER_SRCDIR)/core/mips/score_mmi.c\

+ENCODER_OBJSMIPS += $(ENCODER_ASM_MIPS_SRCS:.c=.$(OBJ))

+ifeq ($(ASM_ARCH), mips)

+ENCODER_OBJS += $(ENCODER_OBJSMIPS)

+endif

+OBJS += $(ENCODER_OBJSMIPS)

 OBJS += $(ENCODER_OBJS)

 $(ENCODER_SRCDIR)/%.$(OBJ): $(ENCODER_SRCDIR)/%.cpp

 	$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(ENCODER_CFLAGS) $(ENCODER_INCLUDES) -c $(CXX_O) $<

+$(ENCODER_SRCDIR)/%.$(OBJ): $(ENCODER_SRCDIR)/%.c

+	$(QUIET_CC)$(CC) $(CFLAGS) $(INCLUDES) $(ENCODER_CFLAGS) $(ENCODER_INCLUDES) -c $(CXX_O) $<

 $(ENCODER_SRCDIR)/%.$(OBJ): $(ENCODER_SRCDIR)/%.asm

 	$(QUIET_ASM)$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $@ $<

--- a/test/common/targets.mk

+++ b/test/common/targets.mk

@@ -2,8 +2,8 @@

 COMMON_UNITTEST_CPP_SRCS=\

 	$(COMMON_UNITTEST_SRCDIR)/CWelsListTest.cpp\

 	$(COMMON_UNITTEST_SRCDIR)/ExpandPicture.cpp\

-	$(COMMON_UNITTEST_SRCDIR)/WelsThreadPoolTest.cpp\

 	$(COMMON_UNITTEST_SRCDIR)/WelsTaskListTest.cpp\

+	$(COMMON_UNITTEST_SRCDIR)/WelsThreadPoolTest.cpp\

 COMMON_UNITTEST_OBJS += $(COMMON_UNITTEST_CPP_SRCS:.cpp=.$(OBJ))

--- a/test/encoder/EncUT_DecodeMbAux.cpp

+++ b/test/encoder/EncUT_DecodeMbAux.cpp

@@ -246,6 +246,11 @@

 #endif

 #endif

+#if defined(HAVE_MMI)

+// Checks WelsIDctT4Rec_mmi with the shared TestIDctT4Rec helper.
+// The kernel is only run when the CPU reports MMI at run time, the same check
+// the WelsIDctRecI16x16Dc_mmi test in this file performs.
+TEST (DecodeMbAuxTest, WelsIDctT4Rec_mmi) {

+  int32_t iCpuCores = 0;

+  if (WelsCPUFeatureDetect (&iCpuCores) & WELS_CPU_MMI)

+    TestIDctT4Rec<int16_t> (WelsIDctT4Rec_mmi);

+}

+#endif

 template<typename clip_t>

 void WelsIDctT8Anchor (uint8_t* p_dst, int16_t dct[4][16]) {

   WelsIDctT4Anchor<clip_t> (&p_dst[0],                   dct[0]);

@@ -367,6 +372,42 @@

                                           14); //2^14 limit, (2^15+32) will cause overflow for SSE2.

     WelsIDctRecI16x16DcAnchor (iRefDst, iRefDct);

     WelsIDctRecI16x16Dc_sse2 (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);

+    int ok = -1;

+    for (int i = 0; i < 16; i++) {

+      for (int j = 0; j < 16; j++) {

+        if (iRec[i * FDEC_STRIDE + j] != iRefDst[i * FDEC_STRIDE + j]) {

+          ok = i * 16 + j;

+          break;

+        }

+      }

+    }

+    EXPECT_EQ (ok, -1);

+  }

+}

+#endif

+#if defined(HAVE_MMI)

+// Checks WelsIDctFourT4Rec_mmi with the shared TestIDctFourT4Rec helper.
+// The kernel is only run when the CPU reports MMI at run time, the same check
+// the WelsIDctRecI16x16Dc_mmi test in this file performs.
+TEST (DecodeMbAuxTest, WelsIDctFourT4Rec_mmi) {

+  int32_t iCpuCores = 0;

+  if (WelsCPUFeatureDetect (&iCpuCores) & WELS_CPU_MMI)

+    TestIDctFourT4Rec<int16_t> (WelsIDctFourT4Rec_mmi);

+}

+TEST (DecodeMbAuxTest, WelsIDctRecI16x16Dc_mmi) {

+  int32_t iCpuCores = 0;

+  uint32_t uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);

+  if (uiCpuFeatureFlag & WELS_CPU_MMI) {

+    uint8_t iRefDst[16 * FDEC_STRIDE];

+    int16_t iRefDct[4][4];

+    ENFORCE_STACK_ALIGN_1D (int16_t, iDct, 16, 16);

+    ENFORCE_STACK_ALIGN_1D (uint8_t, iPred, 16 * FDEC_STRIDE, 16);

+    ENFORCE_STACK_ALIGN_1D (uint8_t, iRec, 16 * FDEC_STRIDE, 16);

+    for (int i = 0; i < 16; i++)

+      for (int j = 0; j < 16; j++)

+        iRefDst[i * FDEC_STRIDE + j] = iPred[i * FDEC_STRIDE + j] = rand() & 255;

+    for (int i = 0; i < 4; i++)

+      for (int j = 0; j < 4; j++)

+        iRefDct[i][j] = iDct[i * 4 + j] = (rand() & ((1 << 15) - 1)) - (1 <<

+                                          14); //2^14 limit, (2^15+32) will cause overflow for SSE2.

+    WelsIDctRecI16x16DcAnchor (iRefDst, iRefDct);

+    WelsIDctRecI16x16Dc_mmi (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);

     int ok = -1;

     for (int i = 0; i < 16; i++) {

       for (int j = 0; j < 16; j++) {

--- a/test/encoder/EncUT_EncoderMbAux.cpp

+++ b/test/encoder/EncUT_EncoderMbAux.cpp

@@ -315,6 +315,11 @@

     TestGetNoneZeroCount (WelsGetNoneZeroCount_sse42);

 #endif

+#ifdef HAVE_MMI

+// Checks WelsGetNoneZeroCount_mmi with the shared TestGetNoneZeroCount helper.
+// The kernel is only run when the CPU reports MMI at run time, the same check
+// the WelsQuant*_mmi tests in this file perform.
+TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_mmi) {

+  if (WelsCPUFeatureDetect (0) & WELS_CPU_MMI)

+    TestGetNoneZeroCount (WelsGetNoneZeroCount_mmi);

+}

+#endif

 #define WELS_ABS_LC(a) ((sign ^ (int32_t)(a)) - sign)

 #define NEW_QUANT(pDct, ff, mf) (((ff)+ WELS_ABS_LC(pDct))*(mf)) >>16

 #define WELS_NEW_QUANT(pDct,ff,mf) WELS_ABS_LC(NEW_QUANT(pDct, ff, mf))

@@ -478,6 +483,24 @@

 #endif //HAVE_AVX2

 #endif

+#ifdef HAVE_MMI

+// Runs the shared TestWelsQuant4x4 check on the MMI kernel when the CPU reports MMI.
+TEST (EncodeMbAuxTest, WelsQuant4x4_mmi) {

+  const uint32_t uiCpuFlags = WelsCPUFeatureDetect (0);

+  if ((uiCpuFlags & WELS_CPU_MMI) != 0) {

+    TestWelsQuant4x4 (WelsQuant4x4_mmi);

+  }

+}

+// Runs the shared TestWelsQuant4x4Dc check on the MMI kernel when the CPU reports MMI.
+TEST (EncodeMbAuxTest, WelsQuant4x4Dc_mmi) {

+  const uint32_t uiCpuFlags = WelsCPUFeatureDetect (0);

+  if ((uiCpuFlags & WELS_CPU_MMI) != 0) {

+    TestWelsQuant4x4Dc (WelsQuant4x4Dc_mmi);

+  }

+}

+// Runs the shared TestWelsQuantFour4x4 check on the MMI kernel when the CPU reports MMI.
+TEST (EncodeMbAuxTest, WelsQuantFour4x4_mmi) {

+  const uint32_t uiCpuFlags = WelsCPUFeatureDetect (0);

+  if ((uiCpuFlags & WELS_CPU_MMI) != 0) {

+    TestWelsQuantFour4x4 (WelsQuantFour4x4_mmi);

+  }

+}

+// Runs the shared TestWelsQuantFour4x4Max check on the MMI kernel when the CPU reports MMI.
+TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_mmi) {

+  const uint32_t uiCpuFlags = WelsCPUFeatureDetect (0);

+  if ((uiCpuFlags & WELS_CPU_MMI) != 0) {

+    TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_mmi);

+  }

+}

+#endif //HAVE_MMI

 int32_t WelsHadamardQuant2x2SkipAnchor (int16_t* rs, int16_t ff,  int16_t mf) {

   int16_t pDct[4], s[4];

   int16_t threshold = ((1 << 16) - 1) / mf - ff;

@@ -604,6 +627,23 @@

     iDct[i] = (rand() & 32767) - 16384;

   WelsHadamardT4Dc_c (iLumaDcC, iDct);

   WelsHadamardT4Dc_sse2 (iLumaDcS, iDct);

+  for (int i = 0; i < 16; i++)

+    EXPECT_EQ (iLumaDcC[i], iLumaDcS[i]);

+  FREE_MEMORY (iDct);

+  FREE_MEMORY (iLumaDcC);

+  FREE_MEMORY (iLumaDcS);

+}

+#endif

+#ifdef HAVE_MMI

+TEST (EncodeMbAuxTest, WelsHadamardT4Dc_mmi) {

+  CMemoryAlign cMemoryAlign (0);

+  ALLOC_MEMORY (int16_t, iDct, 128 * 16);

+  ALLOC_MEMORY (int16_t, iLumaDcC, 16);

+  ALLOC_MEMORY (int16_t, iLumaDcS, 16);

+  for (int i = 0; i < 128 * 16; i++)

+    iDct[i] = (rand() & 32767) - 16384;

+  WelsHadamardT4Dc_c (iLumaDcC, iDct);

+  WelsHadamardT4Dc_mmi (iLumaDcS, iDct);

   for (int i = 0; i < 16; i++)

     EXPECT_EQ (iLumaDcC[i], iLumaDcS[i]);

   FREE_MEMORY (iDct);

--- a/test/encoder/targets.mk

+++ b/test/encoder/targets.mk

@@ -17,8 +17,8 @@

 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_ParameterSetStrategy.cpp\

 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_Reconstruct.cpp\

 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_Sample.cpp\

-	$(ENCODER_UNITTEST_SRCDIR)/EncUT_SVC_me.cpp\

 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_SliceBufferReallocate.cpp\

+	$(ENCODER_UNITTEST_SRCDIR)/EncUT_SVC_me.cpp\

 ENCODER_UNITTEST_OBJS += $(ENCODER_UNITTEST_CPP_SRCS:.cpp=.$(OBJ))

--

⑨