ref: 241e9b775777a8dba2fa370f7fb73afd88aa6859
parent: c7dbf53be0a9eea50e88a7e4ad1422c10f4c8000
parent: 2795c6e7251e8cadf7d27e65f2fbde6e3a67363d
author: guangwei <GuangweiWang@users.noreply.github.com>
date: Sun Sep 30 13:17:24 EDT 2018
Merge pull request #3005 from gxw-loongson/master

Add support for loongson platform
--- a/build/arch.mk
+++ b/build/arch.mk
@@ -29,3 +29,15 @@
CFLAGS += -DHAVE_NEON_AARCH64
endif
endif
+
+# For Loongson platforms
+ifneq ($(filter mips mips64, $(ARCH)),)
+ifeq ($(USE_ASM), Yes)
+ASM_ARCH = mips
+ASMFLAGS += -I$(SRC_PATH)codec/common/mips/
+LOONGSON3A = $(shell g++ -dM -E - < /dev/null | grep '_MIPS_TUNE ' | cut -f 3 -d " ")
+ifeq ($(LOONGSON3A), "loongson3a")
+CFLAGS += -DHAVE_MMI
+endif
+endif
+endif
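+# Note: the probe above asks g++ (hard-coded, not $(CXX)) for its _MIPS_TUNE
+# macro, which gcc defines as a quoted string; e.g. on a Loongson-3A toolchain
+#   g++ -dM -E - < /dev/null | grep '_MIPS_TUNE ' | cut -f 3 -d " "
+# prints "loongson3a" (quotes included), hence the quoted ifeq comparison.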
--- a/build/mktargets.py
+++ b/build/mktargets.py
@@ -117,9 +117,16 @@
arm64files.append(file)
elif 'arm' in c:
armfiles.append(file)
+mipsfiles = []
+for file in cfiles:
+ c = file.split('/')
+ if 'mips' in c:
+ mipsfiles.append(file)
+cfiles = [x for x in cfiles if x not in mipsfiles]
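+# Any source under a mips/ path component (e.g. codec/common/mips/copy_mb_mmi.c)
+# is pulled out of cfiles so it is emitted below under a MIPS-only target
+# instead of being built unconditionally.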
+
f = open(OUTFILE, "w")
f.write("# This file is autogenerated, do not edit it directly, edit build/mktargets.py\n")
f.write("# instead. To regenerate files, run build/mktargets.sh.\n")
@@ -173,10 +180,21 @@
f.write("endif\n")
f.write("OBJS += $(%s_OBJSARM64)\n\n"%(PREFIX))
+if len(mipsfiles) > 0:
+ f.write("%s_ASM_MIPS_SRCS=\\\n"%(PREFIX))
+ for c in mipsfiles:
+ f.write("\t$(%s_SRCDIR)/%s\\\n"%(PREFIX, c))
+ f.write("\n")
+ f.write("%s_OBJSMIPS += $(%s_ASM_MIPS_SRCS:.c=.$(OBJ))\n"%(PREFIX, PREFIX))
+ f.write("ifeq ($(ASM_ARCH), mips)\n")
+ f.write("%s_OBJS += $(%s_OBJSMIPS)\n"%(PREFIX,PREFIX))
+ f.write("endif\n")
+ f.write("OBJS += $(%s_OBJSMIPS)\n\n"%(PREFIX))
+
f.write("OBJS += $(%s_OBJS)\n\n"%(PREFIX))
write_cpp_rule_pattern(f)
-if len(cfiles) > 0:
+if len(cfiles) > 0 or len(mipsfiles) > 0:
write_c_rule_pattern(f)
if len(asm) > 0:
--- /dev/null
+++ b/codec/common/inc/asmdefs_mmi.h
@@ -1,0 +1,340 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Loongson Technology Co.,Ltd.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef ASMDEFS_MMI_H_
+#define ASMDEFS_MMI_H_
+
+#define CACHE_LINE_SIZE 32
+
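+/* Pointer-width instruction selection: the PTR_* strings are pasted into the
+ * inline-asm bodies below, e.g. PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t",
+ * and expand to the 64-bit (daddu/ld/sd/...) or 32-bit (addu/lw/sw/...)
+ * forms so the same asm text serves both ABIs. */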
+#if defined(_ABI64) && _MIPS_SIM == _ABI64
+# define mips_reg int64_t
+# define PTRSIZE " 8 "
+# define PTRLOG " 3 "
+# define PTR_ADDU "daddu "
+# define PTR_ADDIU "daddiu "
+# define PTR_ADDI "daddi "
+# define PTR_SUBU "dsubu "
+# define PTR_L "ld "
+# define PTR_S "sd "
+# define PTR_SRA "dsra "
+# define PTR_SRL "dsrl "
+# define PTR_SLL "dsll "
+#else
+# define mips_reg int32_t
+# define PTRSIZE " 4 "
+# define PTRLOG " 2 "
+# define PTR_ADDU "addu "
+# define PTR_ADDIU "addiu "
+# define PTR_ADDI "addi "
+# define PTR_SUBU "subu "
+# define PTR_L "lw "
+# define PTR_S "sw "
+# define PTR_SRA "sra "
+# define PTR_SRL "srl "
+# define PTR_SLL "sll "
+#endif
+
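+/* The MMI_XSawp_* helpers treat a pair of 64-bit FP registers as one 128-bit
+ * vector: (f0,f2) receives the interleave of the low halves of the two input
+ * vectors and (f8,f10) the interleave of the high halves, at byte (BH),
+ * halfword (HW) or word (WD) granularity; the DQ variant simply swaps the
+ * 64-bit halves. ("XSawp" is the spelling used throughout this port.) */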
+#define MMI_XSawp_BH(f0, f2, f4, f6, f8, f10) \
+ "mov.d "#f8", "#f2" \n\t" \
+ "punpckhbh "#f2", "#f0", "#f4" \n\t" \
+ "punpcklbh "#f0", "#f0", "#f4" \n\t" \
+ "punpckhbh "#f10", "#f8", "#f6" \n\t" \
+ "punpcklbh "#f8", "#f8", "#f6" \n\t"
+
+#define MMI_XSawp_HW(f0, f2, f4, f6, f8, f10) \
+ "mov.d "#f8", "#f2" \n\t" \
+ "punpckhhw "#f2", "#f0", "#f4" \n\t" \
+ "punpcklhw "#f0", "#f0", "#f4" \n\t" \
+ "punpckhhw "#f10", "#f8", "#f6" \n\t" \
+ "punpcklhw "#f8", "#f8", "#f6" \n\t"
+
+#define MMI_XSawp_WD(f0, f2, f4, f6, f8, f10) \
+ "mov.d "#f8", "#f2" \n\t" \
+ "punpckhwd "#f2", "#f0", "#f4" \n\t" \
+ "punpcklwd "#f0", "#f0", "#f4" \n\t" \
+ "punpckhwd "#f10", "#f8", "#f6" \n\t" \
+ "punpcklwd "#f8", "#f8", "#f6" \n\t"
+
+#define MMI_XSawp_DQ(f0, f2, f4, f6, f8, f10) \
+ "mov.d "#f8", "#f2" \n\t" \
+ "mov.d "#f2", "#f4" \n\t" \
+ "mov.d "#f10", "#f6" \n\t"
+
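+/* Halfword absolute value: (f0,f2) = max((f4,f6), -(f4,f6)); (f8,f10) are
+ * scratch and end up holding the negated inputs. */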
+#define WELS_AbsH(f0, f2, f4, f6, f8, f10) \
+ "xor "#f8", "#f8", "#f8" \n\t" \
+ "psubh "#f10", "#f8", "#f6" \n\t" \
+ "psubh "#f8", "#f8", "#f4" \n\t" \
+ "pmaxsh "#f0", "#f4", "#f8" \n\t" \
+ "pmaxsh "#f2", "#f6", "#f10" \n\t"
+
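+/* 128-bit butterfly: (f4,f6) += (f0,f2) and (f0,f2) -= the original (f4,f6);
+ * (f8,f10) are scratch. */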
+#define MMI_SumSub(f0, f2, f4, f6, f8, f10) \
+ "mov.d "#f8", "#f4" \n\t" \
+ "mov.d "#f10", "#f6" \n\t" \
+ "paddh "#f4", "#f4", "#f0" \n\t" \
+ "paddh "#f6", "#f6", "#f2" \n\t" \
+ "psubh "#f0", "#f0", "#f8" \n\t" \
+ "psubh "#f2", "#f2", "#f10" \n\t"
+
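+/* Load 8 pixels each from r0 and r1 (unaligned gsldlc1/gsldrc1 pairs), widen
+ * the bytes to halfwords using f8 (must be zero), and leave the 16-bit
+ * differences in (f0,f2); (f4,f6) are clobbered. */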
+#define MMI_LoadDiff8P(f0, f2, f4, f6, f8, r0, r1) \
+ "gsldlc1 "#f0", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r1") \n\t" \
+ "gsldrc1 "#f0", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r1") \n\t" \
+ "punpckhbh "#f2", "#f0", "#f8" \n\t" \
+ "punpcklbh "#f0", "#f0", "#f8" \n\t" \
+ "punpckhbh "#f6", "#f4", "#f8" \n\t" \
+ "punpcklbh "#f4", "#f4", "#f8" \n\t" \
+ "psubh "#f0", "#f0", "#f4" \n\t" \
+ "psubh "#f2", "#f2", "#f6" \n\t"
+
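+/* Transpose two 4x4 halfword matrices held in f0..f14; (f16,f18) are scratch. */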
+#define MMI_TransTwo4x4H(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
+ MMI_XSawp_HW(f0, f2, f4, f6, f16, f18) \
+ MMI_XSawp_HW(f8, f10, f12, f14, f4, f6) \
+ MMI_XSawp_WD(f0, f2, f8, f10, f12, f14) \
+ MMI_XSawp_WD(f16, f18, f4, f6, f8, f10) \
+ MMI_XSawp_DQ(f0, f2, f16, f18, f4, f6) \
+ MMI_XSawp_DQ(f12, f14, f8, f10, f16, f18)
+
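+/* Byte-transpose of two 8x8 blocks spanning all sixteen FP registers; with
+ * no registers left for scratch, one pair is spilled to the GPRs r0/r1 via
+ * dmfc1/dmtc1 between interleave stages. */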
+#define MMI_TransTwo8x8B(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22, f24, f26, f28, f30, r0, r1) \
+ "dmfc1 "#r0", "#f28" \n\t" \
+ "dmfc1 "#r1", "#f30" \n\t" \
+ MMI_XSawp_BH(f0, f2, f4, f6, f28, f30) \
+ MMI_XSawp_BH(f8, f10, f12, f14, f4, f6) \
+ MMI_XSawp_BH(f16, f18, f20, f22, f12, f14) \
+ "dmtc1 "#r0", "#f20" \n\t" \
+ "dmtc1 "#r1", "#f22" \n\t" \
+ "dmfc1 "#r0", "#f12" \n\t" \
+ "dmfc1 "#r1", "#f14" \n\t" \
+ MMI_XSawp_BH(f24, f26, f20, f22, f12, f14) \
+ MMI_XSawp_HW(f0, f2, f8, f10, f20, f22) \
+ MMI_XSawp_HW(f28, f30, f4, f6, f8, f10) \
+ MMI_XSawp_HW(f16, f18, f24, f26, f4, f6) \
+ "dmtc1 "#r0", "#f24" \n\t" \
+ "dmtc1 "#r1", "#f26" \n\t" \
+ "dmfc1 "#r0", "#f8" \n\t" \
+ "dmfc1 "#r1", "#f10" \n\t" \
+ MMI_XSawp_HW(f24, f26, f12, f14, f8, f10) \
+ MMI_XSawp_WD(f0, f2, f16, f18, f12, f14) \
+ MMI_XSawp_WD(f20, f22, f4, f6, f16, f18) \
+ MMI_XSawp_WD(f28, f30, f24, f26, f4, f6) \
+ "dmtc1 "#r0", "#f24" \n\t" \
+ "dmtc1 "#r1", "#f26" \n\t" \
+ "dmfc1 "#r0", "#f16" \n\t" \
+ "dmfc1 "#r1", "#f18" \n\t" \
+ MMI_XSawp_WD(f24, f26, f8, f10, f16, f18) \
+ MMI_XSawp_DQ(f0, f2, f28, f30, f8, f10) \
+ MMI_XSawp_DQ(f12, f14, f4, f6, f28, f30) \
+ MMI_XSawp_DQ(f20, f22, f24, f26, f4, f6) \
+ "dmtc1 "#r0", "#f24" \n\t" \
+ "dmtc1 "#r1", "#f26" \n\t" \
+ "dmfc1 "#r0", "#f0" \n\t" \
+ "dmfc1 "#r1", "#f2" \n\t" \
+ MMI_XSawp_DQ(f24, f26, f16, f18, f0, f2) \
+ "dmtc1 "#r0", "#f16" \n\t" \
+ "dmtc1 "#r1", "#f18" \n\t"
+
+#define MMI_XSwap_HW_SINGLE(f0, f2, f4) \
+ "punpckhhw "#f4", "#f0", "#f2" \n\t" \
+ "punpcklhw "#f0", "#f0", "#f2" \n\t"
+
+#define MMI_XSwap_WD_SINGLE(f0, f2, f4) \
+ "punpckhwd "#f4", "#f0", "#f2" \n\t" \
+ "punpcklwd "#f0", "#f0", "#f2" \n\t"
+
+#define MMI_Trans4x4H_SINGLE(f0, f2, f4, f6, f8) \
+ MMI_XSwap_HW_SINGLE(f0, f2, f8) \
+ MMI_XSwap_HW_SINGLE(f4, f6, f2) \
+ MMI_XSwap_WD_SINGLE(f0, f4, f6) \
+ MMI_XSwap_WD_SINGLE(f8, f2, f4)
+
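+/* Single-register butterfly: f0 <- f0 + f2, f2 <- f2 - original f0; f4 is scratch. */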
+#define MMI_SumSub_SINGLE(f0, f2, f4) \
+ "mov.d "#f4", "#f2" \n\t" \
+ "psubh "#f2", "#f2", "#f0" \n\t" \
+ "paddh "#f0", "#f0", "#f4" \n\t"
+
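+/* With f6 == 1: f0 <- 2*f0 + f2 and f4 <- original f0 - 2*f2 (the forward
+ * 4-point transform's odd-part butterfly); f2 is clobbered. */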
+#define MMI_SumSubMul2_SINGLE(f0, f2, f4, f6) \
+ "mov.d "#f4", "#f0" \n\t" \
+ "psllh "#f0", "#f0", "#f6" \n\t" \
+ "paddh "#f0", "#f0", "#f2" \n\t" \
+ "psllh "#f2", "#f2", "#f6" \n\t" \
+ "psubh "#f4", "#f4", "#f2" \n\t"
+
+// f4 must be 0x0 so pshufh broadcasts r0's low halfword to all eight lanes
+#define MMI_Copy8Times(f0, f2, f4, r0) \
+ "dmtc1 "#r0", "#f0" \n\t" \
+ "pshufh "#f0", "#f0", "#f4" \n\t" \
+ "mov.d "#f2", "#f0" \n\t"
+
+// f4 must be 0x0; punpcklbh doubles the byte in r0, then pshufh broadcasts it to all sixteen lanes
+#define MMI_Copy16Times(f0, f2, f4, r0) \
+ "dmtc1 "#r0", "#f0" \n\t" \
+ "punpcklbh "#f0", "#f0", "#f0" \n\t" \
+ "pshufh "#f0", "#f0", "#f4" \n\t" \
+ "mov.d "#f2", "#f0" \n\t"
+
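+/* With f6 == 1: f4 <- f0 + f2/2 and f0 <- f0/2 - f2 (the inverse counterpart
+ * of MMI_SumSubMul2_SINGLE). */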
+#define MMI_SumSubDiv2_SINGLE(f0, f2, f4, f6) \
+ "psrah "#f4", "#f2", "#f6" \n\t" \
+ "paddh "#f4", "#f4", "#f0" \n\t" \
+ "psrah "#f0", "#f0", "#f6" \n\t" \
+ "psubh "#f0", "#f0", "#f2" \n\t"
+
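+/* One 4-point pass of the H.264 inverse transform, built from the butterfly
+ * helpers above; f12 holds the shift amount (1) for the >>1 terms. */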
+#define MMI_IDCT_SINGLE(f0, f2, f4, f6, f8, f10, f12) \
+ MMI_SumSub_SINGLE(f6, f8, f10) \
+ MMI_SumSubDiv2_SINGLE(f4, f2, f0, f12) \
+ MMI_SumSub_SINGLE(f0, f6, f10) \
+ MMI_SumSub_SINGLE(f4, f8, f10)
+
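+/* Reconstruct 4 pixels: round f0 by f4, arithmetic-shift right by f8, add the
+ * four prediction bytes loaded from r1 (widened with f6, which must be zero),
+ * then saturate-pack and store 4 bytes at r0. */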
+#define MMI_StoreDiff4P_SINGLE(f0, f2, f4, f6, r0, r1, f8) \
+ "gsldlc1 "#f2", 0x7("#r1") \n\t" \
+ "gsldrc1 "#f2", 0x0("#r1") \n\t" \
+ "punpcklbh "#f2", "#f2", "#f6" \n\t" \
+ "paddh "#f0", "#f0", "#f4" \n\t" \
+ "psrah "#f0", "#f0", "#f8" \n\t" \
+ "paddsh "#f0", "#f0", "#f2" \n\t" \
+ "packushb "#f0", "#f0", "#f2" \n\t" \
+ "gsswlc1 "#f0", 0x3("#r0") \n\t" \
+ "gsswrc1 "#f0", 0x0("#r0") \n\t"
+
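+/* Horizontal reduction: sums the eight halfwords in (f0,f2) into the low
+ * 32 bits of f0; f8 must be zero. */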
+#define SUMH_HORIZON(f0, f2, f4, f6, f8) \
+ "paddh "#f0", "#f0", "#f2" \n\t" \
+ "punpckhhw "#f2", "#f0", "#f8" \n\t" \
+ "punpcklhw "#f0", "#f0", "#f8" \n\t" \
+ "paddw "#f0", "#f0", "#f2" \n\t" \
+ "punpckhwd "#f2", "#f0", "#f0" \n\t" \
+ "paddw "#f0", "#f0", "#f2" \n\t"
+
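+/* Gather eight rows of pixels starting at r0 (row stride r1) and interleave
+ * them so the leading byte columns become contiguous in (f0,f2); r2 is a
+ * scratch pointer and r0 is left pointing past the rows read. */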
+#define LOAD_COLUMN(f0, f2, f4, f6, f8, f10, f12, f14, r0, r1, r2) \
+ "daddu "#r2", "#r0", "#r1" \n\t" \
+ "gsldlc1 "#f0", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r2") \n\t" \
+ "gsldrc1 "#f0", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r2") \n\t" \
+ "punpcklbh "#f0", "#f0", "#f4" \n\t" \
+ "daddu "#r0", "#r2", "#r1" \n\t" \
+ "daddu "#r2", "#r0", "#r1" \n\t" \
+ "gsldlc1 "#f8", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r2") \n\t" \
+ "gsldrc1 "#f8", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r2") \n\t" \
+ "punpcklbh "#f8", "#f8", "#f4" \n\t" \
+ "punpckhhw "#f2", "#f0", "#f8" \n\t" \
+ "punpcklhw "#f0", "#f0", "#f8" \n\t" \
+ "daddu "#r0", "#r2", "#r1" \n\t" \
+ "daddu "#r2", "#r0", "#r1" \n\t" \
+ "gsldlc1 "#f12", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r2") \n\t" \
+ "gsldrc1 "#f12", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r2") \n\t" \
+ "punpcklbh "#f12", "#f12", "#f4" \n\t" \
+ "daddu "#r0", "#r2", "#r1" \n\t" \
+ "daddu "#r2", "#r0", "#r1" \n\t" \
+ "gsldlc1 "#f8", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r2") \n\t" \
+ "gsldrc1 "#f8", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r2") \n\t" \
+ "punpcklbh "#f8", "#f8", "#f4" \n\t" \
+ "punpckhhw "#f14", "#f12", "#f8" \n\t" \
+ "punpcklhw "#f12", "#f12", "#f8" \n\t" \
+ "daddu "#r0", "#r2", "#r1" \n\t" \
+ "punpcklwd "#f0", "#f2", "#f14" \n\t" \
+ "punpckhwd "#f2", "#f2", "#f14" \n\t"
+
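+/* Chroma variant of LOAD_COLUMN: gathers four rows into f0. */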
+#define LOAD_COLUMN_C(f0, f2, f4, f6, r0, r1, r2) \
+ "daddu "#r2", "#r0", "#r1" \n\t" \
+ "gsldlc1 "#f0", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f2", 0x7("#r2") \n\t" \
+ "gsldrc1 "#f0", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f2", 0x0("#r2") \n\t" \
+ "punpcklbh "#f0", "#f0", "#f2" \n\t" \
+ "daddu "#r0", "#r2", "#r1" \n\t" \
+ "daddu "#r2", "#r0", "#r1" \n\t" \
+ "gsldlc1 "#f4", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f2", 0x7("#r2") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f2", 0x0("#r2") \n\t" \
+ "punpcklbh "#f4", "#f4", "#f2" \n\t" \
+ "punpckhhw "#f0", "#f0", "#f4" \n\t" \
+ "daddu "#r0", "#r2", "#r1" \n\t"
+
+/**
+ * Spill the callee-saved FP registers that the MMI routines clobber into an
+ * on-stack buffer (__back_temp, also used by RECOVER_REG below).
+ */
+#define BACKUP_REG \
+ double __back_temp[8]; \
+ if (_MIPS_SIM == _ABI64) \
+ __asm__ volatile ( \
+ "gssqc1 $f25, $f24, 0x00(%[temp]) \n\t" \
+ "gssqc1 $f27, $f26, 0x10(%[temp]) \n\t" \
+ "gssqc1 $f29, $f28, 0x20(%[temp]) \n\t" \
+ "gssqc1 $f31, $f30, 0x30(%[temp]) \n\t" \
+ : \
+ : [temp]"r"(__back_temp) \
+ : "memory" \
+ ); \
+ else \
+ __asm__ volatile ( \
+ "gssqc1 $f22, $f20, 0x00(%[temp]) \n\t" \
+ "gssqc1 $f26, $f24, 0x10(%[temp]) \n\t" \
+ "gssqc1 $f30, $f28, 0x20(%[temp]) \n\t" \
+ : \
+ : [temp]"r"(__back_temp) \
+ : "memory" \
+ );
+
+/**
+ * Restore the FP registers saved by BACKUP_REG from __back_temp.
+ */
+#define RECOVER_REG \
+ if (_MIPS_SIM == _ABI64) \
+ __asm__ volatile ( \
+ "gslqc1 $f25, $f24, 0x00(%[temp]) \n\t" \
+ "gslqc1 $f27, $f26, 0x10(%[temp]) \n\t" \
+ "gslqc1 $f29, $f28, 0x20(%[temp]) \n\t" \
+ "gslqc1 $f31, $f30, 0x30(%[temp]) \n\t" \
+ : \
+ : [temp]"r"(__back_temp) \
+ : "memory" \
+ ); \
+ else \
+ __asm__ volatile ( \
+ "gslqc1 $f22, $f20, 0x00(%[temp]) \n\t" \
+ "gslqc1 $f26, $f24, 0x10(%[temp]) \n\t" \
+ "gslqc1 $f30, $f28, 0x20(%[temp]) \n\t" \
+ : \
+ : [temp]"r"(__back_temp) \
+ : "memory" \
+ );
+
+# define OK 1
+# define NOTOK 0
+
+#endif /* ASMDEFS_MMI_H_ */
--- a/codec/common/inc/copy_mb.h
+++ b/codec/common/inc/copy_mb.h
@@ -75,6 +75,13 @@
void WelsCopy8x16_AArch64_neon (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
#endif
+#if defined (HAVE_MMI)
+void WelsCopy8x8_mmi (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy8x16_mmi (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy16x8NotAligned_mmi (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy16x16_mmi (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy16x16NotAligned_mmi (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+#endif//HAVE_MMI
#if defined(__cplusplus)
}
#endif//__cplusplus
--- a/codec/common/inc/cpu_core.h
+++ b/codec/common/inc/cpu_core.h
@@ -84,6 +84,9 @@
#define WELS_CPU_VFPv3 0x000002 /* VFPv3 */
#define WELS_CPU_NEON 0x000004 /* NEON */
+/* For Loongson */
+#define WELS_CPU_MMI 0x00000001 /* mmi */
+
/*
* Interfaces for CPU core feature detection as below
*/
--- a/codec/common/inc/deblocking_common.h
+++ b/codec/common/inc/deblocking_common.h
@@ -75,6 +75,22 @@
void DeblockChromaEq4H_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void WelsNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
#endif
+
+#if defined(HAVE_MMI)
+void DeblockLumaLt4V_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4V_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockLumaTransposeH2V_mmi (uint8_t* pPixY, int32_t iStride, uint8_t* pDst);
+void DeblockLumaTransposeV2H_mmi (uint8_t* pPixY, int32_t iStride, uint8_t* pSrc);
+void DeblockLumaLt4H_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4H_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaEq4V_mmi (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4V_mmi (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+ int8_t* pTC);
+void DeblockChromaEq4H_mmi (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4H_mmi (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+ int8_t* pTC);
+void WelsNonZeroCount_mmi (int8_t* pNonZeroCount);
+#endif//HAVE_MMI
#if defined(__cplusplus)
}
#endif//__cplusplus
--- a/codec/common/inc/expand_pic.h
+++ b/codec/common/inc/expand_pic.h
@@ -73,6 +73,15 @@
const int32_t kiPicH);
#endif
+#if defined(HAVE_MMI)
+void ExpandPictureLuma_mmi (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW,
+ const int32_t kiPicH);
+void ExpandPictureChromaAlign_mmi (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW,
+ const int32_t kiPicH);
+void ExpandPictureChromaUnalign_mmi (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW,
+ const int32_t kiPicH);
+#endif//HAVE_MMI
+
typedef void (*PExpandPictureFunc) (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH);
typedef struct TagExpandPicFunc {
--- a/codec/common/inc/intra_pred_common.h
+++ b/codec/common/inc/intra_pred_common.h
@@ -67,6 +67,11 @@
void WelsI16x16LumaPredV_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
void WelsI16x16LumaPredH_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
#endif//HAVE_NEON_AARCH64
+
+#if defined(HAVE_MMI)
+void WelsI16x16LumaPredV_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredH_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+#endif//HAVE_MMI
#if defined(__cplusplus)
}
#endif//__cplusplus
--- a/codec/common/inc/sad_common.h
+++ b/codec/common/inc/sad_common.h
@@ -104,6 +104,19 @@
void WelsSampleSadFour8x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
void WelsSampleSadFour4x4_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
#endif
+
+#if defined (HAVE_MMI)
+int32_t WelsSampleSad4x4_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad16x16_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad16x8_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad8x16_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad8x8_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+
+void WelsSampleSadFour16x16_mmi (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour16x8_mmi (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour8x16_mmi (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour8x8_mmi (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+#endif//HAVE_MMI
#if defined(__cplusplus)
}
#endif//__cplusplus
--- /dev/null
+++ b/codec/common/mips/copy_mb_mmi.c
@@ -1,0 +1,477 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2018, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file copy_mb_mmi.c
+ *
+ * \brief Loongson optimization
+ *
+ * \date 20/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
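+/* Copy an 8x8 byte block two rows at a time; the gsldlc1/gsldrc1 loads and
+ * gssdlc1/gssdrc1 stores tolerate arbitrary alignment. */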
+void WelsCopy8x8_mmi(uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc,
+ int32_t iStrideS) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f4, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f6, 0x7($8) \n\t"
+ "gsldrc1 $f4, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f6, 0x0($8) \n\t"
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f8, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f10, 0x7($8) \n\t"
+ "gsldrc1 $f8, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f10, 0x0($8) \n\t"
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f12, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f14, 0x7($8) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f14, 0x0($8) \n\t"
+
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f2, 0x7($8) \n\t"
+ "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f2, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f4, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f6, 0x7($8) \n\t"
+ "gssdrc1 $f4, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f6, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f8, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f10, 0x7($8) \n\t"
+ "gssdrc1 $f8, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f10, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f12, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f14, 0x7($8) \n\t"
+ "gssdrc1 $f12, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f14, 0x0($8) \n\t"
+ : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc)
+ : [iStrideD]"r"(iStrideD), [iStrideS]"r"(iStrideS)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
+ );
+}
+
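+/* Copy an 8x16 byte block: the 8x8 pattern above applied twice. */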
+void WelsCopy8x16_mmi(uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc,
+ int32_t iStrideS) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f4, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f6, 0x7($8) \n\t"
+ "gsldrc1 $f4, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f6, 0x0($8) \n\t"
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f8, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f10, 0x7($8) \n\t"
+ "gsldrc1 $f8, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f10, 0x0($8) \n\t"
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f12, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f14, 0x7($8) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f14, 0x0($8) \n\t"
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f2, 0x7($8) \n\t"
+ "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f2, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f4, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f6, 0x7($8) \n\t"
+ "gssdrc1 $f4, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f6, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f8, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f10, 0x7($8) \n\t"
+ "gssdrc1 $f8, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f10, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f12, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f14, 0x7($8) \n\t"
+ "gssdrc1 $f12, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f14, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f4, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f6, 0x7($8) \n\t"
+ "gsldrc1 $f4, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f6, 0x0($8) \n\t"
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f8, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f10, 0x7($8) \n\t"
+ "gsldrc1 $f8, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f10, 0x0($8) \n\t"
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f12, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f14, 0x7($8) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f14, 0x0($8) \n\t"
+
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f2, 0x7($8) \n\t"
+ "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f2, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f4, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f6, 0x7($8) \n\t"
+ "gssdrc1 $f4, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f6, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f8, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f10, 0x7($8) \n\t"
+ "gssdrc1 $f8, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f10, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f12, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f14, 0x7($8) \n\t"
+ "gssdrc1 $f12, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f14, 0x0($8) \n\t"
+ : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc)
+ : [iStrideD]"r"(iStrideD), [iStrideS]"r"(iStrideS)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
+ );
+}
+
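+/* Copy a 16x16 block using 128-bit gslqc1/gssqc1 accesses, which require
+ * 16-byte-aligned rows; BACKUP_REG/RECOVER_REG preserve the callee-saved FP
+ * registers used by the 16-register load burst. */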
+void WelsCopy16x16_mmi(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrc,
+ int32_t iSrcStride) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "gslqc1 $f0, $f2, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f4, $f6, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f8, $f10, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f12, $f14, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f16, $f18, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f20, $f22, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f24, $f26, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f28, $f30, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+
+ "gssqc1 $f0, $f2, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f4, $f6, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f8, $f10, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f12, $f14, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f16, $f18, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f20, $f22, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f24, $f26, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f28, $f30, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+
+ "gslqc1 $f0, $f2, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f4, $f6, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f8, $f10, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f12, $f14, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f16, $f18, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f20, $f22, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f24, $f26, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f28, $f30, 0x0(%[pSrc]) \n\t"
+
+ "gssqc1 $f0, $f2, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f4, $f6, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f8, $f10, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f12, $f14, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f16, $f18, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f20, $f22, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f24, $f26, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f28, $f30, 0x0(%[pDst]) \n\t"
+ : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc)
+ : [iDstStride]"r"((int)iDstStride), [iSrcStride]"r"((int)iSrcStride)
+ : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
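+/* As WelsCopy16x16_mmi, but each source row is read as two unaligned 64-bit
+ * halves, so only pDst must be 16-byte aligned for the gssqc1 stores. */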
+void WelsCopy16x16NotAligned_mmi(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrc,
+ int32_t iSrcStride) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "gsldlc1 $f2, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f0, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f2, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f0, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f6, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f4, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f6, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f4, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f10, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f8, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f10, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f8, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f14, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f12, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f12, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f18, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f16, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f18, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f16, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f22, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f20, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f22, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f20, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f26, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f24, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f26, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f24, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f30, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f28, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f30, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f28, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+
+ PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f0, $f2, 0x0(%[pDst]) \n\t"
+ "gssqc1 $f4, $f6, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f8, $f10, 0x0(%[pDst]) \n\t"
+ "gssqc1 $f12, $f14, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f16, $f18, 0x0(%[pDst]) \n\t"
+ "gssqc1 $f20, $f22, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f24, $f26, 0x0(%[pDst]) \n\t"
+ "gssqc1 $f28, $f30, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t"
+
+ "gsldlc1 $f2, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f0, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f2, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f0, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f6, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f4, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f6, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f4, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f10, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f8, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f10, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f8, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f14, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f12, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f12, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f18, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f16, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f18, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f16, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f22, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f20, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f22, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f20, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f26, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f24, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f26, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f24, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f30, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f28, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f30, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f28, 0x8(%[pSrc]) \n\t"
+
+ PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f0, $f2, 0x0(%[pDst]) \n\t"
+ "gssqc1 $f4, $f6, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f8, $f10, 0x0(%[pDst]) \n\t"
+ "gssqc1 $f12, $f14, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f16, $f18, 0x0(%[pDst]) \n\t"
+ "gssqc1 $f20, $f22, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f24, $f26, 0x0(%[pDst]) \n\t"
+ "gssqc1 $f28, $f30, 0x0($8) \n\t"
+ : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc)
+ : [iDstStride]"r"((int)iDstStride), [iSrcStride]"r"((int)iSrcStride)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
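+/* 16x8 variant of the unaligned copy above. */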
+void WelsCopy16x8NotAligned_mmi(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrc,
+ int32_t iSrcStride) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "gsldlc1 $f2, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f0, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f2, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f0, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f6, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f4, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f6, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f4, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f10, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f8, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f10, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f8, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f14, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f12, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f12, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f18, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f16, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f18, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f16, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f22, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f20, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f22, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f20, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f26, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f24, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f26, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f24, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f30, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f28, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f30, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f28, 0x8(%[pSrc]) \n\t"
+
+ "gssqc1 $f0, $f2, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f4, $f6, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f8, $f10, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f12, $f14, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f16, $f18, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f20, $f22, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f24, $f26, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f28, $f30, 0x0(%[pDst]) \n\t"
+ : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc)
+ : [iDstStride]"r"((int)iDstStride), [iSrcStride]"r"((int)iSrcStride)
+ : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
--- /dev/null
+++ b/codec/common/mips/deblock_mmi.c
@@ -1,0 +1,2826 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2018, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file deblock_mmi.c
+ *
+ * \brief Loongson optimization
+ *
+ * \date 20/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
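+/* Luma deblocking for the bS < 4 case: filters across the horizontal edge at
+ * pPix, reading three rows on each side; pTC carries the four per-4x4 tc
+ * thresholds and tmp is scratch for the widened 16-bit planes. */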
+void DeblockLumaLt4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
+ int32_t iBeta, int8_t *pTC) {
+ unsigned char tmp[512] __attribute__((aligned(32)));
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "dsll $8, %[iStride], 0x1 \n\t"
+ "daddu $8, $8, %[iStride] \n\t"
+ "dsubu $14, %[pPix], $8 \n\t"
+
+ "dsll $8, %[iStride], 0x1 \n\t"
+ "dsubu $9, %[pPix], $8 \n\t"
+
+ "dmtc1 %[iAlpha], $f0 \n\t"
+ "dsubu $13, %[pPix], %[iStride] \n\t"
+ "daddu %[iStride], %[iStride], %[pPix] \n\t"
+ "daddu $12, $8, %[pPix] \n\t"
+
+ "punpcklhw $f0, $f0, $f0 \n\t"
+ "lb $8, 0x0(%[pTC]) \n\t"
+ "punpcklwd $f0, $f0, $f0 \n\t"
+ "mov.d $f2, $f0 \n\t"
+ "gssqc1 $f2, $f0, 432-112(%[tmp]) \n\t"
+ "dmtc1 %[iBeta], $f0 \n\t"
+ "lb %[iAlpha], 0x1(%[pTC]) \n\t"
+ "dli %[iBeta], 0xFFFF \n\t"
+ "punpcklhw $f0, $f0, $f0 \n\t"
+ "and $10, %[iAlpha], %[iBeta] \n\t"
+ "punpcklwd $f0, $f0, $f0 \n\t"
+ "mov.d $f2, $f0 \n\t"
+ "and %[iAlpha], %[iAlpha], %[iBeta] \n\t"
+ "dmtc1 $10, $f4 \n\t"
+ "mov.d $f8, $f4 \n\t"
+ "dmtc1 %[iAlpha], $f16 \n\t"
+ "and %[iAlpha], $8, %[iBeta] \n\t"
+ "dmtc1 %[iAlpha], $f20 \n\t"
+ "mov.d $f24, $f20 \n\t"
+ "mov.d $f28, $f20 \n\t"
+ "gssqc1 $f2, $f0, 432-336(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f0 \n\t"
+
+ "lb %[iAlpha], 0x3(%[pTC]) \n\t"
+ "lb %[pTC], 0x2(%[pTC]) \n\t"
+ "dmtc1 $10, $f12 \n\t"
+ "punpcklhw $f0, $f0, $f16 \n\t"
+ "and $8, %[iAlpha], %[iBeta] \n\t"
+ "punpcklhw $f24, $f24, $f8 \n\t"
+ "punpcklhw $f20, $f20, $f4 \n\t"
+ "punpcklhw $f0, $f0, $f24 \n\t"
+ "punpcklhw $f28, $f28, $f12 \n\t"
+ "punpcklhw $f28, $f28, $f20 \n\t"
+ "punpckhhw $f2, $f0, $f28 \n\t"
+ "punpcklhw $f0, $f0, $f28 \n\t"
+ "gssqc1 $f2, $f0, 432-400(%[tmp]) \n\t"
+ "dmtc1 $8, $f0 \n\t"
+ "and %[iAlpha], %[iAlpha], %[iBeta] \n\t"
+ "mov.d $f8, $f0 \n\t"
+ "dmtc1 %[iAlpha], $f16 \n\t"
+ "and %[iAlpha], %[pTC], %[iBeta] \n\t"
+ "dmtc1 $8, $f12 \n\t"
+ "dmtc1 %[iAlpha], $f20 \n\t"
+ "punpcklhw $f20, $f20, $f0 \n\t"
+
+ "xor $f0, $f0, $f0 \n\t"
+ "dmtc1 %[iAlpha], $f24 \n\t"
+ "and %[pTC], %[pTC], %[iBeta] \n\t"
+ "punpcklhw $f24, $f24, $f8 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "dmtc1 %[pTC], $f4 \n\t"
+
+ "gslqc1 $f10, $f8, 0x0($9) \n\t"
+ "punpckhbh $f10, $f8, $f0 \n\t"
+ "punpcklbh $f8, $f8, $f0 \n\t"
+
+ "dli %[iAlpha], 0x4 \n\t"
+ "seh %[pTC], %[iAlpha] \n\t"
+ "punpcklhw $f28, $f28, $f12 \n\t"
+ "punpcklhw $f28, $f28, $f20 \n\t"
+ "gslqc1 $f22, $f20, 0x0(%[iStride]) \n\t"
+ "gslqc1 $f14, $f12, 0x0($13) \n\t"
+ "gsldxc1 $f2, 0x0($12, $0) \n\t"
+ "punpckhbh $f22, $f20, $f0 \n\t"
+ "punpcklbh $f20, $f20, $f0 \n\t"
+ "gssqc1 $f22, $f20, 432-240(%[tmp]) \n\t"
+ "punpckhbh $f22, $f2, $f0 \n\t"
+ "punpcklbh $f20, $f2, $f0 \n\t"
+ "gssqc1 $f22, $f20, 432-352(%[tmp]) \n\t"
+ "punpcklhw $f4, $f4, $f16 \n\t"
+ "gslqc1 $f18, $f16, 0x0($14) \n\t"
+ "punpcklhw $f4, $f4, $f24 \n\t"
+ "gslqc1 $f26, $f24, 0x0(%[pPix]) \n\t"
+ "punpckhhw $f6, $f4, $f28 \n\t"
+ "punpcklhw $f4, $f4, $f28 \n\t"
+ "punpckhbh $f26, $f24, $f0 \n\t"
+ "punpcklbh $f24, $f24, $f0 \n\t"
+ "punpckhbh $f14, $f12, $f0 \n\t"
+ "punpcklbh $f12, $f12, $f0 \n\t"
+ "punpckhbh $f18, $f16, $f0 \n\t"
+ "punpcklbh $f16, $f16, $f0 \n\t"
+ "psubh $f28, $f12, $f16 \n\t"
+ "psubh $f30, $f14, $f18 \n\t"
+ "gssqc1 $f18, $f16, 432-272(%[tmp]) \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f16, $f18)
+ "gslqc1 $f18, $f16, 432-336(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t"
+ "pcmpgth $f20, $f16, $f28 \n\t"
+ "pcmpgth $f22, $f18, $f30 \n\t"
+ "gssqc1 $f22, $f20, 432-288(%[tmp]) \n\t"
+ "psubh $f28, $f24, $f0 \n\t"
+ "psubh $f30, $f26, $f2 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
+ "pcmpgth $f20, $f16, $f28 \n\t"
+ "pcmpgth $f22, $f18, $f30 \n\t"
+ "gssqc1 $f22, $f20, 432-256(%[tmp]) \n\t"
+ "pavgh $f20, $f12, $f24 \n\t"
+ "pavgh $f22, $f14, $f26 \n\t"
+ "gssqc1 $f22, $f20, 432-304(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-288(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 432-256(%[tmp]) \n\t"
+ "psubh $f20, $f20, $f28 \n\t"
+ "psubh $f22, $f22, $f30 \n\t"
+ "psubh $f20, $f20, $f0 \n\t"
+ "psubh $f22, $f22, $f2 \n\t"
+ "gssqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 432-240(%[tmp]) \n\t"
+ "psubh $f20, $f24, $f12 \n\t"
+ "psubh $f22, $f26, $f14 \n\t"
+ "gssqc1 $f26, $f24, 432-32(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f0 \n\t"
+ "psubh $f26, $f26, $f2 \n\t"
+ "gssqc1 $f22, $f20, 432-384(%[tmp]) \n\t"
+ WELS_AbsH($f28, $f30, $f20, $f22, $f28, $f30)
+ "gslqc1 $f22, $f20, 432-112(%[tmp]) \n\t"
+ "pcmpgth $f20, $f20, $f28 \n\t"
+ "pcmpgth $f22, $f22, $f30 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+ "pcmpgth $f28, $f16, $f24 \n\t"
+ "pcmpgth $f30, $f18, $f26 \n\t"
+
+ "xor $f0, $f0, $f0 \n\t"
+ "and $f20, $f20, $f28 \n\t"
+ "and $f22, $f22, $f30 \n\t"
+ "psubh $f24, $f12, $f8 \n\t"
+ "psubh $f26, $f14, $f10 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+ "pcmpgth $f28, $f16, $f24 \n\t"
+ "pcmpgth $f30, $f18, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-400(%[tmp]) \n\t"
+ "and $f20, $f20, $f28 \n\t"
+ "and $f22, $f22, $f30 \n\t"
+ "pcmpgth $f28, $f24, $f0 \n\t"
+ "pcmpgth $f30, $f26, $f0 \n\t"
+ "pcmpeqh $f24, $f24, $f0 \n\t"
+ "pcmpeqh $f26, $f26, $f0 \n\t"
+ "or $f28, $f28, $f24 \n\t"
+ "or $f30, $f30, $f26 \n\t"
+ "and $f20, $f20, $f28 \n\t"
+ "and $f22, $f22, $f30 \n\t"
+ "gssqc1 $f22, $f20, 432-320(%[tmp]) \n\t"
+ "dmtc1 %[pTC], $f20 \n\t"
+ "punpckhhw $f26, $f20, $f20 \n\t"
+ "punpcklhw $f24, $f20, $f20 \n\t"
+ "punpcklwd $f20, $f24, $f24 \n\t"
+ "mov.d $f22, $f20 \n\t"
+ "gssqc1 $f22, $f20, 432-336(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
+ "psubh $f24, $f0, $f20 \n\t"
+ "dli $11, 0x2 \n\t"
+ "psubh $f26, $f0, $f22 \n\t"
+ "dmtc1 $11, $f28 \n\t"
+ "gslqc1 $f22, $f20, 432-384(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 432-240(%[tmp]) \n\t"
+ "psllh $f20, $f20, $f28 \n\t"
+ "psllh $f22, $f22, $f28 \n\t"
+ "psubh $f28, $f8, $f0 \n\t"
+ "psubh $f30, $f10, $f2 \n\t"
+ "paddh $f28, $f28, $f20 \n\t"
+ "paddh $f30, $f30, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-336(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f20 \n\t"
+ "paddh $f30, $f30, $f22 \n\t"
+ "dli $11, 0x3 \n\t"
+ "dmtc1 $11, $f20 \n\t"
+ "psrah $f28, $f28, $f20 \n\t"
+ "psrah $f30, $f30, $f20 \n\t"
+ "gslqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
+ "pmaxsh $f24, $f24, $f28 \n\t"
+ "pmaxsh $f26, $f26, $f30 \n\t"
+ "gslqc1 $f2, $f0, 432-320(%[tmp]) \n\t"
+ "pminsh $f20, $f20, $f24 \n\t"
+ "pminsh $f22, $f22, $f26 \n\t"
+
+ "and $f20, $f20, $f0 \n\t"
+ "and $f22, $f22, $f2 \n\t"
+ "gslqc1 $f26, $f24, 432-400(%[tmp]) \n\t"
+ "gssqc1 $f22, $f20, 432-64(%[tmp]) \n\t"
+ "xor $f0, $f0, $f0 \n\t"
+ "gssqc1 $f26, $f24, 432-384(%[tmp]) \n\t"
+ "psubh $f20, $f0, $f24 \n\t"
+ "psubh $f22, $f0, $f26 \n\t"
+ "gssqc1 $f22, $f20, 432-368(%[tmp]) \n\t"
+ "mov.d $f24, $f20 \n\t"
+ "mov.d $f26, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f28 \n\t"
+ "paddh $f22, $f22, $f30 \n\t"
+ "paddh $f28, $f8, $f8 \n\t"
+ "paddh $f30, $f10, $f10 \n\t"
+ "psubh $f20, $f20, $f28 \n\t"
+ "psubh $f22, $f22, $f30 \n\t"
+ "dli $11, 0x1 \n\t"
+ "dmtc1 $11, $f28 \n\t"
+ "psrah $f20, $f20, $f28 \n\t"
+ "psrah $f22, $f22, $f28 \n\t"
+ "pmaxsh $f24, $f24, $f20 \n\t"
+ "pmaxsh $f26, $f26, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-384(%[tmp]) \n\t"
+ "pminsh $f20, $f20, $f24 \n\t"
+ "pminsh $f22, $f22, $f26 \n\t"
+
+ "gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-288(%[tmp]) \n\t"
+ "and $f20, $f20, $f24 \n\t"
+ "and $f22, $f22, $f26 \n\t"
+ "and $f20, $f20, $f28 \n\t"
+ "and $f22, $f22, $f30 \n\t"
+ "gslqc1 $f26, $f24, 432-240(%[tmp]) \n\t"
+ "gssqc1 $f22, $f20, 432-96(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 432-352(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f28 \n\t"
+ "paddh $f22, $f22, $f30 \n\t"
+ "paddh $f28, $f24, $f24 \n\t"
+ "paddh $f30, $f26, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-368(%[tmp]) \n\t"
+ "dli $11, 0x1 \n\t"
+ "psubh $f20, $f20, $f28 \n\t"
+ "dmtc1 $11, $f28 \n\t"
+ "psubh $f22, $f22, $f30 \n\t"
+
+ "psrah $f20, $f20, $f28 \n\t"
+ "psrah $f22, $f22, $f28 \n\t"
+ "gslqc1 $f30, $f28, 0x0(%[iStride]) \n\t"
+ "pmaxsh $f24, $f24, $f20 \n\t"
+ "pmaxsh $f26, $f26, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
+ "pminsh $f20, $f20, $f24 \n\t"
+ "pminsh $f22, $f22, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t"
+ "and $f20, $f20, $f24 \n\t"
+ "and $f22, $f22, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-256(%[tmp]) \n\t"
+ "and $f20, $f20, $f24 \n\t"
+ "and $f22, $f22, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x0($9) \n\t"
+ "punpcklbh $f28, $f30, $f0 \n\t"
+ "punpckhbh $f30, $f30, $f0 \n\t"
+ "gssqc1 $f30, $f28, 432-352(%[tmp]) \n\t"
+
+ "gslqc1 $f30, $f28, 0x0($12) \n\t"
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "gssqc1 $f22, $f20, 432-48(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 0x0($14) \n\t"
+ "gssqc1 $f26, $f24, 432-368(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x0($13) \n\t"
+ "punpcklbh $f28, $f30, $f0 \n\t"
+ "punpckhbh $f30, $f30, $f0 \n\t"
+ "punpcklbh $f20, $f22, $f0 \n\t"
+ "punpckhbh $f22, $f22, $f0 \n\t"
+ "gssqc1 $f30, $f28, 432-384(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "gssqc1 $f26, $f24, 432-400(%[tmp]) \n\t"
+
+ "gslqc1 $f30, $f28, 432-400(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x0(%[pPix]) \n\t"
+ "psubh $f28, $f28, $f20 \n\t"
+ "psubh $f30, $f30, $f22 \n\t"
+ "gssqc1 $f22, $f20, 432-16(%[tmp]) \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "pcmpgth $f20, $f16, $f28 \n\t"
+ "pcmpgth $f22, $f18, $f30 \n\t"
+ "gslqc1 $f30, $f28, 432-384(%[tmp]) \n\t"
+ "gssqc1 $f22, $f20, 432-288(%[tmp]) \n\t"
+
+ "psubh $f28, $f24, $f28 \n\t"
+ "psubh $f30, $f26, $f30 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
+ "pcmpgth $f20, $f16, $f28 \n\t"
+ "pcmpgth $f22, $f18, $f30 \n\t"
+ "gssqc1 $f22, $f20, 432-256(%[tmp]) \n\t"
+
+ "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 432-80(%[tmp]) \n\t"
+ "pavgh $f20, $f20, $f24 \n\t"
+ "pavgh $f22, $f22, $f26 \n\t"
+ "gssqc1 $f22, $f20, 432-304(%[tmp]) \n\t"
+
+ "gslqc1 $f22, $f20, 432-288(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-256(%[tmp]) \n\t"
+ "psubh $f20, $f4, $f20 \n\t"
+ "psubh $f22, $f6, $f22 \n\t"
+ "psubh $f20, $f20, $f28 \n\t"
+ "psubh $f22, $f22, $f30 \n\t"
+ "gssqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-352(%[tmp]) \n\t"
+ "psubh $f20, $f24, $f20 \n\t"
+ "psubh $f22, $f26, $f22 \n\t"
+ "psubh $f24, $f24, $f28 \n\t"
+ "psubh $f26, $f26, $f30 \n\t"
+ "gssqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
+ "mov.d $f28, $f20 \n\t"
+ "mov.d $f30, $f22 \n\t"
+ WELS_AbsH($f28, $f30, $f20, $f22, $f0, $f2)
+ "gslqc1 $f22, $f20, 432-112(%[tmp]) \n\t"
+ "pcmpgth $f20, $f20, $f28 \n\t"
+ "pcmpgth $f22, $f22, $f30 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+ "pcmpgth $f28, $f16, $f24 \n\t"
+ "pcmpgth $f30, $f18, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-368(%[tmp]) \n\t"
+
+ "and $f20, $f20, $f28 \n\t"
+ "and $f22, $f22, $f30 \n\t"
+ "gslqc1 $f30, $f28, 432-400(%[tmp]) \n\t"
+ "psubh $f28, $f28, $f24 \n\t"
+ "psubh $f30, $f30, $f26 \n\t"
+ "gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f0 \n\t"
+ "psubh $f26, $f26, $f2 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f0, $f2)
+ "pcmpgth $f16, $f16, $f28 \n\t"
+ "pcmpgth $f18, $f18, $f30 \n\t"
+ "gslqc1 $f30, $f28, 432-96(%[tmp]) \n\t"
+ "and $f20, $f20, $f16 \n\t"
+ "and $f22, $f22, $f18 \n\t"
+ "xor $f0, $f0, $f0 \n\t"
+
+ "paddh $f8, $f8, $f28 \n\t"
+ "paddh $f10, $f10, $f30 \n\t"
+ "pcmpgth $f16, $f4, $f0 \n\t"
+ "pcmpgth $f18, $f6, $f0 \n\t"
+ "pcmpeqh $f28, $f4, $f0 \n\t"
+ "pcmpeqh $f30, $f6, $f0 \n\t"
+ "or $f16, $f16, $f28 \n\t"
+ "or $f18, $f18, $f30 \n\t"
+ "and $f20, $f20, $f16 \n\t"
+ "and $f22, $f22, $f18 \n\t"
+ "gslqc1 $f18, $f16, 432-224(%[tmp]) \n\t"
+ "gssqc1 $f22, $f20, 432-320(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
+ "dli $11, 0x2 \n\t"
+ "psubh $f28, $f0, $f16 \n\t"
+ "psubh $f30, $f0, $f18 \n\t"
+ "psubh $f2, $f0, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "dmfc1 %[iAlpha], $f28 \n\t"
+ "dmtc1 $11, $f28 \n\t"
+ "psllh $f20, $f20, $f28 \n\t"
+ "psllh $f22, $f22, $f28 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-336(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-368(%[tmp]) \n\t"
+ "dli $11, 0x3 \n\t"
+ "gssqc1 $f2, $f0, 432-336(%[tmp]) \n\t"
+ "dmfc1 %[iAlpha], $f0 \n\t"
+ "dmtc1 $11, $f0 \n\t"
+ "psrah $f24, $f24, $f0 \n\t"
+ "psrah $f26, $f26, $f0 \n\t"
+ "dmtc1 %[iAlpha], $f0 \n\t"
+ "pmaxsh $f28, $f28, $f24 \n\t"
+ "pmaxsh $f30, $f30, $f26 \n\t"
+ "pminsh $f16, $f16, $f28 \n\t"
+ "pminsh $f18, $f18, $f30 \n\t"
+ "gslqc1 $f30, $f28, 432-320(%[tmp]) \n\t"
+ "and $f16, $f16, $f28 \n\t"
+ "and $f18, $f18, $f30 \n\t"
+ "mov.d $f24, $f0 \n\t"
+ "mov.d $f26, $f2 \n\t"
+ "gslqc1 $f2, $f0, 432-16(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t"
+ "paddh $f0, $f0, $f28 \n\t"
+ "paddh $f2, $f2, $f30 \n\t"
+ "gssqc1 $f18, $f16, 432-272(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 432-368(%[tmp]) \n\t"
+ "dli $11, 0x1 \n\t"
+ "paddh $f16, $f16, $f16 \n\t"
+ "paddh $f18, $f18, $f18 \n\t"
+ "psubh $f0, $f0, $f16 \n\t"
+ "psubh $f2, $f2, $f18 \n\t"
+
+ "dmtc1 $11, $f28 \n\t"
+ "gslqc1 $f18, $f16, 432-64(%[tmp]) \n\t"
+ "psrah $f0, $f0, $f28 \n\t"
+ "psrah $f2, $f2, $f28 \n\t"
+ "pmaxsh $f24, $f24, $f0 \n\t"
+ "pmaxsh $f26, $f26, $f2 \n\t"
+ "gslqc1 $f2, $f0, 432-400(%[tmp]) \n\t"
+ "pminsh $f28, $f4, $f24 \n\t"
+ "pminsh $f30, $f6, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t"
+ "and $f28, $f28, $f24 \n\t"
+ "and $f30, $f30, $f26 \n\t"
+ "dmfc1 %[iAlpha], $f24 \n\t"
+ "dmfc1 %[iBeta], $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-288(%[tmp]) \n\t"
+ "and $f28, $f28, $f24 \n\t"
+ "and $f30, $f30, $f26 \n\t"
+ "paddh $f20, $f20, $f28 \n\t"
+ "paddh $f22, $f22, $f30 \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f20, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
+ "paddh $f0, $f0, $f20 \n\t"
+ "paddh $f2, $f2, $f22 \n\t"
+ "paddh $f12, $f12, $f16 \n\t"
+ "paddh $f14, $f14, $f18 \n\t"
+ "packushb $f12, $f12, $f14 \n\t"
+ "packushb $f14, $f0, $f2 \n\t"
+
+ "gslqc1 $f2, $f0, 432-32(%[tmp]) \n\t"
+ "psubh $f0, $f0, $f16 \n\t"
+ "psubh $f2, $f2, $f18 \n\t"
+ "gslqc1 $f18, $f16, 432-80(%[tmp]) \n\t"
+ "psubh $f16, $f16, $f20 \n\t"
+ "gslqc1 $f26, $f24, 432-48(%[tmp]) \n\t"
+ "psubh $f18, $f18, $f22 \n\t"
+
+ "gslqc1 $f22, $f20, 432-240(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f24 \n\t"
+ "paddh $f22, $f22, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-304(%[tmp]) \n\t"
+ "packushb $f0, $f0, $f2 \n\t"
+ "packushb $f2, $f16, $f18 \n\t"
+ "gslqc1 $f18, $f16, 432-384(%[tmp]) \n\t"
+ "paddh $f16, $f16, $f24 \n\t"
+ "paddh $f18, $f18, $f26 \n\t"
+ "gssqc1 $f2, $f0, 480-208(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t"
+ "mov.d $f28, $f0 \n\t"
+ "mov.d $f30, $f2 \n\t"
+ "paddh $f0, $f0, $f0 \n\t"
+ "paddh $f2, $f2, $f2 \n\t"
+
+ "dmtc1 %[iAlpha], $f24 \n\t"
+ "dmtc1 %[iBeta], $f26 \n\t"
+
+ "psubh $f16, $f16, $f0 \n\t"
+ "psubh $f18, $f18, $f2 \n\t"
+ "dli $11, 0x1 \n\t"
+ "gslqc1 $f2, $f0, 432-336(%[tmp]) \n\t"
+ "gssqc1 $f10, $f8, 0x0($9) \n\t"
+ "dmtc1 $11, $f8 \n\t"
+ "psrah $f16, $f16, $f8 \n\t"
+ "psrah $f18, $f18, $f8 \n\t"
+ "pmaxsh $f0, $f0, $f16 \n\t"
+ "pmaxsh $f2, $f2, $f18 \n\t"
+ "pminsh $f4, $f4, $f0 \n\t"
+ "pminsh $f6, $f6, $f2 \n\t"
+ "gslqc1 $f2, $f0, 480-208(%[tmp]) \n\t"
+
+ "gslqc1 $f10, $f8, 428-256+4(%[tmp]) \n\t"
+ "and $f4, $f4, $f24 \n\t"
+ "and $f6, $f6, $f26 \n\t"
+ "and $f4, $f4, $f8 \n\t"
+ "and $f6, $f6, $f10 \n\t"
+ "gssqc1 $f14, $f12, 0x0($13) \n\t"
+ "paddh $f28, $f28, $f4 \n\t"
+ "paddh $f30, $f30, $f6 \n\t"
+ "packushb $f20, $f20, $f22 \n\t"
+ "packushb $f22, $f28, $f30 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPix]) \n\t"
+ "gssqc1 $f22, $f20, 0x0(%[iStride]) \n\t"
+ : [pPix]"+&r"((unsigned char *)pPix)
+ : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta),
+ [pTC]"r"((unsigned char *)pTC), [tmp]"r"((unsigned char *)tmp)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
+ "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+ "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
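+/* Gather 16 rows of 8 pixels from pPixY, transpose them with
+ * MMI_TransTwo8x8B, and write 8 contiguous 16-byte rows to pDst; together
+ * with the V2H routine below this lets one filtering direction reuse the
+ * other's implementation. */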
+void DeblockLumaTransposeH2V_mmi(uint8_t *pPixY, int32_t iStride,
+ uint8_t *pDst) {
+ BACKUP_REG;
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "dsll $8, %[iStride], 0x3 \n\t"
+ "daddu $8, $8, %[pPixY] \n\t"
+
+ "daddu $9, %[pPixY], %[iStride] \n\t"
+ "daddu $10, $8, %[iStride] \n\t"
+ "gsldlc1 $f0, 0x7(%[pPixY]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldlc1 $f4, 0x7($9) \n\t"
+ "gsldlc1 $f6, 0x7($10) \n\t"
+ "gsldrc1 $f0, 0x0(%[pPixY]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ "gsldrc1 $f4, 0x0($9) \n\t"
+ "gsldrc1 $f6, 0x0($10) \n\t"
+ "daddu %[pPixY], $9, %[iStride] \n\t"
+ "daddu $8, $10, %[iStride] \n\t"
+ "daddu $9, %[pPixY], %[iStride] \n\t"
+ "daddu $10, $8, %[iStride] \n\t"
+ "gsldlc1 $f8, 0x7(%[pPixY]) \n\t"
+ "gsldlc1 $f10, 0x7($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0x7($10) \n\t"
+ "gsldrc1 $f8, 0x0(%[pPixY]) \n\t"
+ "gsldrc1 $f10, 0x0($8) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x0($10) \n\t"
+
+ "daddu %[pPixY], $9, %[iStride] \n\t"
+ "daddu $8, $10, %[iStride] \n\t"
+ "daddu $9, %[pPixY], %[iStride] \n\t"
+ "daddu $10, $8, %[iStride] \n\t"
+ "gsldlc1 $f16, 0x7(%[pPixY]) \n\t"
+ "gsldlc1 $f18, 0x7($8) \n\t"
+ "gsldlc1 $f20, 0x7($9) \n\t"
+ "gsldlc1 $f22, 0x7($10) \n\t"
+ "gsldrc1 $f16, 0x0(%[pPixY]) \n\t"
+ "gsldrc1 $f18, 0x0($8) \n\t"
+ "gsldrc1 $f20, 0x0($9) \n\t"
+ "gsldrc1 $f22, 0x0($10) \n\t"
+ "daddu %[pPixY], $9, %[iStride] \n\t"
+ "daddu $8, $10, %[iStride] \n\t"
+ "daddu $9, %[pPixY], %[iStride] \n\t"
+ "daddu $10, $8, %[iStride] \n\t"
+ "gsldlc1 $f24, 0x7(%[pPixY]) \n\t"
+ "gsldlc1 $f26, 0x7($8) \n\t"
+
+ "gsldlc1 $f28, 0x7($9) \n\t"
+ "gsldlc1 $f30, 0x7($10) \n\t"
+ "gsldrc1 $f24, 0x0(%[pPixY]) \n\t"
+ "gsldrc1 $f26, 0x0($8) \n\t"
+ "gsldrc1 $f28, 0x0($9) \n\t"
+ "gsldrc1 $f30, 0x0($10) \n\t"
+
+ MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
+ $f14, $f16, $f18, $f20, $f22, $f24,
+ $f26, $f28, $f30, $9, $10)
+
+ "gssqc1 $f18, $f16, 0x0(%[pDst]) \n\t"
+ "gssqc1 $f10, $f8, 0x10(%[pDst]) \n\t"
+ "gssqc1 $f14, $f12, 0x20(%[pDst]) \n\t"
+ "gssqc1 $f30, $f28, 0x30(%[pDst]) \n\t"
+ "gssqc1 $f22, $f20, 0x40(%[pDst]) \n\t"
+ "gssqc1 $f6, $f4, 0x50(%[pDst]) \n\t"
+ "gssqc1 $f26, $f24, 0x60(%[pDst]) \n\t"
+ "gssqc1 $f2, $f0, 0x70(%[pDst]) \n\t"
+ : [pPixY] "+&r"((unsigned char *)pPixY)
+ : [iStride] "r"((int)iStride), [pDst] "r"((unsigned char *)pDst)
+ : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+ "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
+ "$f30"
+ );
+ RECOVER_REG;
+}
+
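+/* Inverse of the transpose above: read the 8x16 buffer at pSrc, transpose,
+ * and scatter 16 rows of 8 pixels back to pPixY. */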
+void DeblockLumaTransposeV2H_mmi(uint8_t *pPixY, int32_t iStride,
+ uint8_t *pSrc) {
+ BACKUP_REG;
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pSrc]) \n\t"
+ "gslqc1 $f6, $f4, 0x10(%[pSrc]) \n\t"
+ "gslqc1 $f10, $f8, 0x20(%[pSrc]) \n\t"
+ "gslqc1 $f14, $f12, 0x30(%[pSrc]) \n\t"
+ "gslqc1 $f18, $f16, 0x40(%[pSrc]) \n\t"
+ "gslqc1 $f22, $f20, 0x50(%[pSrc]) \n\t"
+ "gslqc1 $f26, $f24, 0x60(%[pSrc]) \n\t"
+ "gslqc1 $f30, $f28, 0x70(%[pSrc]) \n\t"
+
+ MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
+ $f14, $f16, $f18, $f20, $f22, $f24,
+ $f26, $f28, $f30, $9, $10)
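+ /* back to sixteen 8-pixel rows for the scattered stores below */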
+
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f16, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f8, 0x7($8) \n\t"
+ "gssdrc1 $f16, 0x0(%[pPixY]) \n\t"
+ "gssdrc1 $f8, 0x0($8) \n\t"
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f12, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f28, 0x7($8) \n\t"
+ "gssdrc1 $f12, 0x0(%[pPixY]) \n\t"
+ "gssdrc1 $f28, 0x0($8) \n\t"
+
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f20, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f4, 0x7($8) \n\t"
+ "gssdrc1 $f20, 0x0(%[pPixY]) \n\t"
+ "gssdrc1 $f4, 0x0($8) \n\t"
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f24, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f0, 0x7($8) \n\t"
+ "gssdrc1 $f24, 0x0(%[pPixY]) \n\t"
+ "gssdrc1 $f0, 0x0($8) \n\t"
+
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f18, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f10, 0x7($8) \n\t"
+ "gssdrc1 $f18, 0x0(%[pPixY]) \n\t"
+ "gssdrc1 $f10, 0x0($8) \n\t"
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f14, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f30, 0x7($8) \n\t"
+ "gssdrc1 $f14, 0x0(%[pPixY]) \n\t"
+ "gssdrc1 $f30, 0x0($8) \n\t"
+
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f22, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f6, 0x7($8) \n\t"
+ "gssdrc1 $f22, 0x0(%[pPixY]) \n\t"
+ "gssdrc1 $f6, 0x0($8) \n\t"
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f26, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f2, 0x7($8) \n\t"
+ "gssdrc1 $f26, 0x0(%[pPixY]) \n\t"
+ "gssdrc1 $f2, 0x0($8) \n\t"
+ : [pPixY] "+&r"((unsigned char *)pPixY)
+ : [iStride] "r"((int)iStride), [pSrc] "r"((unsigned char *)pSrc)
+ : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+ "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
+ "$f30"
+ );
+ RECOVER_REG;
+}
+
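+/* Strong (bS == 4) luma deblocking across a horizontal edge, 16 pixels per
+ * call. For reference, the scalar filter this vector code corresponds to
+ * (H.264 clause 8.7; p side shown, q side is symmetric):
+ *   if (|p0-q0| < iAlpha && |p1-p0| < iBeta && |q1-q0| < iBeta) {
+ *     if (|p2-p0| < iBeta && |p0-q0| < (iAlpha >> 2) + 2)
+ *       p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3;  // strong variant
+ *     else
+ *       p0' = (2*p1 + p0 + q1 + 2) >> 2;                // fallback
+ *   }
+ * tmp[] holds the unpacked 16-bit rows plus the comparison masks used to
+ * select between the two variants branchlessly. */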
+void DeblockLumaEq4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
+ int32_t iBeta) {
+ unsigned char tmp[720] __attribute__((aligned(32)));
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "dsll $11, %[iStride], 0x2 \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+ "daddu $14, %[iStride], %[pPix] \n\t"
+ "dsubu $8, %[pPix], $11 \n\t"
+ "gslqc1 $f14, $f12, 0x0($8) \n\t"
+ "gslqc1 $f22, $f20, 0x0(%[pPix]) \n\t"
+ "daddu $9, %[iStride], %[iStride] \n\t"
+ "daddu $10, $9, %[iStride] \n\t"
+ "move $12, $9 \n\t"
+ "dsubu $8, %[pPix], $9 \n\t"
+ "gslqc1 $f6, $f4, 0x0($8) \n\t"
+ "dsubu $9, %[pPix], %[iStride] \n\t"
+ "gslqc1 $f18, $f16, 0x0($9) \n\t"
+ "daddu $13, %[iStride], %[pPix] \n\t"
+
+ "move %[iStride], $12 \n\t"
+ "daddu $15, $12, %[pPix] \n\t"
+
+ "daddu $12, %[pPix], $10 \n\t"
+ "dsubu $11, %[pPix], $10 \n\t"
+
+ "gslqc1 $f26, $f24, 0x0($11) \n\t"
+ "daddu %[iStride], %[iStride], %[pPix] \n\t"
+ "dmtc1 %[iAlpha], $f0 \n\t"
+
+ "punpcklhw $f28, $f0, $f0 \n\t"
+ "punpcklwd $f0, $f28, $f28 \n\t"
+ "mov.d $f2, $f0 \n\t"
+ "gssqc1 $f2, $f0, 640-320(%[tmp]) \n\t"
+ "dmtc1 %[iBeta], $f0 \n\t"
+ "gsldxc1 $f10, 0x0($15, $0) \n\t"
+ "punpcklhw $f28, $f0, $f0 \n\t"
+ "punpcklwd $f0, $f28, $f28 \n\t"
+ "punpckhbh $f30, $f10, $f8 \n\t"
+ "mov.d $f2, $f0 \n\t"
+
+ "punpcklbh $f28, $f10, $f8 \n\t"
+ "gssqc1 $f2, $f0, 640-512(%[tmp]) \n\t"
+ "gssqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
+ "mov.d $f0, $f4 \n\t"
+ "gssqc1 $f22, $f20, 704-272(%[tmp]) \n\t"
+ "gssqc1 $f6, $f4, 672-272(%[tmp]) \n\t"
+ "mov.d $f4, $f16 \n\t"
+ "punpckhbh $f22, $f20, $f8 \n\t"
+ "punpcklbh $f20, $f20, $f8 \n\t"
+ "punpckhbh $f6, $f4, $f8 \n\t"
+ "punpcklbh $f4, $f4, $f8 \n\t"
+
+ "psubh $f28, $f20, $f4 \n\t"
+ "psubh $f30, $f22, $f6 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f2, $f10)
+ "gssqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
+ "punpckhbh $f2, $f0, $f8 \n\t"
+ "punpcklbh $f0, $f0, $f8 \n\t"
+ "gssqc1 $f18, $f16, 688-272(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 0x0($14) \n\t"
+ "gssqc1 $f2, $f0, 640-480(%[tmp]) \n\t"
+
+ "psubh $f28, $f4, $f0 \n\t"
+ "psubh $f30, $f6, $f2 \n\t"
+
+ "gslqc1 $f2, $f0, 640-512(%[tmp]) \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f18, $f10)
+ "punpckhbh $f18, $f16, $f8 \n\t"
+ "punpcklbh $f16, $f16, $f8 \n\t"
+ "pcmpgth $f0, $f0, $f28 \n\t"
+ "pcmpgth $f2, $f2, $f30 \n\t"
+ "gssqc1 $f18, $f16, 640-384(%[tmp]) \n\t"
+ "psubh $f28, $f20, $f16 \n\t"
+ "psubh $f30, $f22, $f18 \n\t"
+ "gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 656-272(%[tmp]) \n\t"
+ "punpckhbh $f26, $f24, $f8 \n\t"
+ "punpcklbh $f24, $f24, $f8 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "gssqc1 $f26, $f24, 640-368(%[tmp]) \n\t"
+ "gssqc1 $f6, $f4, 640-144(%[tmp]) \n\t"
+ "gssqc1 $f22, $f20, 640-400(%[tmp]) \n\t"
+ "pcmpgth $f16, $f16, $f28 \n\t"
+ "pcmpgth $f18, $f18, $f30 \n\t"
+ "and $f0, $f0, $f16 \n\t"
+ "and $f2, $f2, $f18 \n\t"
+ "gslqc1 $f18, $f16, 640-320(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
+ "dli %[iAlpha], 0x2 \n\t"
+ "dli %[iBeta], 0x2 \n\t"
+ "pcmpgth $f16, $f16, $f28 \n\t"
+ "pcmpgth $f18, $f18, $f30 \n\t"
+ "and $f0, $f0, $f16 \n\t"
+ "and $f2, $f2, $f18 \n\t"
+ "dmtc1 %[iAlpha], $f16 \n\t"
+ "dmtc1 %[iBeta], $f10 \n\t"
+ "gssqc1 $f2, $f0, 736-272(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 640-320(%[tmp]) \n\t"
+
+ "punpcklhw $f28, $f16, $f16 \n\t"
+ "psrah $f16, $f0, $f10 \n\t"
+ "psrah $f18, $f2, $f10 \n\t"
+ "punpcklwd $f28, $f28, $f28 \n\t"
+ "mov.d $f30, $f28 \n\t"
+ "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
+ "paddh $f16, $f16, $f28 \n\t"
+ "paddh $f18, $f18, $f30 \n\t"
+ "gssqc1 $f18, $f16, 640-576(%[tmp]) \n\t"
+ "pcmpgth $f16, $f16, $f8 \n\t"
+ "pcmpgth $f18, $f18, $f10 \n\t"
+ "gssqc1 $f18, $f16, 640-560(%[tmp]) \n\t"
+
+ "gssqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t"
+ "psubh $f28, $f4, $f24 \n\t"
+ "psubh $f30, $f6, $f26 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
+ "pcmpgth $f16, $f16, $f28 \n\t"
+ "pcmpgth $f18, $f18, $f30 \n\t"
+
+ "gslqc1 $f2, $f0, 640-416(%[tmp]) \n\t"
+ "and $f16, $f16, $f8 \n\t"
+ "and $f18, $f18, $f10 \n\t"
+ "gssqc1 $f18, $f16, 640-544(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t"
+ "psubh $f28, $f20, $f0 \n\t"
+ "psubh $f30, $f22, $f2 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
+ "pcmpgth $f16, $f16, $f28 \n\t"
+ "pcmpgth $f18, $f18, $f30 \n\t"
+
+ "and $f16, $f16, $f8 \n\t"
+ "and $f18, $f18, $f10 \n\t"
+ "gssqc1 $f18, $f16, 640-560(%[tmp]) \n\t"
+
+ "gslqc1 $f18, $f16, 640-544(%[tmp]) \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+ "pandn $f16, $f16, $f24 \n\t"
+ "dli %[iAlpha], 0x4 \n\t"
+ "pandn $f18, $f18, $f26 \n\t"
+ "gssqc1 $f18, $f16, 640-16(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f16 \n\t"
+ "punpcklhw $f28, $f16, $f16 \n\t"
+ "dli %[iAlpha], 0x1 \n\t"
+ "punpckhbh $f18, $f12, $f8 \n\t"
+ "dmtc1 %[iAlpha], $f30 \n\t"
+ "punpcklbh $f16, $f12, $f8 \n\t"
+ "psllh $f16, $f16, $f30 \n\t"
+ "psllh $f18, $f18, $f30 \n\t"
+ "paddh $f16, $f16, $f24 \n\t"
+ "paddh $f18, $f18, $f26 \n\t"
+ "gslqc1 $f2, $f0, 640-480(%[tmp]) \n\t"
+ "paddh $f16, $f16, $f24 \n\t"
+ "paddh $f18, $f18, $f26 \n\t"
+ "paddh $f16, $f16, $f24 \n\t"
+ "paddh $f18, $f18, $f26 \n\t"
+ "paddh $f16, $f16, $f0 \n\t"
+ "paddh $f18, $f18, $f2 \n\t"
+
+ "gslqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
+ "punpcklwd $f28, $f28, $f28 \n\t"
+ "mov.d $f30, $f28 \n\t"
+ "paddh $f16, $f16, $f4 \n\t"
+ "paddh $f18, $f18, $f6 \n\t"
+ "gssqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
+ "paddh $f16, $f16, $f20 \n\t"
+ "paddh $f18, $f18, $f22 \n\t"
+ "paddh $f16, $f16, $f28 \n\t"
+ "paddh $f18, $f18, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 640-384(%[tmp]) \n\t"
+ "pandn $f24, $f24, $f28 \n\t"
+ "pandn $f26, $f26, $f30 \n\t"
+ "gssqc1 $f26, $f24, 640-80(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x0($12) \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "punpckhbh $f26, $f24, $f8 \n\t"
+ "punpcklbh $f24, $f24, $f8 \n\t"
+ "psllh $f24, $f24, $f10 \n\t"
+ "psllh $f26, $f26, $f10 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "paddh $f24, $f24, $f0 \n\t"
+ "paddh $f26, $f26, $f2 \n\t"
+
+ "dli %[iAlpha], 0x3 \n\t"
+ "gslqc1 $f30, $f28, 640-480(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 640-592(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "paddh $f24, $f24, $f4 \n\t"
+ "paddh $f26, $f26, $f6 \n\t"
+ "paddh $f24, $f24, $f0 \n\t"
+ "paddh $f26, $f26, $f2 \n\t"
+ "gslqc1 $f2, $f0, 640-560(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "psrah $f24, $f24, $f10 \n\t"
+ "psrah $f26, $f26, $f10 \n\t"
+ "and $f24, $f24, $f0 \n\t"
+ "and $f26, $f26, $f2 \n\t"
+ "gssqc1 $f26, $f24, 640-112(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
+ "pandn $f24, $f24, $f28 \n\t"
+ "pandn $f26, $f26, $f30 \n\t"
+ "gssqc1 $f26, $f24, 640-336(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 640-528(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-368(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 640-544(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "psrah $f16, $f16, $f10 \n\t"
+ "psrah $f18, $f18, $f10 \n\t"
+ "and $f16, $f16, $f0 \n\t"
+ "and $f18, $f18, $f2 \n\t"
+ "gslqc1 $f2, $f0, 640-624(%[tmp]) \n\t"
+ "paddh $f28, $f4, $f20 \n\t"
+ "paddh $f30, $f6, $f22 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "paddh $f24, $f24, $f0 \n\t"
+ "paddh $f26, $f26, $f2 \n\t"
+ "gslqc1 $f30, $f28, 640-528(%[tmp]) \n\t"
+ "dli %[iAlpha], 0x2 \n\t"
+
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "paddh $f20, $f20, $f4 \n\t"
+ "paddh $f22, $f22, $f6 \n\t"
+ "psrah $f24, $f24, $f10 \n\t"
+ "psrah $f26, $f26, $f10 \n\t"
+ "and $f28, $f28, $f24 \n\t"
+ "and $f30, $f30, $f26 \n\t"
+
+ "gslqc1 $f26, $f24, 640-384(%[tmp]) \n\t"
+ "gssqc1 $f30, $f28, 640-64(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
+ "pandn $f28, $f28, $f24 \n\t"
+ "pandn $f30, $f30, $f26 \n\t"
+ "gssqc1 $f30, $f28, 640-304(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f24 \n\t"
+ "paddh $f30, $f30, $f26 \n\t"
+ "paddh $f28, $f28, $f20 \n\t"
+ "paddh $f30, $f30, $f22 \n\t"
+ "paddh $f28, $f28, $f8 \n\t"
+ "paddh $f30, $f30, $f10 \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "gslqc1 $f22, $f20, 640-560(%[tmp]) \n\t"
+ "psrah $f28, $f28, $f10 \n\t"
+ "psrah $f30, $f30, $f10 \n\t"
+ "and $f20, $f20, $f28 \n\t"
+ "and $f22, $f22, $f30 \n\t"
+ "gssqc1 $f22, $f20, 640-32(%[tmp]) \n\t"
+
+ "gslqc1 $f22, $f20, 640-480(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 640-592(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t"
+ "paddh $f28, $f20, $f20 \n\t"
+ "paddh $f30, $f22, $f22 \n\t"
+ "paddh $f20, $f4, $f24 \n\t"
+ "paddh $f22, $f6, $f26 \n\t"
+ "paddh $f24, $f24, $f0 \n\t"
+ "paddh $f26, $f26, $f2 \n\t"
+ "paddh $f28, $f28, $f20 \n\t"
+ "paddh $f30, $f30, $f22 \n\t"
+ "paddh $f28, $f28, $f8 \n\t"
+ "paddh $f30, $f30, $f10 \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "gslqc1 $f22, $f20, 640-544(%[tmp]) \n\t"
+ "psrah $f28, $f28, $f10 \n\t"
+ "psrah $f30, $f30, $f10 \n\t"
+ "dli %[iAlpha], 0x1 \n\t"
+ "pandn $f20, $f20, $f28 \n\t"
+ "pandn $f22, $f22, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-480(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f4 \n\t"
+ "paddh $f30, $f30, $f6 \n\t"
+ "gslqc1 $f6, $f4, 640-400(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f4 \n\t"
+ "paddh $f30, $f30, $f6 \n\t"
+ "gslqc1 $f6, $f4, 640-544(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "gssqc1 $f22, $f20, 640-352(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 640-368(%[tmp]) \n\t"
+ "psllh $f28, $f28, $f10 \n\t"
+ "psllh $f30, $f30, $f10 \n\t"
+ "dli %[iAlpha], 0x3 \n\t"
+ "paddh $f28, $f28, $f24 \n\t"
+ "paddh $f30, $f30, $f26 \n\t"
+ "paddh $f20, $f20, $f28 \n\t"
+ "paddh $f22, $f22, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+
+ "dli %[iAlpha], 0x2 \n\t"
+ "gslqc1 $f30, $f28, 640-400(%[tmp]) \n\t"
+ "psrah $f20, $f20, $f10 \n\t"
+ "psrah $f22, $f22, $f10 \n\t"
+ "and $f4, $f4, $f20 \n\t"
+ "and $f6, $f6, $f22 \n\t"
+ "gslqc1 $f22, $f20, 640-480(%[tmp]) \n\t"
+ "gssqc1 $f6, $f4, 640-96(%[tmp]) \n\t"
+ "gslqc1 $f6, $f4, 640-384(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-400(%[tmp]) \n\t"
+ "paddh $f24, $f4, $f4 \n\t"
+ "paddh $f26, $f6, $f6 \n\t"
+ "paddh $f4, $f4, $f8 \n\t"
+ "paddh $f6, $f6, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-144(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f20 \n\t"
+ "paddh $f30, $f30, $f22 \n\t"
+ "paddh $f4, $f4, $f8 \n\t"
+ "paddh $f6, $f6, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-592(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "paddh $f20, $f20, $f8 \n\t"
+ "paddh $f22, $f22, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "dmtc1 %[iAlpha], $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+ "dli %[iAlpha], 0x1 \n\t"
+ "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "psrah $f24, $f24, $f8 \n\t"
+ "psrah $f26, $f26, $f8 \n\t"
+ "psllh $f4, $f4, $f10 \n\t"
+ "psllh $f6, $f6, $f10 \n\t"
+ "paddh $f4, $f4, $f20 \n\t"
+ "paddh $f6, $f6, $f22 \n\t"
+ "dli %[iAlpha], 0x3 \n\t"
+
+ "gslqc1 $f22, $f20, 656-272(%[tmp]) \n\t"
+ "pandn $f28, $f28, $f24 \n\t"
+ "pandn $f30, $f30, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-416(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "paddh $f24, $f24, $f4 \n\t"
+ "paddh $f26, $f26, $f6 \n\t"
+ "gslqc1 $f6, $f4, 640-560(%[tmp]) \n\t"
+ "psrah $f24, $f24, $f10 \n\t"
+ "psrah $f26, $f26, $f10 \n\t"
+ "and $f4, $f4, $f24 \n\t"
+ "and $f6, $f6, $f26 \n\t"
+
+ "xor $f8, $f8, $f8 \n\t"
+ "gslqc1 $f26, $f24, 704-272(%[tmp]) \n\t"
+ "gssqc1 $f6, $f4, 640-128(%[tmp]) \n\t"
+ "gslqc1 $f6, $f4, 672-272(%[tmp]) \n\t"
+ "punpcklbh $f4, $f6, $f8 \n\t"
+ "punpckhbh $f6, $f6, $f8 \n\t"
+ "gssqc1 $f6, $f4, 640-448(%[tmp]) \n\t"
+ "gslqc1 $f6, $f4, 688-272(%[tmp]) \n\t"
+ "punpcklbh $f4, $f6, $f8 \n\t"
+ "punpckhbh $f6, $f6, $f8 \n\t"
+ "punpcklbh $f24, $f26, $f8 \n\t"
+ "punpckhbh $f26, $f26, $f8 \n\t"
+ "gssqc1 $f30, $f28, 640-288(%[tmp]) \n\t"
+ "punpcklbh $f20, $f22, $f8 \n\t"
+ "punpckhbh $f22, $f22, $f8 \n\t"
+ "gslqc1 $f30, $f28, 0x0($14) \n\t"
+ "gssqc1 $f6, $f4, 640-496(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 640-432(%[tmp]) \n\t"
+
+ "gsldxc1 $f0, 0x8($15, $0) \n\t"
+ "punpcklbh $f28, $f30, $f8 \n\t"
+ "punpckhbh $f30, $f30, $f8 \n\t"
+ "gssqc1 $f30, $f28, 640-464(%[tmp]) \n\t"
+
+ "punpcklbh $f28, $f0, $f8 \n\t"
+ "punpckhbh $f30, $f0, $f8 \n\t"
+ "gslqc1 $f10, $f8, 640-464(%[tmp]) \n\t"
+ "gssqc1 $f30, $f28, 640-528(%[tmp]) \n\t"
+
+ "psubh $f28, $f24, $f4 \n\t"
+ "psubh $f30, $f26, $f6 \n\t"
+ "psubh $f24, $f24, $f8 \n\t"
+ "psubh $f26, $f26, $f10 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "gslqc1 $f10, $f8, 640-16(%[tmp]) \n\t"
+ "gssqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
+ "or $f16, $f16, $f8 \n\t"
+ "or $f18, $f18, $f10 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+ "gslqc1 $f30, $f28, 640-448(%[tmp]) \n\t"
+ "psubh $f28, $f4, $f28 \n\t"
+ "psubh $f30, $f6, $f30 \n\t"
+
+ "gslqc1 $f2, $f0, 640-512(%[tmp]) \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "pcmpgth $f4, $f0, $f28 \n\t"
+ "pcmpgth $f6, $f2, $f30 \n\t"
+ "pcmpgth $f28, $f0, $f24 \n\t"
+ "pcmpgth $f30, $f2, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-320(%[tmp]) \n\t"
+ "and $f4, $f4, $f28 \n\t"
+ "and $f6, $f6, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
+ "pcmpgth $f24, $f24, $f28 \n\t"
+ "pcmpgth $f26, $f26, $f30 \n\t"
+ "and $f4, $f4, $f24 \n\t"
+ "and $f6, $f6, $f26 \n\t"
+
+ "gslqc1 $f26, $f24, 640-576(%[tmp]) \n\t"
+ "pcmpgth $f24, $f24, $f28 \n\t"
+ "pcmpgth $f26, $f26, $f30 \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+ "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
+ "punpcklbh $f12, $f14, $f8 \n\t"
+ "punpckhbh $f14, $f14, $f8 \n\t"
+ "gssqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-512(%[tmp]) \n\t"
+ "psubh $f28, $f28, $f20 \n\t"
+ "psubh $f30, $f30, $f22 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "pcmpgth $f24, $f24, $f28 \n\t"
+ "pcmpgth $f26, $f26, $f30 \n\t"
+
+ "dli %[iAlpha], 0x1 \n\t"
+ "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
+ "and $f24, $f24, $f8 \n\t"
+ "and $f26, $f26, $f10 \n\t"
+ "gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t"
+ "psubh $f28, $f28, $f8 \n\t"
+ "psubh $f30, $f30, $f10 \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+
+ "psllh $f12, $f12, $f10 \n\t"
+ "psllh $f14, $f14, $f10 \n\t"
+ "gssqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-512(%[tmp]) \n\t"
+
+ "gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f20 \n\t"
+ "paddh $f14, $f14, $f22 \n\t"
+ "paddh $f12, $f12, $f20 \n\t"
+ "paddh $f14, $f14, $f22 \n\t"
+ "paddh $f12, $f12, $f20 \n\t"
+ "paddh $f14, $f14, $f22 \n\t"
+ "paddh $f12, $f12, $f8 \n\t"
+ "paddh $f14, $f14, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 640-560(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f8 \n\t"
+ "paddh $f14, $f14, $f10 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "pcmpgth $f24, $f24, $f28 \n\t"
+ "pcmpgth $f26, $f26, $f30 \n\t"
+ "and $f24, $f24, $f0 \n\t"
+ "and $f26, $f26, $f2 \n\t"
+ "gssqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-544(%[tmp]) \n\t"
+
+ "gslqc1 $f2, $f0, 736-272(%[tmp]) \n\t"
+ "dli %[iAlpha], 0x3 \n\t"
+ "gslqc1 $f30, $f28, 640-368(%[tmp]) \n\t"
+ "and $f24, $f0, $f16 \n\t"
+ "and $f26, $f2, $f18 \n\t"
+ "pandn $f16, $f0, $f28 \n\t"
+ "pandn $f18, $f2, $f30 \n\t"
+ "or $f24, $f24, $f16 \n\t"
+ "or $f26, $f26, $f18 \n\t"
+ "gslqc1 $f18, $f16, 640-432(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f16 \n\t"
+ "paddh $f14, $f14, $f18 \n\t"
+ "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f28 \n\t"
+ "paddh $f14, $f14, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "psrah $f12, $f12, $f28 \n\t"
+ "psrah $f14, $f14, $f28 \n\t"
+ "and $f12, $f12, $f8 \n\t"
+ "and $f14, $f14, $f10 \n\t"
+ "pandn $f8, $f8, $f20 \n\t"
+ "pandn $f10, $f10, $f22 \n\t"
+ "or $f12, $f12, $f8 \n\t"
+ "or $f14, $f14, $f10 \n\t"
+ "and $f28, $f4, $f12 \n\t"
+ "and $f30, $f6, $f14 \n\t"
+ "gslqc1 $f14, $f12, 640-64(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-336(%[tmp]) \n\t"
+ "or $f12, $f12, $f8 \n\t"
+ "or $f14, $f14, $f10 \n\t"
+ "pandn $f8, $f4, $f20 \n\t"
+ "pandn $f10, $f6, $f22 \n\t"
+ "or $f28, $f28, $f8 \n\t"
+ "or $f30, $f30, $f10 \n\t"
+
+ "dli %[iAlpha], 0x2 \n\t"
+ "and $f8, $f0, $f12 \n\t"
+ "and $f10, $f2, $f14 \n\t"
+ "gslqc1 $f14, $f12, 640-480(%[tmp]) \n\t"
+ "pandn $f12, $f0, $f12 \n\t"
+ "pandn $f14, $f2, $f14 \n\t"
+ "or $f8, $f8, $f12 \n\t"
+ "or $f10, $f10, $f14 \n\t"
+ "packushb $f24, $f24, $f26 \n\t"
+ "packushb $f26, $f28, $f30 \n\t"
+ "gssqc1 $f10, $f8, 640-336(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 656-272(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t"
+ "paddh $f8, $f20, $f8 \n\t"
+ "paddh $f10, $f22, $f10 \n\t"
+ "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f16 \n\t"
+ "paddh $f30, $f30, $f18 \n\t"
+ "paddh $f8, $f8, $f28 \n\t"
+ "paddh $f10, $f10, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f28 \n\t"
+ "paddh $f10, $f10, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "psrah $f8, $f8, $f28 \n\t"
+ "psrah $f10, $f10, $f28 \n\t"
+ "dli %[iAlpha], 0x1 \n\t"
+ "gslqc1 $f30, $f28, 640-544(%[tmp]) \n\t"
+ "and $f24, $f24, $f8 \n\t"
+ "and $f26, $f26, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t"
+ "pandn $f28, $f28, $f8 \n\t"
+ "pandn $f30, $f30, $f10 \n\t"
+ "or $f24, $f24, $f28 \n\t"
+ "or $f26, $f26, $f30 \n\t"
+ "and $f12, $f4, $f24 \n\t"
+ "and $f14, $f6, $f26 \n\t"
+ "pandn $f24, $f4, $f8 \n\t"
+ "pandn $f26, $f6, $f10 \n\t"
+ "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f28 \n\t"
+ "paddh $f10, $f10, $f30 \n\t"
+ "paddh $f8, $f8, $f16 \n\t"
+ "paddh $f10, $f10, $f18 \n\t"
+ "or $f12, $f12, $f24 \n\t"
+ "or $f14, $f14, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-336(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "packushb $f24, $f24, $f26 \n\t"
+ "packushb $f26, $f12, $f14 \n\t"
+ "psllh $f8, $f8, $f28 \n\t"
+ "psllh $f10, $f10, $f28 \n\t"
+ "gssqc1 $f26, $f24, 672-272(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-96(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-352(%[tmp]) \n\t"
+ "or $f24, $f24, $f28 \n\t"
+ "or $f26, $f26, $f30 \n\t"
+ "dli %[iAlpha], 0x3 \n\t"
+
+ "and $f12, $f0, $f24 \n\t"
+ "and $f14, $f2, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-144(%[tmp]) \n\t"
+ "pandn $f24, $f0, $f24 \n\t"
+ "pandn $f26, $f2, $f26 \n\t"
+ "or $f12, $f12, $f24 \n\t"
+ "or $f14, $f14, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
+ "gssqc1 $f14, $f12, 640-352(%[tmp]) \n\t"
+ "gslqc1 $f14, $f12, 640-464(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f28 \n\t"
+ "paddh $f14, $f14, $f30 \n\t"
+ "paddh $f8, $f8, $f12 \n\t"
+ "paddh $f10, $f10, $f14 \n\t"
+ "gslqc1 $f14, $f12, 640-448(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f8 \n\t"
+ "paddh $f22, $f22, $f10 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t"
+ "psrah $f20, $f20, $f28 \n\t"
+ "psrah $f22, $f22, $f28 \n\t"
+ "and $f24, $f24, $f20 \n\t"
+ "and $f26, $f26, $f22 \n\t"
+ "gslqc1 $f22, $f20, 640-464(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f20 \n\t"
+ "paddh $f10, $f10, $f22 \n\t"
+ "gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t"
+ "dli %[iAlpha], 0x2 \n\t"
+ "paddh $f20, $f20, $f28 \n\t"
+ "paddh $f22, $f22, $f30 \n\t"
+ "paddh $f16, $f12, $f12 \n\t"
+ "paddh $f18, $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f8 \n\t"
+ "paddh $f18, $f18, $f10 \n\t"
+ "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
+ "paddh $f16, $f16, $f28 \n\t"
+ "paddh $f18, $f18, $f30 \n\t"
+ "gslqc1 $f10, $f8, 640-544(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f28 \n\t"
+ "paddh $f14, $f14, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "psrah $f16, $f16, $f28 \n\t"
+ "psrah $f18, $f18, $f28 \n\t"
+ "pandn $f8, $f8, $f16 \n\t"
+ "pandn $f10, $f10, $f18 \n\t"
+ "or $f24, $f24, $f8 \n\t"
+ "or $f26, $f26, $f10 \n\t"
+ "and $f28, $f4, $f24 \n\t"
+ "and $f30, $f6, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-496(%[tmp]) \n\t"
+ "pandn $f8, $f4, $f24 \n\t"
+ "pandn $f10, $f6, $f26 \n\t"
+ "or $f28, $f28, $f8 \n\t"
+ "or $f30, $f30, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-352(%[tmp]) \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f28, $f30 \n\t"
+ "gssqc1 $f10, $f8, 688-272(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-128(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-288(%[tmp]) \n\t"
+ "or $f8, $f8, $f28 \n\t"
+ "or $f10, $f10, $f30 \n\t"
+ "dli %[iAlpha], 0x1 \n\t"
+
+ "and $f16, $f0, $f8 \n\t"
+ "and $f18, $f2, $f10 \n\t"
+ "paddh $f20, $f20, $f24 \n\t"
+ "paddh $f22, $f22, $f26 \n\t"
+ "gslqc1 $f30, $f28, 640-400(%[tmp]) \n\t"
+ "pandn $f8, $f0, $f28 \n\t"
+ "pandn $f10, $f2, $f30 \n\t"
+ "or $f16, $f16, $f8 \n\t"
+ "or $f18, $f18, $f10 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t"
+ "dli %[iAlpha], 0x3 \n\t"
+ "psllh $f20, $f20, $f28 \n\t"
+ "psllh $f22, $f22, $f28 \n\t"
+ "paddh $f20, $f20, $f12 \n\t"
+ "paddh $f22, $f22, $f14 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "gslqc1 $f14, $f12, 640-560(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f20 \n\t"
+ "paddh $f10, $f10, $f22 \n\t"
+ "psrah $f8, $f8, $f28 \n\t"
+ "psrah $f10, $f10, $f28 \n\t"
+ "gssqc1 $f18, $f16, 640-288(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 640-560(%[tmp]) \n\t"
+ "and $f16, $f16, $f8 \n\t"
+ "and $f18, $f18, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-464(%[tmp]) \n\t"
+ "paddh $f20, $f8, $f8 \n\t"
+ "paddh $f22, $f10, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-432(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-448(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f28 \n\t"
+ "paddh $f10, $f10, $f30 \n\t"
+ "dli %[iAlpha], 0x2 \n\t"
+ "paddh $f20, $f20, $f8 \n\t"
+ "paddh $f22, $f22, $f10 \n\t"
+ "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f28 \n\t"
+ "paddh $f22, $f22, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "gslqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
+ "psrah $f20, $f20, $f28 \n\t"
+ "psrah $f22, $f22, $f28 \n\t"
+ "pandn $f12, $f12, $f20 \n\t"
+ "pandn $f14, $f14, $f22 \n\t"
+ "or $f16, $f16, $f12 \n\t"
+ "or $f18, $f18, $f14 \n\t"
+ "gslqc1 $f14, $f12, 640-32(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-304(%[tmp]) \n\t"
+ "or $f12, $f12, $f28 \n\t"
+ "or $f14, $f14, $f30 \n\t"
+ "and $f28, $f4, $f16 \n\t"
+ "and $f30, $f6, $f18 \n\t"
+ "gslqc1 $f18, $f16, 640-432(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 640-464(%[tmp]) \n\t"
+ "pandn $f8, $f4, $f16 \n\t"
+ "pandn $f10, $f6, $f18 \n\t"
+ "or $f28, $f28, $f8 \n\t"
+ "or $f30, $f30, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t"
+ "paddh $f16, $f16, $f8 \n\t"
+ "paddh $f18, $f18, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-288(%[tmp]) \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f28, $f30 \n\t"
+ "dli %[iAlpha], 0x2 \n\t"
+ "gssqc1 $f10, $f8, 704-272(%[tmp]) \n\t"
+
+ "and $f8, $f0, $f12 \n\t"
+ "and $f10, $f2, $f14 \n\t"
+ "gslqc1 $f30, $f28, 640-384(%[tmp]) \n\t"
+ "pandn $f12, $f0, $f28 \n\t"
+ "pandn $f14, $f2, $f30 \n\t"
+ "or $f8, $f8, $f12 \n\t"
+ "or $f10, $f10, $f14 \n\t"
+ "gssqc1 $f10, $f8, 640-304(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-464(%[tmp]) \n\t"
+ "paddh $f12, $f8, $f28 \n\t"
+ "paddh $f14, $f10, $f30 \n\t"
+ "paddh $f12, $f12, $f16 \n\t"
+ "paddh $f14, $f14, $f18 \n\t"
+ "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f28 \n\t"
+ "paddh $f14, $f14, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "psrah $f12, $f12, $f28 \n\t"
+ "psrah $f14, $f14, $f28 \n\t"
+ "and $f24, $f24, $f12 \n\t"
+ "and $f26, $f26, $f14 \n\t"
+ "gslqc1 $f14, $f12, 640-560(%[tmp]) \n\t"
+ "pandn $f16, $f12, $f20 \n\t"
+ "pandn $f18, $f14, $f22 \n\t"
+ "or $f24, $f24, $f16 \n\t"
+ "or $f26, $f26, $f18 \n\t"
+ "and $f28, $f4, $f24 \n\t"
+ "and $f30, $f6, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-304(%[tmp]) \n\t"
+ "pandn $f16, $f4, $f20 \n\t"
+ "pandn $f18, $f6, $f22 \n\t"
+ "or $f28, $f28, $f16 \n\t"
+ "or $f30, $f30, $f18 \n\t"
+ "dli %[iAlpha], 0x1 \n\t"
+
+ "packushb $f24, $f24, $f26 \n\t"
+ "packushb $f26, $f28, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-112(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 640-80(%[tmp]) \n\t"
+ "or $f28, $f28, $f16 \n\t"
+ "or $f30, $f30, $f18 \n\t"
+ "and $f16, $f0, $f28 \n\t"
+ "and $f18, $f2, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
+ "pandn $f0, $f0, $f28 \n\t"
+ "pandn $f2, $f2, $f30 \n\t"
+ "or $f16, $f16, $f0 \n\t"
+ "or $f18, $f18, $f2 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "xor $f30, $f30, $f30 \n\t"
+ "gslqc1 $f2, $f0, 0x0($12) \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "punpcklbh $f0, $f2, $f30 \n\t"
+ "punpckhbh $f2, $f2, $f30 \n\t"
+ "psllh $f0, $f0, $f28 \n\t"
+ "psllh $f2, $f2, $f28 \n\t"
+ "paddh $f0, $f0, $f8 \n\t"
+ "paddh $f2, $f2, $f10 \n\t"
+ "paddh $f0, $f0, $f8 \n\t"
+ "paddh $f2, $f2, $f10 \n\t"
+ "paddh $f0, $f0, $f8 \n\t"
+ "paddh $f2, $f2, $f10 \n\t"
+ "paddh $f0, $f0, $f20 \n\t"
+ "paddh $f2, $f2, $f22 \n\t"
+ "dli %[iAlpha], 0x3 \n\t"
+ "gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t"
+ "paddh $f0, $f0, $f28 \n\t"
+ "paddh $f2, $f2, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
+ "paddh $f0, $f0, $f28 \n\t"
+ "paddh $f2, $f2, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
+ "paddh $f0, $f0, $f28 \n\t"
+ "paddh $f2, $f2, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "psrah $f0, $f0, $f28 \n\t"
+ "psrah $f2, $f2, $f28 \n\t"
+ "and $f0, $f0, $f12 \n\t"
+ "and $f2, $f2, $f14 \n\t"
+ "pandn $f12, $f12, $f8 \n\t"
+ "pandn $f14, $f14, $f10 \n\t"
+ "or $f0, $f0, $f12 \n\t"
+ "or $f2, $f2, $f14 \n\t"
+ "and $f28, $f4, $f0 \n\t"
+ "and $f30, $f6, $f2 \n\t"
+
+ "gslqc1 $f2, $f0, 656-272(%[tmp]) \n\t"
+ "gssqc1 $f2, $f0, 0x0($11) \n\t"
+
+ "gslqc1 $f2, $f0, 672-272(%[tmp]) \n\t"
+
+ "gssqc1 $f2, $f0, 0x0($8) \n\t"
+ "gslqc1 $f2, $f0, 688-272(%[tmp]) \n\t"
+ "gssqc1 $f2, $f0, 0x0($9) \n\t"
+ "gslqc1 $f2, $f0, 704-272(%[tmp]) \n\t"
+
+ "pandn $f4, $f4, $f8 \n\t"
+ "pandn $f6, $f6, $f10 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPix]) \n\t"
+ "or $f28, $f28, $f4 \n\t"
+ "or $f30, $f30, $f6 \n\t"
+ "packushb $f16, $f16, $f18 \n\t"
+ "packushb $f18, $f28, $f30 \n\t"
+ "gssqc1 $f26, $f24, 0x0($13) \n\t"
+ "gssqc1 $f18, $f16, 0x0(%[iStride]) \n\t"
+ : [pPix]"+&r"((unsigned char *)pPix)
+ : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
+ [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0",
+ "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+ "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
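+/* Normal (bS < 4) chroma deblocking across a horizontal edge. Cb and Cr are
+ * unpacked side by side into one set of 16-bit lanes so both planes are
+ * filtered in a single pass; pTC supplies the four per-pair clipping
+ * thresholds. Scalar reference (H.264 clause 8.7, chroma case):
+ *   delta = clip3(-tc, tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3);
+ *   p0' = clip_byte(p0 + delta);  q0' = clip_byte(q0 - delta); */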
+void DeblockChromaLt4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+ int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
+ unsigned char tmp[256] __attribute__((aligned(32)));
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "lb $8, 0x2(%[pTC]) \n\t"
+ "lb $9, 0x3(%[pTC]) \n\t"
+ "move $11, $8 \n\t"
+ "lb $8, 0x1(%[pTC]) \n\t"
+ "lb %[pTC], 0x0(%[pTC]) \n\t"
+ "move $12, %[pTC] \n\t"
+ "and %[pTC], $9, 0xFFFF \n\t"
+ "dmtc1 %[pTC], $f4 \n\t"
+ "and %[pTC], $9, 0xFFFF \n\t"
+ "dmtc1 %[pTC], $f8 \n\t"
+ "move %[pTC], $11 \n\t"
+ "and $9, %[pTC], 0xFFFF \n\t"
+ "and %[pTC], %[pTC], 0xFFFF \n\t"
+ "dmtc1 %[pTC], $f16 \n\t"
+ "and %[pTC], $8, 0xFFFF \n\t"
+ "dmtc1 %[pTC], $f20 \n\t"
+ "dmtc1 $9, $f12 \n\t"
+ "and %[pTC], $8, 0xFFFF \n\t"
+ "dmtc1 %[pTC], $f24 \n\t"
+ "move %[pTC], $12 \n\t"
+ "and $9, %[pTC], 0xFFFF \n\t"
+ "and %[pTC], %[pTC], 0xFFFF \n\t"
+ "punpcklhw $f24, $f24, $f8 \n\t"
+ "xor $f0, $f0, $f0 \n\t"
+ "xor $f2, $f2, $f2 \n\t"
+ "gssqc1 $f2, $f0, 0x40(%[tmp]) \n\t"
+ "dmtc1 $9, $f28 \n\t"
+ "dmtc1 %[pTC], $f0 \n\t"
+ "daddu %[pTC], %[iStride], %[iStride] \n\t"
+ "dsubu $9, %[pPixCb], %[pTC] \n\t"
+ "punpcklhw $f20, $f20, $f4 \n\t"
+ "gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
+ "punpcklhw $f0, $f0, $f16 \n\t"
+ "gsldxc1 $f16, 0x0(%[iStride], %[pPixCr]) \n\t"
+ "punpcklhw $f28, $f28, $f12 \n\t"
+ "gsldxc1 $f12, 0x0(%[pPixCb], $0) \n\t"
+ "punpcklhw $f0, $f0, $f24 \n\t"
+ "gsldxc1 $f24, 0x0($9, $0) \n\t"
+ "punpcklhw $f28, $f28, $f20 \n\t"
+ "punpckhhw $f2, $f0, $f28 \n\t"
+ "punpcklhw $f0, $f0, $f28 \n\t"
+ "dsubu $9, %[pPixCr], %[pTC] \n\t"
+ "psubh $f8, $f4, $f0 \n\t"
+ "psubh $f10, $f6, $f2 \n\t"
+ "gssqc1 $f10, $f8, 0x60(%[tmp]) \n\t"
+ "gsldxc1 $f8, 0x0($9, $0) \n\t"
+ "mov.d $f26, $f8 \n\t"
+ "dsubu %[pTC], %[pPixCb], %[iStride] \n\t"
+ "gsldxc1 $f28, 0x0(%[pTC], $0) \n\t"
+ "dsubu $9, %[pPixCr], %[iStride] \n\t"
+ "gsldxc1 $f8, 0x0($9, $0) \n\t"
+ "mov.d $f30, $f8 \n\t"
+ "gsldxc1 $f8, 0x0(%[pPixCr], $0) \n\t"
+ "mov.d $f14, $f8 \n\t"
+ "gsldxc1 $f8, 0x0(%[iStride], %[pPixCb]) \n\t"
+ "mov.d $f10, $f16 \n\t"
+ "gssqc1 $f10, $f8, 0xE0(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f8 \n\t"
+ "punpcklhw $f16, $f8, $f8 \n\t"
+ "dmtc1 %[iBeta], $f8 \n\t"
+ "punpcklhw $f20, $f8, $f8 \n\t"
+ "punpcklwd $f8, $f20, $f20 \n\t"
+ "mov.d $f10, $f8 \n\t"
+ "gssqc1 $f10, $f8, 0x50(%[tmp]) \n\t"
+ "punpckhbh $f10, $f24, $f4 \n\t"
+ "punpcklbh $f8, $f24, $f4 \n\t"
+ "gssqc1 $f14, $f12, 0xd0(%[tmp]) \n\t"
+ "punpcklwd $f16, $f16, $f16 \n\t"
+ "mov.d $f18, $f16 \n\t"
+ "gssqc1 $f10, $f8, 0x30(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+ "gssqc1 $f26, $f24, 0x80(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0xd0(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+ "gssqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0xe0(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+ "gssqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 0xe0(%[tmp]) \n\t"
+ "mov.d $f8, $f28 \n\t"
+ "mov.d $f10, $f30 \n\t"
+ "punpcklbh $f28, $f30, $f6 \n\t"
+ "punpckhbh $f30, $f30, $f6 \n\t"
+ "punpckhbh $f22, $f20, $f4 \n\t"
+ "punpcklbh $f20, $f20, $f4 \n\t"
+ "gssqc1 $f30, $f28, 0xa0(%[tmp]) \n\t"
+ "punpckhbh $f14, $f12, $f4 \n\t"
+ "punpcklbh $f12, $f12, $f4 \n\t"
+ "dli %[iBeta], 0x4 \n\t"
+ "punpckhbh $f10, $f8, $f4 \n\t"
+ "punpcklbh $f8, $f8, $f4 \n\t"
+ "dmtc1 %[iBeta], $f24 \n\t"
+ "punpcklhw $f28, $f24, $f24 \n\t"
+ "punpcklwd $f24, $f28, $f28 \n\t"
+ "mov.d $f26, $f24 \n\t"
+ "gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
+ "psubh $f28, $f28, $f20 \n\t"
+ "psubh $f30, $f30, $f22 \n\t"
+ "pcmpgth $f24, $f0, $f4 \n\t"
+ "pcmpgth $f26, $f2, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x60(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 0x40(%[tmp]) \n\t"
+ "psubh $f24, $f12, $f8 \n\t"
+ "psubh $f26, $f14, $f10 \n\t"
+ "dmfc1 %[iAlpha], $f12 \n\t"
+ "dmfc1 %[iBeta], $f14 \n\t"
+ "dli $10, 0x2 \n\t"
+ "dmtc1 $10, $f12 \n\t"
+ "dli $10, 0x3 \n\t"
+ "dmtc1 $10, $f14 \n\t"
+ "psllh $f24, $f24, $f12 \n\t"
+ "psllh $f26, $f26, $f12 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x20(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "psrah $f24, $f24, $f14 \n\t"
+ "psrah $f26, $f26, $f14 \n\t"
+ "dmtc1 %[iAlpha], $f12 \n\t"
+ "dmtc1 %[iBeta], $f14 \n\t"
+ "pmaxsh $f4, $f4, $f24 \n\t"
+ "pmaxsh $f6, $f6, $f26 \n\t"
+ "gssqc1 $f2, $f0, 0x10(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x10(%[tmp]) \n\t"
+ "pminsh $f24, $f24, $f4 \n\t"
+ "pminsh $f26, $f26, $f6 \n\t"
+ "gssqc1 $f26, $f24, 0x10(%[tmp]) \n\t"
+ "psubh $f4, $f8, $f12 \n\t"
+ "psubh $f6, $f10, $f14 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f24, $f26)
+ "pcmpgth $f24, $f16, $f4 \n\t"
+ "pcmpgth $f26, $f18, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x30(%[tmp]) \n\t"
+ "psubh $f4, $f4, $f8 \n\t"
+ "psubh $f6, $f6, $f10 \n\t"
+ "dmfc1 %[iAlpha], $f8 \n\t"
+ "dmfc1 %[iBeta], $f10 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f8, $f10)
+ "pcmpgth $f28, $f28, $f4 \n\t"
+ "pcmpgth $f30, $f30, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x50(%[tmp]) \n\t"
+ "and $f24, $f24, $f28 \n\t"
+ "and $f26, $f26, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "psubh $f20, $f20, $f12 \n\t"
+ "psubh $f22, $f22, $f14 \n\t"
+ WELS_AbsH($f20, $f22, $f20, $f22, $f8, $f10)
+ "pcmpgth $f4, $f4, $f20 \n\t"
+ "pcmpgth $f6, $f6, $f22 \n\t"
+ "gslqc1 $f22, $f20, 0x80(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 0x90(%[tmp]) \n\t"
+ "psubh $f20, $f20, $f8 \n\t"
+ "psubh $f22, $f22, $f10 \n\t"
+ "and $f24, $f24, $f4 \n\t"
+ "and $f26, $f26, $f6 \n\t"
+ "gslqc1 $f10, $f8, 0x40(%[tmp]) \n\t"
+ "and $f24, $f24, $f8 \n\t"
+ "and $f26, $f26, $f10 \n\t"
+ "gslqc1 $f6, $f4, 0x10(%[tmp]) \n\t"
+ "and $f4, $f4, $f24 \n\t"
+ "and $f6, $f6, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
+ "gssqc1 $f6, $f4, 0x30(%[tmp]) \n\t"
+ "gslqc1 $f6, $f4, 0xa0(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f4 \n\t"
+ "psubh $f26, $f26, $f6 \n\t"
+ "dli $10, 0x2 \n\t"
+ "dmtc1 $10, $f8 \n\t"
+ "psllh $f24, $f24, $f8 \n\t"
+ "psllh $f26, $f26, $f8 \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "dli $10, 0x3 \n\t"
+ "gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+ "dmtc1 $10, $f8 \n\t"
+ "gslqc1 $f22, $f20, 0x60(%[tmp]) \n\t"
+ "psrah $f24, $f24, $f8 \n\t"
+ "psrah $f26, $f26, $f8 \n\t"
+ "pmaxsh $f20, $f20, $f24 \n\t"
+ "pmaxsh $f22, $f22, $f26 \n\t"
+ "pminsh $f0, $f0, $f20 \n\t"
+ "pminsh $f2, $f2, $f22 \n\t"
+ "gslqc1 $f22, $f20, 0x70(%[tmp]) \n\t"
+ "psubh $f24, $f4, $f20 \n\t"
+ "psubh $f26, $f6, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
+ "pcmpgth $f16, $f16, $f24 \n\t"
+ "pcmpgth $f18, $f18, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x80(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f4 \n\t"
+ "psubh $f26, $f26, $f6 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
+ "pcmpgth $f28, $f28, $f24 \n\t"
+ "pcmpgth $f30, $f30, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
+ "and $f16, $f16, $f28 \n\t"
+ "and $f18, $f18, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f20 \n\t"
+ "psubh $f26, $f26, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
+ "dmtc1 %[iAlpha], $f8 \n\t"
+ "dmtc1 %[iBeta], $f10 \n\t"
+ "pcmpgth $f28, $f28, $f24 \n\t"
+ "pcmpgth $f30, $f30, $f26 \n\t"
+ "and $f16, $f16, $f28 \n\t"
+ "and $f18, $f18, $f30 \n\t"
+ "gslqc1 $f26, $f24, 0x40(%[tmp]) \n\t"
+ "and $f16, $f16, $f24 \n\t"
+ "and $f18, $f18, $f26 \n\t"
+ "and $f0, $f0, $f16 \n\t"
+ "and $f2, $f2, $f18 \n\t"
+ "gslqc1 $f18, $f16, 0x30(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f16 \n\t"
+ "paddh $f10, $f10, $f18 \n\t"
+ "paddh $f4, $f4, $f0 \n\t"
+ "paddh $f6, $f6, $f2 \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f4, $f6 \n\t"
+ "gssdxc1 $f8, 0x0(%[pTC], $0) \n\t"
+ "psubh $f12, $f12, $f16 \n\t"
+ "psubh $f14, $f14, $f18 \n\t"
+ "psubh $f20, $f20, $f0 \n\t"
+ "psubh $f22, $f22, $f2 \n\t"
+ "packushb $f12, $f12, $f14 \n\t"
+ "packushb $f14, $f20, $f22 \n\t"
+ "gssdxc1 $f12, 0x0(%[pPixCb], $0) \n\t"
+ "gssdxc1 $f10, 0x0($9, $0) \n\t"
+ "gssdxc1 $f14, 0x0(%[pPixCr], $0) \n\t"
+ : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
+ : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta),
+ [pTC]"r"((unsigned char *)pTC), [tmp]"r"((unsigned char *)tmp)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
+ "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
+ "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
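+/* Strong (bS == 4) chroma deblocking across a horizontal edge, again with
+ * Cb and Cr filtered together. Scalar reference for the chroma strong
+ * filter:
+ *   p0' = (2*p1 + p0 + q1 + 2) >> 2;
+ *   q0' = (2*q1 + q0 + p1 + 2) >> 2; */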
+void DeblockChromaEq4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+ int32_t iAlpha, int32_t iBeta) {
+ unsigned char tmp[128] __attribute__((aligned(32)));
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "daddu $8, %[iStride], %[iStride] \n\t"
+ "dsubu $9, %[pPixCb], $8 \n\t"
+ "gsldxc1 $f16, 0x0(%[pPixCr], $0) \n\t"
+ "gsldxc1 $f20, 0x0(%[iStride], %[pPixCr]) \n\t"
+ "gsldxc1 $f4, 0x0($9, $0) \n\t"
+ "dsubu $9, %[pPixCr], $8 \n\t"
+ "gsldxc1 $f8, 0x0($9, $0) \n\t"
+ "mov.d $f6, $f8 \n\t"
+ "dsubu $8, %[pPixCb], %[iStride] \n\t"
+ "gsldxc1 $f8, 0x0($8, $0) \n\t"
+ "dsubu $9, %[pPixCr], %[iStride] \n\t"
+ "gsldxc1 $f12, 0x0($9, $0) \n\t"
+ "mov.d $f10, $f12 \n\t"
+ "gsldxc1 $f12, 0x0(%[pPixCb], $0) \n\t"
+ "mov.d $f14, $f16 \n\t"
+ "gsldxc1 $f16, 0x0(%[iStride], %[pPixCb]) \n\t"
+ "mov.d $f18, $f20 \n\t"
+ "dmtc1 %[iAlpha], $f20 \n\t"
+ "xor $f0, $f0, $f0 \n\t"
+ "xor $f2, $f2, $f2 \n\t"
+ "punpcklhw $f24, $f20, $f20 \n\t"
+ "punpcklwd $f20, $f24, $f24 \n\t"
+ "mov.d $f22, $f20 \n\t"
+ "dmtc1 %[iBeta], $f24 \n\t"
+ "punpcklhw $f28, $f24, $f24 \n\t"
+ "punpcklwd $f24, $f28, $f28 \n\t"
+ "mov.d $f26, $f24 \n\t"
+ "mov.d $f28, $f4 \n\t"
+ "punpcklbh $f4, $f6, $f2 \n\t"
+ "punpckhbh $f6, $f6, $f2 \n\t"
+ "punpckhbh $f30, $f28, $f0 \n\t"
+ "punpcklbh $f28, $f28, $f0 \n\t"
+ "gssqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
+ "gssqc1 $f30, $f28, 0x60(%[tmp]) \n\t"
+ "punpckhbh $f30, $f8, $f0 \n\t"
+ "punpcklbh $f28, $f8, $f0 \n\t"
+ "gssqc1 $f30, $f28, 0x10(%[tmp]) \n\t"
+ "punpckhbh $f30, $f12, $f0 \n\t"
+ "punpcklbh $f28, $f12, $f0 \n\t"
+ "punpcklbh $f12, $f14, $f2 \n\t"
+ "punpckhbh $f14, $f14, $f2 \n\t"
+ "gssqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "mov.d $f28, $f16 \n\t"
+ "punpcklbh $f16, $f18, $f2 \n\t"
+ "punpckhbh $f18, $f18, $f2 \n\t"
+ "punpcklbh $f8, $f10, $f2 \n\t"
+ "punpckhbh $f10, $f10, $f2 \n\t"
+ "punpckhbh $f30, $f28, $f0 \n\t"
+ "punpcklbh $f28, $f28, $f0 \n\t"
+ "gssqc1 $f14, $f12, 0x30(%[tmp]) \n\t"
+ "gslqc1 $f14, $f12, 0x10(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 0x50(%[tmp]) \n\t"
+ "psubh $f4, $f12, $f0 \n\t"
+ "psubh $f6, $f14, $f2 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
+ "gssqc1 $f18, $f16, 0x20(%[tmp]) \n\t"
+ "pcmpgth $f0, $f20, $f4 \n\t"
+ "pcmpgth $f2, $f22, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x60(%[tmp]) \n\t"
+ "psubh $f4, $f4, $f12 \n\t"
+ "psubh $f6, $f6, $f14 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
+ "pcmpgth $f16, $f24, $f4 \n\t"
+ "pcmpgth $f18, $f26, $f6 \n\t"
+ "and $f0, $f0, $f16 \n\t"
+ "and $f2, $f2, $f18 \n\t"
+ "gslqc1 $f18, $f16, 0x50(%[tmp]) \n\t"
+ "psubh $f4, $f28, $f16 \n\t"
+ "psubh $f6, $f30, $f18 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
+ "pcmpgth $f16, $f24, $f4 \n\t"
+ "pcmpgth $f18, $f26, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x30(%[tmp]) \n\t"
+ "psubh $f4, $f8, $f4 \n\t"
+ "psubh $f6, $f10, $f6 \n\t"
+ "dmfc1 %[iAlpha], $f28 \n\t"
+ "dmfc1 %[iBeta], $f30 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
+ "pcmpgth $f20, $f20, $f4 \n\t"
+ "pcmpgth $f22, $f22, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
+ "and $f0, $f0, $f16 \n\t"
+ "and $f2, $f2, $f18 \n\t"
+ "psubh $f4, $f4, $f8 \n\t"
+ "psubh $f6, $f6, $f10 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
+ "pcmpgth $f16, $f24, $f4 \n\t"
+ "pcmpgth $f18, $f26, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x20(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t"
+ "psubh $f4, $f4, $f28 \n\t"
+ "psubh $f6, $f6, $f30 \n\t"
+ "and $f20, $f20, $f16 \n\t"
+ "and $f22, $f22, $f18 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "dmtc1 %[iBeta], $f30 \n\t"
+ "pcmpgth $f24, $f24, $f4 \n\t"
+ "pcmpgth $f26, $f26, $f6 \n\t"
+ "and $f20, $f20, $f24 \n\t"
+ "and $f22, $f22, $f26 \n\t"
+ "dli %[iBeta], 0x2 \n\t"
+ "dmtc1 %[iBeta], $f4 \n\t"
+ "punpcklhw $f16, $f4, $f4 \n\t"
+ "punpcklwd $f4, $f16, $f16 \n\t"
+ "mov.d $f6, $f4 \n\t"
+ "gslqc1 $f18, $f16, 0x60(%[tmp]) \n\t"
+ "paddh $f24, $f16, $f16 \n\t"
+ "paddh $f26, $f18, $f18 \n\t"
+ "paddh $f24, $f24, $f12 \n\t"
+ "paddh $f26, $f26, $f14 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "gssqc1 $f6, $f4, 0x10(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 0x10(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f16 \n\t"
+ "paddh $f26, $f26, $f18 \n\t"
+ "dmtc1 %[iBeta], $f16 \n\t"
+ "psrah $f24, $f24, $f16 \n\t"
+ "psrah $f26, $f26, $f16 \n\t"
+ "pandn $f16, $f0, $f12 \n\t"
+ "pandn $f18, $f2, $f14 \n\t"
+ "gslqc1 $f14, $f12, 0x40(%[tmp]) \n\t"
+ "and $f4, $f0, $f24 \n\t"
+ "and $f6, $f2, $f26 \n\t"
+ "or $f4, $f4, $f16 \n\t"
+ "or $f6, $f6, $f18 \n\t"
+ "paddh $f24, $f12, $f12 \n\t"
+ "paddh $f26, $f14, $f14 \n\t"
+ "gslqc1 $f14, $f12, 0x10(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+ "gslqc1 $f18, $f16, 0x20(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f16 \n\t"
+ "paddh $f26, $f26, $f18 \n\t"
+ "dmtc1 %[iBeta], $f16 \n\t"
+ "paddh $f24, $f24, $f12 \n\t"
+ "paddh $f26, $f26, $f14 \n\t"
+ "psrah $f24, $f24, $f16 \n\t"
+ "psrah $f26, $f26, $f16 \n\t"
+ "and $f16, $f20, $f24 \n\t"
+ "and $f18, $f22, $f26 \n\t"
+ "pandn $f24, $f20, $f8 \n\t"
+ "pandn $f26, $f22, $f10 \n\t"
+ "or $f16, $f16, $f24 \n\t"
+ "or $f18, $f18, $f26 \n\t"
+ "packushb $f4, $f4, $f6 \n\t"
+ "packushb $f6, $f16, $f18 \n\t"
+ "gslqc1 $f18, $f16, 0x50(%[tmp]) \n\t"
+ "paddh $f24, $f28, $f28 \n\t"
+ "paddh $f26, $f30, $f30 \n\t"
+ "paddh $f24, $f24, $f16 \n\t"
+ "paddh $f26, $f26, $f18 \n\t"
+ "gslqc1 $f10, $f8, 0x60(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+ "dmtc1 %[iBeta], $f28 \n\t"
+ "paddh $f24, $f24, $f12 \n\t"
+ "paddh $f26, $f26, $f14 \n\t"
+ "psrah $f24, $f24, $f28 \n\t"
+ "psrah $f26, $f26, $f28 \n\t"
+ "and $f8, $f0, $f24 \n\t"
+ "and $f10, $f2, $f26 \n\t"
+ "pandn $f0, $f0, $f16 \n\t"
+ "pandn $f2, $f2, $f18 \n\t"
+ "or $f8, $f8, $f0 \n\t"
+ "or $f10, $f10, $f2 \n\t"
+ "gslqc1 $f2, $f0, 0x20(%[tmp]) \n\t"
+ "paddh $f24, $f0, $f0 \n\t"
+ "paddh $f26, $f2, $f2 \n\t"
+ "gslqc1 $f2, $f0, 0x30(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f0 \n\t"
+ "paddh $f26, $f26, $f2 \n\t"
+ "gslqc1 $f18, $f16, 0x40(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f16 \n\t"
+ "paddh $f26, $f26, $f18 \n\t"
+ "paddh $f24, $f24, $f12 \n\t"
+ "paddh $f26, $f26, $f14 \n\t"
+ "gssdxc1 $f4, 0x0($8, $0) \n\t"
+ "psrah $f24, $f24, $f28 \n\t"
+ "psrah $f26, $f26, $f28 \n\t"
+ "and $f16, $f20, $f24 \n\t"
+ "and $f18, $f22, $f26 \n\t"
+ "pandn $f20, $f20, $f0 \n\t"
+ "pandn $f22, $f22, $f2 \n\t"
+ "or $f16, $f16, $f20 \n\t"
+ "or $f18, $f18, $f22 \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f16, $f18 \n\t"
+ "gssdxc1 $f8, 0x0(%[pPixCb], $0) \n\t"
+ "gssdxc1 $f6, 0x0($9, $0) \n\t"
+ "gssdxc1 $f10, 0x0(%[pPixCr], $0) \n\t"
+ : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
+ : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
+ [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
+ "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
+ "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
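+/* Vertical-edge variant of the strong chroma filter. The prologue below
+ * loads 4-byte groups (p1 p0 q0 q1) from eight rows of Cb and of Cr,
+ * interleaves them Cb-low/Cr-high, and transposes into tmp[] so the
+ * horizontal-edge arithmetic can be applied; the epilogue transposes back,
+ * with dsrl by 0x20 splitting each 64-bit result into its Cb (low word)
+ * and Cr (high word) halves before the 4-byte gsswlc1/gsswrc1 stores. */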
+void DeblockChromaEq4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+ int32_t iAlpha, int32_t iBeta) {
+ unsigned char tmp[256] __attribute__((aligned(32)));
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "daddiu %[pPixCb], %[pPixCb], -0x2 \n\t"
+ "daddiu %[pPixCr], %[pPixCr], -0x2 \n\t"
+ "move $9, %[pPixCb] \n\t"
+ "move $10, %[pPixCr] \n\t"
+ "dsll $11, %[iStride], 0x2 \n\t"
+ "daddu %[pPixCb], %[pPixCb], $11 \n\t"
+ "daddu %[pPixCr], %[pPixCr], $11 \n\t"
+ "daddiu $11, %[tmp], 0x80 \n\t"
+ "gsldlc1 $f0, 0x7($9) \n\t"
+ "gsldrc1 $f0, 0x0($9) \n\t"
+ "daddu $12, $9, %[iStride] \n\t"
+ "gsldlc1 $f4, 0x7($12) \n\t"
+ "gsldrc1 $f4, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsldlc1 $f8, 0x7($12) \n\t"
+ "gsldrc1 $f8, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsldlc1 $f12, 0x7($12) \n\t"
+ "gsldlc1 $f16, 0x7($10) \n\t"
+ "gsldrc1 $f12, 0x0($12) \n\t"
+ "gsldrc1 $f16, 0x0($10) \n\t"
+ "daddu $12, $10, %[iStride] \n\t"
+ "gsldlc1 $f20, 0x7($12) \n\t"
+ "gsldrc1 $f20, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsldlc1 $f24, 0x7($12) \n\t"
+ "gsldrc1 $f24, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsldlc1 $f28, 0x7($12) \n\t"
+ "gsldrc1 $f28, 0x0($12) \n\t"
+ "punpcklwd $f0, $f0, $f16 \n\t"
+ "punpcklwd $f4, $f4, $f20 \n\t"
+ "punpcklwd $f8, $f8, $f24 \n\t"
+ "punpcklwd $f12, $f12, $f28 \n\t"
+ "gsldlc1 $f16, 0x7(%[pPixCb]) \n\t"
+ "gsldlc1 $f20, 0x7(%[pPixCr]) \n\t"
+ "gsldrc1 $f16, 0x0(%[pPixCb]) \n\t"
+ "gsldrc1 $f20, 0x0(%[pPixCr]) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f2, $f16 \n\t"
+ "daddu $12, %[pPixCb], %[iStride] \n\t"
+ "daddu $13, %[pPixCr], %[iStride] \n\t"
+ "gsldlc1 $f16, 0x7($12) \n\t"
+ "gsldlc1 $f20, 0x7($13) \n\t"
+ "gsldrc1 $f16, 0x0($12) \n\t"
+ "gsldrc1 $f20, 0x0($13) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f6, $f16 \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "daddu $13, $13, %[iStride] \n\t"
+ "gsldlc1 $f16, 0x7($12) \n\t"
+ "gsldlc1 $f20, 0x7($13) \n\t"
+ "gsldrc1 $f16, 0x0($12) \n\t"
+ "gsldrc1 $f20, 0x0($13) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f10, $f16 \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "daddu $13, $13, %[iStride] \n\t"
+ "gsldlc1 $f16, 0x7($12) \n\t"
+ "gsldlc1 $f20, 0x7($13) \n\t"
+ "gsldrc1 $f16, 0x0($12) \n\t"
+ "gsldrc1 $f20, 0x0($13) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f14, $f16 \n\t"
+ "punpcklbh $f24, $f2, $f6 \n\t"
+ "punpckhbh $f26, $f2, $f6 \n\t"
+ "punpckhbh $f2, $f0, $f4 \n\t"
+ "punpcklbh $f0, $f0, $f4 \n\t"
+ "punpcklbh $f28, $f10, $f14 \n\t"
+ "punpckhbh $f30, $f10, $f14 \n\t"
+ "punpckhbh $f10, $f8, $f12 \n\t"
+ "punpcklbh $f8, $f8, $f12 \n\t"
+ "punpcklhw $f16, $f2, $f10 \n\t"
+ "punpckhhw $f18, $f2, $f10 \n\t"
+ "punpckhhw $f2, $f0, $f8 \n\t"
+ "punpcklhw $f0, $f0, $f8 \n\t"
+ "punpcklhw $f20, $f26, $f30 \n\t"
+ "punpckhhw $f22, $f26, $f30 \n\t"
+ "punpckhhw $f26, $f24, $f28 \n\t"
+ "punpcklhw $f24, $f24, $f28 \n\t"
+ "punpcklwd $f4, $f2, $f26 \n\t"
+ "punpckhwd $f6, $f2, $f26 \n\t"
+ "punpckhwd $f2, $f0, $f24 \n\t"
+ "punpcklwd $f0, $f0, $f24 \n\t"
+ "punpcklwd $f8, $f18, $f22 \n\t"
+ "punpckhwd $f10, $f18, $f22 \n\t"
+ "punpckhwd $f18, $f16, $f20 \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f20, $f2 \n\t"
+ "mov.d $f22, $f18 \n\t"
+ "mov.d $f2, $f16 \n\t"
+ "mov.d $f24, $f6 \n\t"
+ "mov.d $f26, $f10 \n\t"
+ "mov.d $f6, $f8 \n\t"
+ "gssqc1 $f2, $f0, 0x0($11) \n\t"
+ "gssqc1 $f22, $f20, 0x10($11) \n\t"
+ "gssqc1 $f6, $f4, 0x20($11) \n\t"
+ "gssqc1 $f26, $f24, 0x30($11) \n\t"
+ "gslqc1 $f26, $f24, 0x80(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 0x90(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 0xa0(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 0xb0(%[tmp]) \n\t"
+ "xor $f0, $f0, $f0 \n\t"
+ "dmtc1 %[iAlpha], $f4 \n\t"
+ "punpcklhw $f8, $f4, $f4 \n\t"
+ "punpcklwd $f4, $f8, $f8 \n\t"
+ "mov.d $f6, $f4 \n\t"
+ "dmtc1 %[iBeta], $f8 \n\t"
+ "punpcklhw $f12, $f8, $f8 \n\t"
+ "punpcklwd $f8, $f12, $f12 \n\t"
+ "mov.d $f10, $f8 \n\t"
+ "mov.d $f12, $f24 \n\t"
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "gssqc1 $f26, $f24, 0x60(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "gssqc1 $f26, $f24, 0x30(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0xa0(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "gssqc1 $f26, $f24, 0x40(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0xb0(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "gssqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
+ "punpckhbh $f30, $f28, $f0 \n\t"
+ "punpcklbh $f28, $f28, $f0 \n\t"
+ "punpckhbh $f18, $f16, $f0 \n\t"
+ "punpcklbh $f16, $f16, $f0 \n\t"
+ "punpckhbh $f22, $f20, $f0 \n\t"
+ "punpcklbh $f20, $f20, $f0 \n\t"
+ "punpckhbh $f14, $f12, $f0 \n\t"
+ "punpcklbh $f12, $f12, $f0 \n\t"
+ "gssqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "psubh $f24, $f16, $f20 \n\t"
+ "psubh $f26, $f18, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+ "pcmpgth $f0, $f4, $f24 \n\t"
+ "pcmpgth $f2, $f6, $f26 \n\t"
+ "psubh $f24, $f12, $f16 \n\t"
+ "psubh $f26, $f14, $f18 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+ "pcmpgth $f28, $f8, $f24 \n\t"
+ "pcmpgth $f30, $f10, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x50(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f20 \n\t"
+ "psubh $f26, $f26, $f22 \n\t"
+ "and $f0, $f0, $f28 \n\t"
+ "and $f2, $f2, $f30 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+ "dmfc1 %[iAlpha], $f20 \n\t"
+ "dmfc1 %[iBeta], $f22 \n\t"
+ "pcmpgth $f28, $f8, $f24 \n\t"
+ "pcmpgth $f30, $f10, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x30(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 0x40(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f20 \n\t"
+ "psubh $f26, $f26, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
+ "pcmpgth $f4, $f4, $f24 \n\t"
+ "pcmpgth $f6, $f6, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x60(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 0x30(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f20 \n\t"
+ "psubh $f26, $f26, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
+ "and $f0, $f0, $f28 \n\t"
+ "and $f2, $f2, $f30 \n\t"
+ "pcmpgth $f28, $f8, $f24 \n\t"
+ "pcmpgth $f30, $f10, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 0x40(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f20 \n\t"
+ "psubh $f26, $f26, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
+ "dli $8, 0x2 \n\t"
+ "and $f4, $f4, $f28 \n\t"
+ "and $f6, $f6, $f30 \n\t"
+ "pcmpgth $f8, $f8, $f24 \n\t"
+ "pcmpgth $f10, $f10, $f26 \n\t"
+ "and $f4, $f4, $f8 \n\t"
+ "and $f6, $f6, $f10 \n\t"
+ "dmtc1 $8, $f8 \n\t"
+ "punpcklhw $f24, $f8, $f8 \n\t"
+ "punpcklwd $f8, $f24, $f24 \n\t"
+ "mov.d $f10, $f8 \n\t"
+ "gssqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
+ "paddh $f8, $f12, $f12 \n\t"
+ "paddh $f10, $f14, $f14 \n\t"
+ "paddh $f8, $f8, $f16 \n\t"
+ "paddh $f10, $f10, $f18 \n\t"
+ "gslqc1 $f22, $f20, 0x50(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f20 \n\t"
+ "paddh $f10, $f10, $f22 \n\t"
+ "gslqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f24 \n\t"
+ "paddh $f10, $f10, $f26 \n\t"
+ "dmtc1 $8, $f20 \n\t"
+ "psrah $f8, $f8, $f20 \n\t"
+ "psrah $f10, $f10, $f20 \n\t"
+ "and $f24, $f0, $f8 \n\t"
+ "and $f26, $f2, $f10 \n\t"
+ "pandn $f8, $f0, $f16 \n\t"
+ "pandn $f10, $f2, $f18 \n\t"
+ "or $f24, $f24, $f8 \n\t"
+ "or $f26, $f26, $f10 \n\t"
+ "gslqc1 $f10, $f8, 0x60(%[tmp]) \n\t"
+ "paddh $f28, $f8, $f8 \n\t"
+ "paddh $f30, $f10, $f10 \n\t"
+ "gslqc1 $f22, $f20, 0x30(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f20 \n\t"
+ "paddh $f30, $f30, $f22 \n\t"
+ "gslqc1 $f18, $f16, 0x70(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f16 \n\t"
+ "paddh $f30, $f30, $f18 \n\t"
+ "gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f8 \n\t"
+ "paddh $f30, $f30, $f10 \n\t"
+ "pandn $f8, $f4, $f20 \n\t"
+ "pandn $f10, $f6, $f22 \n\t"
+ "dmtc1 $8, $f20 \n\t"
+ "psrah $f28, $f28, $f20 \n\t"
+ "psrah $f30, $f30, $f20 \n\t"
+ "and $f16, $f4, $f28 \n\t"
+ "and $f18, $f6, $f30 \n\t"
+ "or $f16, $f16, $f8 \n\t"
+ "or $f18, $f18, $f10 \n\t"
+ "gslqc1 $f10, $f8, 0x50(%[tmp]) \n\t"
+ "packushb $f24, $f24, $f26 \n\t"
+ "packushb $f26, $f16, $f18 \n\t"
+ "gssqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
+ "paddh $f24, $f8, $f8 \n\t"
+ "paddh $f26, $f10, $f10 \n\t"
+ "dmtc1 %[iAlpha], $f20 \n\t"
+ "dmtc1 %[iBeta], $f22 \n\t"
+ "gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "paddh $f24, $f24, $f12 \n\t"
+ "paddh $f26, $f26, $f14 \n\t"
+ "mov.d $f16, $f0 \n\t"
+ "mov.d $f18, $f2 \n\t"
+ "pandn $f0, $f0, $f20 \n\t"
+ "pandn $f2, $f2, $f22 \n\t"
+ "dmtc1 $8, $f20 \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+ "psrah $f24, $f24, $f20 \n\t"
+ "psrah $f26, $f26, $f20 \n\t"
+ "and $f16, $f16, $f24 \n\t"
+ "and $f18, $f18, $f26 \n\t"
+ "or $f16, $f16, $f0 \n\t"
+ "or $f18, $f18, $f2 \n\t"
+ "gslqc1 $f2, $f0, 0x70(%[tmp]) \n\t"
+ "paddh $f20, $f0, $f0 \n\t"
+ "paddh $f22, $f2, $f2 \n\t"
+ "gslqc1 $f2, $f0, 0x40(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f0 \n\t"
+ "paddh $f22, $f22, $f2 \n\t"
+ "gslqc1 $f14, $f12, 0x60(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f12 \n\t"
+ "paddh $f22, $f22, $f14 \n\t"
+ "paddh $f20, $f20, $f8 \n\t"
+ "paddh $f22, $f22, $f10 \n\t"
+ "dmtc1 $8, $f8 \n\t"
+ "psrah $f20, $f20, $f8 \n\t"
+ "psrah $f22, $f22, $f8 \n\t"
+ "and $f12, $f4, $f20 \n\t"
+ "and $f14, $f6, $f22 \n\t"
+ "pandn $f4, $f4, $f0 \n\t"
+ "pandn $f6, $f6, $f2 \n\t"
+ "or $f12, $f12, $f4 \n\t"
+ "or $f14, $f14, $f6 \n\t"
+ "packushb $f16, $f16, $f18 \n\t"
+ "packushb $f18, $f12, $f14 \n\t"
+ "gssqc1 $f18, $f16, 0xa0(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 0x0($11) \n\t"
+ "gslqc1 $f6, $f4, 0x10($11) \n\t"
+ "gslqc1 $f10, $f8, 0x20($11) \n\t"
+ "gslqc1 $f14, $f12, 0x30($11) \n\t"
+ "mov.d $f26, $f2 \n\t"
+ "punpckhbh $f2, $f0, $f4 \n\t"
+ "punpcklbh $f0, $f0, $f4 \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+ "mov.d $f30, $f10 \n\t"
+ "punpckhbh $f10, $f8, $f12 \n\t"
+ "punpcklbh $f8, $f8, $f12 \n\t"
+ "punpcklbh $f28, $f30, $f14 \n\t"
+ "punpckhbh $f30, $f30, $f14 \n\t"
+ "punpcklhw $f16, $f2, $f10 \n\t"
+ "punpckhhw $f18, $f2, $f10 \n\t"
+ "punpcklhw $f20, $f26, $f30 \n\t"
+ "punpckhhw $f22, $f26, $f30 \n\t"
+ "punpckhhw $f2, $f0, $f8 \n\t"
+ "punpcklhw $f0, $f0, $f8 \n\t"
+ "punpckhhw $f26, $f24, $f28 \n\t"
+ "punpcklhw $f24, $f24, $f28 \n\t"
+ "punpcklwd $f4, $f2, $f26 \n\t"
+ "punpckhwd $f6, $f2, $f26 \n\t"
+ "punpcklwd $f8, $f18, $f22 \n\t"
+ "punpckhwd $f10, $f18, $f22 \n\t"
+ "punpckhwd $f2, $f0, $f24 \n\t"
+ "punpcklwd $f0, $f0, $f24 \n\t"
+ "punpckhwd $f18, $f16, $f20 \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f20, $f2 \n\t"
+ "mov.d $f24, $f6 \n\t"
+ "mov.d $f2, $f16 \n\t"
+ "mov.d $f22, $f18 \n\t"
+ "mov.d $f6, $f8 \n\t"
+ "mov.d $f26, $f10 \n\t"
+ "dli %[iAlpha], 0x20 \n\t"
+ "dmtc1 %[iAlpha], $f8 \n\t"
+ "gsswlc1 $f0, 0x3($9) \n\t"
+ "gsswrc1 $f0, 0x0($9) \n\t"
+ "daddu $12, $9, %[iStride] \n\t"
+ "gsswlc1 $f20, 0x3($12) \n\t"
+ "gsswrc1 $f20, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsswlc1 $f4, 0x3($12) \n\t"
+ "gsswrc1 $f4, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsswlc1 $f24, 0x3($12) \n\t"
+ "gsswrc1 $f24, 0x0($12) \n\t"
+ "dsrl $f0, $f0, $f8 \n\t"
+ "dsrl $f20, $f20, $f8 \n\t"
+ "dsrl $f4, $f4, $f8 \n\t"
+ "dsrl $f24, $f24, $f8 \n\t"
+ "gsswlc1 $f0, 0x3($10) \n\t"
+ "gsswrc1 $f0, 0x0($10) \n\t"
+ "daddu $13, $10, %[iStride] \n\t"
+ "daddu $8, $13, %[iStride] \n\t"
+ "gsswlc1 $f20, 0x3($13) \n\t"
+ "gsswrc1 $f20, 0x0($13) \n\t"
+ "daddu $13, $8, %[iStride] \n\t"
+ "gsswlc1 $f4, 0x3($8) \n\t"
+ "gsswrc1 $f4, 0x0($8) \n\t"
+ "gsswlc1 $f24, 0x3($13) \n\t"
+ "gsswrc1 $f24, 0x0($13) \n\t"
+ "gsswlc1 $f2, 0x3(%[pPixCb]) \n\t"
+ "gsswrc1 $f2, 0x0(%[pPixCb]) \n\t"
+ "daddu $12, %[pPixCb], %[iStride] \n\t"
+ "gsswlc1 $f22, 0x3($12) \n\t"
+ "gsswrc1 $f22, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsswlc1 $f6, 0x3($12) \n\t"
+ "gsswrc1 $f6, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsswlc1 $f26, 0x3($12) \n\t"
+ "gsswrc1 $f26, 0x0($12) \n\t"
+ "dsrl $f2, $f2, $f8 \n\t"
+ "dsrl $f22, $f22, $f8 \n\t"
+ "dsrl $f6, $f6, $f8 \n\t"
+ "dsrl $f26, $f26, $f8 \n\t"
+ "gsswlc1 $f2, 0x3(%[pPixCr]) \n\t"
+ "gsswrc1 $f2, 0x0(%[pPixCr]) \n\t"
+ "daddu $13, %[pPixCr], %[iStride] \n\t"
+ "daddu $8, $13, %[iStride] \n\t"
+ "gsswlc1 $f22, 0x3($13) \n\t"
+ "gsswrc1 $f22, 0x0($13) \n\t"
+ "daddu $13, $8, %[iStride] \n\t"
+ "gsswlc1 $f6, 0x3($8) \n\t"
+ "gsswrc1 $f6, 0x0($8) \n\t"
+ "gsswlc1 $f26, 0x3($13) \n\t"
+ "gsswrc1 $f26, 0x0($13) \n\t"
+ : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
+ : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
+ [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
+ "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
+ "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+void DeblockChromaLt4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+ int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
+ unsigned char tmp[320] __attribute__((aligned(32)));
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "daddiu %[pPixCb], %[pPixCb], -0x2 \n\t"
+ "daddiu %[pPixCr], %[pPixCr], -0x2 \n\t"
+ "daddu $8, %[pPixCb], %[iStride] \n\t"
+ "gsldlc1 $f0, 0x7(%[pPixCb]) \n\t"
+ "gsldlc1 $f4, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pPixCb]) \n\t"
+ "gsldrc1 $f4, 0x0($8) \n\t"
+ "daddu $9, $8, %[iStride] \n\t"
+ "daddu $8, $9, %[iStride] \n\t"
+ "gsldlc1 $f8, 0x7($9) \n\t"
+ "gsldlc1 $f12, 0x7($8) \n\t"
+ "gsldrc1 $f8, 0x0($9) \n\t"
+ "gsldrc1 $f12, 0x0($8) \n\t"
+ "daddu $9, $8, %[iStride] \n\t"
+
+ "daddu $10, %[pPixCr], %[iStride] \n\t"
+ "gsldlc1 $f16, 0x7(%[pPixCr]) \n\t"
+ "gsldlc1 $f20, 0x7($10) \n\t"
+ "gsldrc1 $f16, 0x0(%[pPixCr]) \n\t"
+ "gsldrc1 $f20, 0x0($10) \n\t"
+ "daddu $11, $10, %[iStride] \n\t"
+ "daddu $10, $11, %[iStride] \n\t"
+ "gsldlc1 $f24, 0x7($11) \n\t"
+ "gsldlc1 $f28, 0x7($10) \n\t"
+ "gsldrc1 $f24, 0x0($11) \n\t"
+ "gsldrc1 $f28, 0x0($10) \n\t"
+ "daddu $11, $10, %[iStride] \n\t"
+
+ "punpcklwd $f0, $f0, $f16 \n\t"
+ "punpcklwd $f4, $f4, $f20 \n\t"
+ "punpcklwd $f8, $f8, $f24 \n\t"
+ "punpcklwd $f12, $f12, $f28 \n\t"
+ "gsldlc1 $f16, 0x7($9) \n\t"
+ "gsldlc1 $f20, 0x7($11) \n\t"
+ "gsldrc1 $f16, 0x0($9) \n\t"
+ "gsldrc1 $f20, 0x0($11) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f2, $f16 \n\t"
+ "daddu $8, $9, %[iStride] \n\t"
+ "daddu $10, $11, %[iStride] \n\t"
+ "gsldlc1 $f16, 0x7($8) \n\t"
+ "gsldlc1 $f20, 0x7($10) \n\t"
+ "gsldrc1 $f16, 0x0($8) \n\t"
+ "gsldrc1 $f20, 0x0($10) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f6, $f16 \n\t"
+ "daddu $9, $8, %[iStride] \n\t"
+ "daddu $11, $10, %[iStride] \n\t"
+
+ "gsldlc1 $f16, 0x7($9) \n\t"
+ "gsldlc1 $f20, 0x7($11) \n\t"
+ "gsldrc1 $f16, 0x0($9) \n\t"
+ "gsldrc1 $f20, 0x0($11) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f10, $f16 \n\t"
+ "daddu $8, $9, %[iStride] \n\t"
+ "daddu $10, $11, %[iStride] \n\t"
+
+ "gsldlc1 $f16, 0x7($8) \n\t"
+ "gsldlc1 $f20, 0x7($10) \n\t"
+ "gsldrc1 $f16, 0x0($8) \n\t"
+ "gsldrc1 $f20, 0x0($10) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f14, $f16 \n\t"
+
+ "punpcklbh $f24, $f2, $f6 \n\t"
+ "punpckhbh $f26, $f2, $f6 \n\t"
+ "punpckhbh $f2, $f0, $f4 \n\t"
+ "punpcklbh $f0, $f0, $f4 \n\t"
+ "punpcklbh $f28, $f10, $f14 \n\t"
+ "punpckhbh $f30, $f10, $f14 \n\t"
+ "punpckhbh $f10, $f8, $f12 \n\t"
+ "punpcklbh $f8, $f8, $f12 \n\t"
+
+ "punpcklhw $f16, $f2, $f10 \n\t"
+ "punpckhhw $f18, $f2, $f10 \n\t"
+ "punpckhhw $f2, $f0, $f8 \n\t"
+ "punpcklhw $f0, $f0, $f8 \n\t"
+ "punpcklhw $f20, $f26, $f30 \n\t"
+ "punpckhhw $f22, $f26, $f30 \n\t"
+ "punpckhhw $f26, $f24, $f28 \n\t"
+ "punpcklhw $f24, $f24, $f28 \n\t"
+
+ "punpcklwd $f4, $f2, $f26 \n\t"
+ "punpckhwd $f6, $f2, $f26 \n\t"
+ "punpckhwd $f2, $f0, $f24 \n\t"
+ "punpcklwd $f0, $f0, $f24 \n\t"
+ "punpcklwd $f8, $f18, $f22 \n\t"
+ "punpckhwd $f10, $f18, $f22 \n\t"
+ "punpckhwd $f18, $f16, $f20 \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+
+ "mov.d $f20, $f2 \n\t"
+ "mov.d $f22, $f18 \n\t"
+ "mov.d $f2, $f16 \n\t"
+ "mov.d $f24, $f6 \n\t"
+ "mov.d $f26, $f10 \n\t"
+ "mov.d $f6, $f8 \n\t"
+ "daddiu $11, %[tmp], 0x70 \n\t"
+
+ "gssqc1 $f2, $f0, 0x0($11) \n\t"
+ "gssqc1 $f22, $f20, 0x10($11) \n\t"
+ "gssqc1 $f6, $f4, 0x20($11) \n\t"
+ "gssqc1 $f26, $f24, 0x30($11) \n\t"
+
+ "lb $8, 0x3(%[pTC]) \n\t"
+ "lb $9, 0x2(%[pTC]) \n\t"
+ "lb $10, 0x1(%[pTC]) \n\t"
+ "lb $11, 0x0(%[pTC]) \n\t"
+
+ "and $12, $8, 0xFFFF \n\t"
+ "dmtc1 $12, $f8 \n\t"
+
+ "and $9, $9, 0xFFFF \n\t"
+ "dmtc1 $9, $f12 \n\t"
+ "mov.d $f16, $f12 \n\t"
+
+ "and $9, $10, 0xFFFF \n\t"
+ "dmtc1 $9, $f20 \n\t"
+ "xor $f0, $f0, $f0 \n\t"
+ "mov.d $f24, $f20 \n\t"
+ "and $9, $11, 0xFFFF \n\t"
+ "punpcklhw $f24, $f24, $f8 \n\t"
+
+ "mov.d $f4, $f8 \n\t"
+ "dmtc1 $9, $f28 \n\t"
+ "mov.d $f0, $f28 \n\t"
+
+ "punpcklhw $f28, $f28, $f12 \n\t"
+ "punpcklhw $f20, $f20, $f4 \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "punpcklhw $f28, $f28, $f20 \n\t"
+ "gslqc1 $f22, $f20, 0xA0(%[tmp]) \n\t"
+ "punpcklhw $f0, $f0, $f16 \n\t"
+ "punpcklhw $f0, $f0, $f24 \n\t"
+
+ "gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
+ "punpckhhw $f2, $f0, $f28 \n\t"
+ "punpcklhw $f0, $f0, $f28 \n\t"
+ "gslqc1 $f30, $f28, 0x80(%[tmp]) \n\t"
+ "psubh $f8, $f4, $f0 \n\t"
+ "psubh $f10, $f6, $f2 \n\t"
+ "gssqc1 $f10, $f8, 0xD0(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f8 \n\t"
+ "punpcklhw $f12, $f8, $f8 \n\t"
+ "punpcklwd $f16, $f12, $f12 \n\t"
+ "mov.d $f18, $f16 \n\t"
+
+ "dmtc1 %[iBeta], $f8 \n\t"
+ "punpcklhw $f12, $f8, $f8 \n\t"
+ "punpcklwd $f8, $f12, $f12 \n\t"
+ "mov.d $f10, $f8 \n\t"
+
+ "gslqc1 $f14, $f12, 0x90(%[tmp]) \n\t"
+ "gssqc1 $f10, $f8, 0x50(%[tmp]) \n\t"
+ "punpckhbh $f10, $f24, $f4 \n\t"
+ "punpcklbh $f8, $f24, $f4 \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+
+ "gssqc1 $f10, $f8, 0x40(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 0xB0(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
+ "punpcklbh $f8, $f28, $f4 \n\t"
+ "punpckhbh $f10, $f28, $f4 \n\t"
+ "punpcklbh $f28, $f30, $f6 \n\t"
+ "punpckhbh $f30, $f30, $f6 \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+ "punpckhbh $f14, $f12, $f4 \n\t"
+ "punpcklbh $f12, $f12, $f4 \n\t"
+ "punpckhbh $f22, $f20, $f4 \n\t"
+ "punpcklbh $f20, $f20, $f4 \n\t"
+ "gssqc1 $f30, $f28, 0xF0(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 0xC0(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0xA0(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+
+ "dli $13, 0x4 \n\t"
+ "gssqc1 $f26, $f24, 0xE0(%[tmp]) \n\t"
+ "dmtc1 $13, $f24 \n\t"
+ "punpcklhw $f28, $f24, $f24 \n\t"
+ "punpcklwd $f24, $f28, $f28 \n\t"
+ "mov.d $f26, $f24 \n\t"
+ "dli $12, 0x2 \n\t"
+ "dli $13, 0x3 \n\t"
+
+ "gssqc1 $f2, $f0, 0x20(%[tmp]) \n\t"
+ "dmfc1 %[iAlpha], $f0 \n\t"
+ "dmfc1 %[iBeta], $f2 \n\t"
+ "gssqc1 $f26, $f24, 0x30(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 0x40(%[tmp]) \n\t"
+ "psubh $f28, $f28, $f20 \n\t"
+ "psubh $f30, $f30, $f22 \n\t"
+ "pcmpgth $f24, $f0, $f4 \n\t"
+ "pcmpgth $f26, $f2, $f6 \n\t"
+
+ "dmtc1 $12, $f0 \n\t"
+ "dmtc1 $13, $f2 \n\t"
+ "gssqc1 $f26, $f24, 0x60(%[tmp]) \n\t"
+ "gslqc1 $f6, $f4, 0xD0(%[tmp]) \n\t"
+ "psubh $f24, $f12, $f8 \n\t"
+ "psubh $f26, $f14, $f10 \n\t"
+ "psllh $f24, $f24, $f0 \n\t"
+ "psllh $f26, $f26, $f0 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "psrah $f24, $f24, $f2 \n\t"
+ "psrah $f26, $f26, $f2 \n\t"
+ "pmaxsh $f4, $f4, $f24 \n\t"
+ "pmaxsh $f6, $f6, $f26 \n\t"
+
+ "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
+ "pminsh $f24, $f24, $f4 \n\t"
+ "pminsh $f26, $f26, $f6 \n\t"
+
+ "gssqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
+ "psubh $f4, $f8, $f12 \n\t"
+ "psubh $f6, $f10, $f14 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
+ "pcmpgth $f24, $f16, $f4 \n\t"
+ "pcmpgth $f26, $f18, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
+ "psubh $f4, $f4, $f8 \n\t"
+ "psubh $f6, $f6, $f10 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
+ "pcmpgth $f28, $f28, $f4 \n\t"
+ "pcmpgth $f30, $f30, $f6 \n\t"
+
+ "gslqc1 $f6, $f4, 0x50(%[tmp]) \n\t"
+ "and $f24, $f24, $f28 \n\t"
+ "and $f26, $f26, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "psubh $f20, $f20, $f12 \n\t"
+ "psubh $f22, $f22, $f14 \n\t"
+ WELS_AbsH($f20, $f22, $f20, $f22, $f0, $f2)
+ "pcmpgth $f4, $f4, $f20 \n\t"
+ "pcmpgth $f6, $f6, $f22 \n\t"
+
+ "gslqc1 $f22, $f20, 0xB0(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 0xE0(%[tmp]) \n\t"
+ "psubh $f20, $f20, $f0 \n\t"
+ "psubh $f22, $f22, $f2 \n\t"
+ "and $f24, $f24, $f4 \n\t"
+ "and $f26, $f26, $f6 \n\t"
+ "gslqc1 $f2, $f0, 0x60(%[tmp]) \n\t"
+ "and $f24, $f24, $f0 \n\t"
+ "and $f26, $f26, $f2 \n\t"
+
+ "gslqc1 $f6, $f4, 0x20(%[tmp]) \n\t"
+ "and $f4, $f4, $f24 \n\t"
+ "and $f6, $f6, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0xC0(%[tmp]) \n\t"
+ "gssqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
+ "gslqc1 $f6, $f4, 0xF0(%[tmp]) \n\t"
+
+ "dmtc1 $12, $f0 \n\t"
+ "psubh $f24, $f24, $f4 \n\t"
+ "psubh $f26, $f26, $f6 \n\t"
+ "psllh $f24, $f24, $f0 \n\t"
+ "psllh $f26, $f26, $f0 \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "gslqc1 $f2, $f0, 0x30(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f0 \n\t"
+ "paddh $f26, $f26, $f2 \n\t"
+ "dmtc1 %[iBeta], $f2 \n\t"
+
+ "dmtc1 $13, $f0 \n\t"
+ "gslqc1 $f22, $f20, 0xD0(%[tmp]) \n\t"
+ "psrah $f24, $f24, $f0 \n\t"
+ "psrah $f26, $f26, $f0 \n\t"
+ "dmtc1 %[iAlpha], $f0 \n\t"
+ "pmaxsh $f20, $f20, $f24 \n\t"
+ "pmaxsh $f22, $f22, $f26 \n\t"
+ "pminsh $f0, $f0, $f20 \n\t"
+ "pminsh $f2, $f2, $f22 \n\t"
+
+ "dmfc1 %[iAlpha], $f0 \n\t"
+ "dmfc1 %[iBeta], $f2 \n\t"
+ "gslqc1 $f22, $f20, 0xC0(%[tmp]) \n\t"
+ "psubh $f24, $f4, $f20 \n\t"
+ "psubh $f26, $f6, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+ "pcmpgth $f16, $f16, $f24 \n\t"
+ "pcmpgth $f18, $f18, $f26 \n\t"
+
+ "gslqc1 $f26, $f24, 0xB0(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f4 \n\t"
+ "psubh $f26, $f26, $f6 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+ "pcmpgth $f28, $f28, $f24 \n\t"
+ "pcmpgth $f30, $f30, $f26 \n\t"
+
+ "gslqc1 $f26, $f24, 0xE0(%[tmp]) \n\t"
+ "and $f16, $f16, $f28 \n\t"
+ "and $f18, $f18, $f30 \n\t"
+
+ "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f20 \n\t"
+ "psubh $f26, $f26, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+ "pcmpgth $f28, $f28, $f24 \n\t"
+ "pcmpgth $f30, $f30, $f26 \n\t"
+ "and $f16, $f16, $f28 \n\t"
+ "and $f18, $f18, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x60(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f0 \n\t"
+ "dmtc1 %[iBeta], $f2 \n\t"
+ "and $f16, $f16, $f28 \n\t"
+ "and $f18, $f18, $f30 \n\t"
+ "and $f0, $f0, $f16 \n\t"
+ "and $f2, $f2, $f18 \n\t"
+
+ "gslqc1 $f18, $f16, 0x40(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f16 \n\t"
+ "paddh $f10, $f10, $f18 \n\t"
+ "paddh $f4, $f4, $f0 \n\t"
+ "paddh $f6, $f6, $f2 \n\t"
+ "psubh $f12, $f12, $f16 \n\t"
+ "psubh $f14, $f14, $f18 \n\t"
+ "psubh $f20, $f20, $f0 \n\t"
+ "psubh $f22, $f22, $f2 \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f4, $f6 \n\t"
+ "packushb $f12, $f12, $f14 \n\t"
+ "packushb $f14, $f20, $f22 \n\t"
+
+ "gssqc1 $f10, $f8, 0x80(%[tmp]) \n\t"
+ "gssqc1 $f14, $f12, 0x90(%[tmp]) \n\t"
+ "daddiu $11, %[tmp], 0x70 \n\t"
+
+ "gslqc1 $f2, $f0, 0x0($11) \n\t"
+ "gslqc1 $f6, $f4, 0x10($11) \n\t"
+ "gslqc1 $f10, $f8, 0x20($11) \n\t"
+ "gslqc1 $f14, $f12, 0x30($11) \n\t"
+
+ "punpcklbh $f24, $f2, $f6 \n\t"
+ "punpckhbh $f26, $f2, $f6 \n\t"
+ "punpckhbh $f2, $f0, $f4 \n\t"
+ "punpcklbh $f0, $f0, $f4 \n\t"
+
+ "punpcklbh $f28, $f10, $f14 \n\t"
+ "punpckhbh $f30, $f10, $f14 \n\t"
+ "punpckhbh $f10, $f8, $f12 \n\t"
+ "punpcklbh $f8, $f8, $f12 \n\t"
+
+ "punpcklhw $f16, $f2, $f10 \n\t"
+ "punpckhhw $f18, $f2, $f10 \n\t"
+ "punpckhhw $f2, $f0, $f8 \n\t"
+ "punpcklhw $f0, $f0, $f8 \n\t"
+ "punpcklhw $f20, $f26, $f30 \n\t"
+ "punpckhhw $f22, $f26, $f30 \n\t"
+ "punpckhhw $f26, $f24, $f28 \n\t"
+ "punpcklhw $f24, $f24, $f28 \n\t"
+
+ "punpcklwd $f4, $f2, $f26 \n\t"
+ "punpckhwd $f6, $f2, $f26 \n\t"
+ "punpckhwd $f2, $f0, $f24 \n\t"
+ "punpcklwd $f0, $f0, $f24 \n\t"
+ "punpcklwd $f8, $f18, $f22 \n\t"
+ "punpckhwd $f10, $f18, $f22 \n\t"
+ "punpckhwd $f18, $f16, $f20 \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+
+ "mov.d $f20, $f2 \n\t"
+ "mov.d $f22, $f18 \n\t"
+ "mov.d $f2, $f16 \n\t"
+ "mov.d $f24, $f6 \n\t"
+ "mov.d $f26, $f10 \n\t"
+ "mov.d $f6, $f8 \n\t"
+
+ "dli %[iAlpha], 0x20 \n\t"
+ "daddu $8, %[pPixCb], %[iStride] \n\t"
+ "gsswlc1 $f0, 0x3(%[pPixCb]) \n\t"
+ "gsswlc1 $f20, 0x3($8) \n\t"
+ "gsswrc1 $f0, 0x0(%[pPixCb]) \n\t"
+ "gsswrc1 $f20, 0x0($8) \n\t"
+ "daddu $9, $8, %[iStride] \n\t"
+ "daddu $8, $9, %[iStride] \n\t"
+ "gsswlc1 $f4, 0x3($9) \n\t"
+ "gsswlc1 $f24, 0x3($8) \n\t"
+ "gsswrc1 $f4, 0x0($9) \n\t"
+ "gsswrc1 $f24, 0x0($8) \n\t"
+ "daddu $9, $8, %[iStride] \n\t"
+ "dmtc1 %[iAlpha], $f8 \n\t"
+
+ "dsrl $f0, $f0, $f8 \n\t"
+ "dsrl $f20, $f20, $f8 \n\t"
+ "dsrl $f4, $f4, $f8 \n\t"
+ "dsrl $f24, $f24, $f8 \n\t"
+ "daddu $10, %[pPixCr], %[iStride] \n\t"
+ "gsswlc1 $f0, 0x3(%[pPixCr]) \n\t"
+ "gsswlc1 $f20, 0x3($10) \n\t"
+ "gsswrc1 $f0, 0x0(%[pPixCr]) \n\t"
+ "gsswrc1 $f20, 0x0($10) \n\t"
+ "daddu $11, $10, %[iStride] \n\t"
+ "daddu $10, $11, %[iStride] \n\t"
+ "gsswlc1 $f4, 0x3($11) \n\t"
+ "gsswlc1 $f24, 0x3($10) \n\t"
+ "gsswrc1 $f4, 0x0($11) \n\t"
+ "gsswrc1 $f24, 0x0($10) \n\t"
+ "daddu $11, $10, %[iStride] \n\t"
+
+ "daddu $8, $9, %[iStride] \n\t"
+ "gsswlc1 $f2, 0x3($9) \n\t"
+ "gsswlc1 $f22, 0x3($8) \n\t"
+ "gsswrc1 $f2, 0x0($9) \n\t"
+ "gsswrc1 $f22, 0x0($8) \n\t"
+ "daddu $9, $8, %[iStride] \n\t"
+ "daddu $8, $9, %[iStride] \n\t"
+ "gsswlc1 $f6, 0x3($9) \n\t"
+ "gsswlc1 $f26, 0x3($8) \n\t"
+ "gsswrc1 $f6, 0x0($9) \n\t"
+ "gsswrc1 $f26, 0x0($8) \n\t"
+
+ "dsrl $f2, $f2, $f8 \n\t"
+ "dsrl $f22, $f22, $f8 \n\t"
+ "dsrl $f6, $f6, $f8 \n\t"
+ "dsrl $f26, $f26, $f8 \n\t"
+ "daddu $10, $11, %[iStride] \n\t"
+ "gsswlc1 $f2, 0x3($11) \n\t"
+ "gsswlc1 $f22, 0x3($10) \n\t"
+ "gsswrc1 $f2, 0x0($11) \n\t"
+ "gsswrc1 $f22, 0x0($10) \n\t"
+ "daddu $11, $10, %[iStride] \n\t"
+ "daddu $10, $11, %[iStride] \n\t"
+ "gsswlc1 $f6, 0x3($11) \n\t"
+ "gsswlc1 $f26, 0x3($10) \n\t"
+ "gsswrc1 $f6, 0x0($11) \n\t"
+ "gsswrc1 $f26, 0x0($10) \n\t"
+ : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
+ : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
+ [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp), [pTC]"r"((char *)pTC)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
+ "$f6", "$f8", "$f10", "$f12","$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
+ "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
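+
+/* For reference only, a scalar sketch (author's illustration, not part of the
+ * build) of the per-sample H.264 bS<4 chroma filter that DeblockChromaLt4H_mmi
+ * vectorises above, assuming the codec's WELS_CLIP3/WelsClip1 helpers. For
+ * each sample pair across the edge, gated on the per-edge tc derived from pTC
+ * and on the alpha/beta sample tests:
+ *
+ *   int32_t iD = WELS_CLIP3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3,
+ *                            -iTc, iTc);
+ *   p0 = WelsClip1 (p0 + iD);
+ *   q0 = WelsClip1 (q0 - iD);
+ */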
+
+void WelsNonZeroCount_mmi(int8_t *pNonZeroCount) {
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "gsldlc1 $f0, 0x7(%[pNonZeroCount]) \n\t"
+ "gsldlc1 $f2, 0xF(%[pNonZeroCount]) \n\t"
+ "gsldlc1 $f4, 0x17(%[pNonZeroCount]) \n\t"
+ "gsldrc1 $f4, 0x10(%[pNonZeroCount]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pNonZeroCount]) \n\t"
+ "gsldrc1 $f2, 0x8(%[pNonZeroCount]) \n\t"
+ "pcmpeqh $f8, $f8, $f8 \n\t"
+ "dli $8, 0xF \n\t"
+ "dmtc1 $8, $f6 \n\t"
+ "psrlh $f8, $f8, $f6 \n\t"
+ "packushb $f8, $f8, $f8 \n\t"
+
+ "pminub $f0, $f0, $f8 \n\t"
+ "pminub $f2, $f2, $f8 \n\t"
+ "pminub $f4, $f4, $f8 \n\t"
+ "gssdlc1 $f0, 0x7(%[pNonZeroCount]) \n\t"
+ "gssdlc1 $f2, 0xF(%[pNonZeroCount]) \n\t"
+ "gssdlc1 $f4, 0x17(%[pNonZeroCount]) \n\t"
+ "gssdrc1 $f0, 0x0(%[pNonZeroCount]) \n\t"
+ "gssdrc1 $f2, 0x8(%[pNonZeroCount]) \n\t"
+ "gssdrc1 $f4, 0x10(%[pNonZeroCount]) \n\t"
+ :
+ : [pNonZeroCount] "r"((unsigned char *)pNonZeroCount)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8"
+ );
+}
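+
+/* WelsNonZeroCount_mmi above saturates each of the 24 non-zero-count bytes to
+ * 1 in one pass (the pminub against a vector of 0x01 bytes). A plain-C sketch
+ * with an illustrative name:
+ *
+ *   void WelsNonZeroCountRef (int8_t* pNonZeroCount) {
+ *     for (int32_t i = 0; i < 24; i++)
+ *       pNonZeroCount[i] = !!pNonZeroCount[i];
+ *   }
+ */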
--- /dev/null
+++ b/codec/common/mips/expand_picture_mmi.c
@@ -1,0 +1,673 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2018, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file expand_picture_mmi.c
+ *
+ * \brief Loongson optimization
+ *
+ * \date 24/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
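+/* The _aligned store macros below use single gssdxc1/gssqc1 stores and so
+ * require 8/16-byte-aligned destinations; the _unaligned variants split each
+ * store into a gssdlc1/gssdrc1 (store-left/store-right) pair, which works at
+ * any address at the cost of twice as many store instructions per line. */
+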
+#define mov_line_8x4_mmi_aligned(r0, r1, f0) \
+ "gssdxc1 "#f0", 0x0("#r0", $0) \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdxc1 "#f0", 0x0("#r0", $0) \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdxc1 "#f0", 0x0("#r0", $0) \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdxc1 "#f0", 0x0("#r0", $0) \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t"
+
+#define mov_line_8x4_mmi_unaligned(r0, r1, f0) \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t"
+
+#define mov_line_end8x4_mmi_aligned(r0, r1, f0) \
+ "gssdxc1 "#f0", 0x0("#r0", $0) \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdxc1 "#f0", 0x0("#r0", $0) \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdxc1 "#f0", 0x0("#r0", $0) \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdxc1 "#f0", 0x0("#r0", $0) \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t"
+
+#define mov_line_end8x4_mmi_unaligned(r0, r1, f0) \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+
+#define mov_line_16x4_mmi_aligned(r0, r1, f0, f2) \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t"
+
+#define mov_line_16x4_mmi_unaligned(r0, r1, f0, f2) \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdlc1 "#f2", 0xF("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ "gssdrc1 "#f2", 0x8("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdlc1 "#f2", 0xF("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ "gssdrc1 "#f2", 0x8("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdlc1 "#f2", 0xF("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ "gssdrc1 "#f2", 0x8("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdlc1 "#f2", 0xF("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ "gssdrc1 "#f2", 0x8("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t"
+
+#define mov_line_end16x4_mmi_aligned(r0, r1, f0, f2) \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t"
+
+#define mov_line_end16x4_mmi_unaligned(r0, r1, f0, f2) \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdlc1 "#f2", 0xF("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ "gssdrc1 "#f2", 0x8("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdlc1 "#f2", 0xF("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ "gssdrc1 "#f2", 0x8("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdlc1 "#f2", 0xF("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ "gssdrc1 "#f2", 0x8("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdlc1 "#f2", 0xF("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ "gssdrc1 "#f2", 0x8("#r0") \n\t" \
+
+#define exp_top_bottom_mmi_32 \
+ "dsra %[iWidth], %[iWidth], 0x4 \n\t" \
+ "1: \n\t" \
+ "gslqc1 $f2, $f0, 0x0(%[pDst]) \n\t" \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ mov_line_end16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ "gslqc1 $f6, $f4, 0x0(%[iHeight]) \n\t" \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_end16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ PTR_ADDIU "%[pDst], %[pDst], 0x10 \n\t" \
+ PTR_ADDIU "$9, $9, 0x10 \n\t" \
+ PTR_ADDIU "%[iHeight], %[iHeight], 0x10 \n\t" \
+ PTR_ADDIU "$11, $11, 0x10 \n\t" \
+ "dnegu %[iStride], %[iStride] \n\t" \
+ PTR_ADDIU "%[iWidth], %[iWidth], -0x1 \n\t" \
+ "bnez %[iWidth], 1b \n\t" \
+ "nop \n\t"
+
+#define exp_left_right_mmi_32 \
+ "2: \n\t" \
+ "lbu %[iWidth], 0x0(%[pDst]) \n\t" \
+ MMI_Copy16Times($f0, $f2, $f28, %[iWidth]) \
+ "gssqc1 $f2, $f0, 0x0($9) \n\t" \
+ "gssqc1 $f2, $f0, 0x10($9) \n\t" \
+ "lbu %[iWidth], 0x0(%[iHeight]) \n\t" \
+ MMI_Copy16Times($f4, $f6, $f28, %[iWidth]) \
+ "gssqc1 $f6, $f4, 0x0($11) \n\t" \
+ "gssqc1 $f6, $f4, 0x10($11) \n\t" \
+ PTR_ADDU "%[pDst], %[pDst], %[iStride] \n\t" \
+ PTR_ADDU "$9, $9, %[iStride] \n\t" \
+ PTR_ADDU "%[iHeight], %[iHeight], %[iStride] \n\t" \
+ PTR_ADDU "$11, $11, %[iStride] \n\t" \
+ PTR_ADDIU "$8, $8, -0x1 \n\t" \
+ "bnez $8, 2b \n\t" \
+ "nop \n\t"
+
+#define mov_line_32x4_mmi(r0, r1, f0, f2) \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ "gssqc1 "#f2", "#f0", 0x10("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ "gssqc1 "#f2", "#f0", 0x10("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ "gssqc1 "#f2", "#f0", 0x10("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ "gssqc1 "#f2", "#f0", 0x10("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t"
+
+#define mov_line_end32x4_mmi(r0, r1, f0, f2) \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ "gssqc1 "#f2", "#f0", 0x10("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ "gssqc1 "#f2", "#f0", 0x10("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ "gssqc1 "#f2", "#f0", 0x10("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ "gssqc1 "#f2", "#f0", 0x10("#r0") \n\t"
+
+#define exp_cross_mmi_32 \
+ mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_end32x4_mmi(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_32x4_mmi($11, %[iStride], $f16, $f18) \
+ mov_line_32x4_mmi($11, %[iStride], $f16, $f18) \
+ mov_line_32x4_mmi($11, %[iStride], $f16, $f18) \
+ mov_line_32x4_mmi($11, %[iStride], $f16, $f18) \
+ mov_line_32x4_mmi($11, %[iStride], $f16, $f18) \
+ mov_line_32x4_mmi($11, %[iStride], $f16, $f18) \
+ mov_line_32x4_mmi($11, %[iStride], $f16, $f18) \
+ mov_line_end32x4_mmi($11, %[iStride], $f16, $f18) \
+ mov_line_32x4_mmi($9, %[iStride], $f20, $f22) \
+ mov_line_32x4_mmi($9, %[iStride], $f20, $f22) \
+ mov_line_32x4_mmi($9, %[iStride], $f20, $f22) \
+ mov_line_32x4_mmi($9, %[iStride], $f20, $f22) \
+ mov_line_32x4_mmi($9, %[iStride], $f20, $f22) \
+ mov_line_32x4_mmi($9, %[iStride], $f20, $f22) \
+ mov_line_32x4_mmi($9, %[iStride], $f20, $f22) \
+ mov_line_end32x4_mmi($9, %[iStride], $f20, $f22) \
+ mov_line_32x4_mmi($8, %[iStride], $f24, $f26) \
+ mov_line_32x4_mmi($8, %[iStride], $f24, $f26) \
+ mov_line_32x4_mmi($8, %[iStride], $f24, $f26) \
+ mov_line_32x4_mmi($8, %[iStride], $f24, $f26) \
+ mov_line_32x4_mmi($8, %[iStride], $f24, $f26) \
+ mov_line_32x4_mmi($8, %[iStride], $f24, $f26) \
+ mov_line_32x4_mmi($8, %[iStride], $f24, $f26) \
+ mov_line_end32x4_mmi($8, %[iStride], $f24, $f26)
+
+#define exp_top_bottom_mmi_16_aligned \
+ "move $8, %[iWidth] \n\t" \
+ "dsra %[iWidth], %[iWidth], 0x4 \n\t" \
+ "1: \n\t" \
+ "gslqc1 $f2, $f0, 0x0(%[pDst]) \n\t" \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ mov_line_end16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ "gslqc1 $f6, $f4, 0x0(%[iHeight]) \n\t" \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_end16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ PTR_ADDIU "%[pDst], %[pDst], 0x10 \n\t" \
+ PTR_ADDIU "$9, $9, 0x10 \n\t" \
+ PTR_ADDIU "%[iHeight], %[iHeight], 0x10 \n\t" \
+ PTR_ADDIU "$11, $11, 0x10 \n\t" \
+ "dnegu %[iStride], %[iStride] \n\t" \
+ PTR_ADDIU "%[iWidth], %[iWidth], -0x1 \n\t" \
+ "bnez %[iWidth], 1b \n\t" \
+ "nop \n\t" \
+ "and $8, 0x0F \n\t" \
+ "beqz $8, 2f \n\t" \
+ "nop \n\t" \
+ "gsldxc1 $f0, 0x0(%[pDst], $0) \n\t" \
+ mov_line_8x4_mmi_aligned($9, %[iStride], $f0) \
+ mov_line_8x4_mmi_aligned($9, %[iStride], $f0) \
+ mov_line_8x4_mmi_aligned($9, %[iStride], $f0) \
+ mov_line_end8x4_mmi_aligned($9, %[iStride], $f0) \
+ "gsldxc1 $f4, 0x0(%[iHeight], $0) \n\t" \
+ mov_line_8x4_mmi_aligned($11, %[iStride], $f4) \
+ mov_line_8x4_mmi_aligned($11, %[iStride], $f4) \
+ mov_line_8x4_mmi_aligned($11, %[iStride], $f4) \
+ mov_line_end8x4_mmi_aligned($11, %[iStride], $f4) \
+ "2: \n\t"
+
+#define exp_top_bottom_mmi_16_unaligned \
+ "move $8, %[iWidth] \n\t" \
+ "dsra %[iWidth], %[iWidth], 0x4 \n\t" \
+ "1: \n\t" \
+ "gsldlc1 $f0, 0x7(%[pDst]) \n\t" \
+ "gsldlc1 $f2, 0xF(%[pDst]) \n\t" \
+ "gsldrc1 $f0, 0x0(%[pDst]) \n\t" \
+ "gsldrc1 $f2, 0x8(%[pDst]) \n\t" \
+ mov_line_16x4_mmi_unaligned($9, %[iStride], $f0, $f2) \
+ mov_line_16x4_mmi_unaligned($9, %[iStride], $f0, $f2) \
+ mov_line_16x4_mmi_unaligned($9, %[iStride], $f0, $f2) \
+ mov_line_end16x4_mmi_unaligned($9, %[iStride], $f0, $f2) \
+ "gsldlc1 $f4, 0x7(%[iHeight]) \n\t" \
+ "gsldlc1 $f6, 0xF(%[iHeight]) \n\t" \
+ "gsldrc1 $f4, 0x0(%[iHeight]) \n\t" \
+ "gsldrc1 $f6, 0x8(%[iHeight]) \n\t" \
+ mov_line_16x4_mmi_unaligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_unaligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_unaligned($11, %[iStride], $f4, $f6) \
+ mov_line_end16x4_mmi_unaligned($11, %[iStride], $f4, $f6) \
+ PTR_ADDIU "%[pDst], %[pDst], 0x10 \n\t" \
+ PTR_ADDIU "$9, $9, 0x10 \n\t" \
+ PTR_ADDIU "%[iHeight], %[iHeight], 0x10 \n\t" \
+ PTR_ADDIU "$11, $11, 0x10 \n\t" \
+ "dnegu %[iStride], %[iStride] \n\t" \
+ PTR_ADDIU "%[iWidth], %[iWidth], -0x1 \n\t" \
+ "bnez %[iWidth], 1b \n\t" \
+ "nop \n\t" \
+ "and $8, 0x0F \n\t" \
+ "beqz $8, 2f \n\t" \
+ "nop \n\t" \
+ "gsldlc1 $f0, 0x7(%[pDst]) \n\t" \
+ "gsldrc1 $f0, 0x0(%[pDst]) \n\t" \
+ mov_line_8x4_mmi_unaligned($9, %[iStride], $f0) \
+ mov_line_8x4_mmi_unaligned($9, %[iStride], $f0) \
+ mov_line_8x4_mmi_unaligned($9, %[iStride], $f0) \
+ mov_line_end8x4_mmi_unaligned($9, %[iStride], $f0) \
+ "gsldlc1 $f4, 0x7(%[iHeight]) \n\t" \
+ "gsldrc1 $f4, 0x0(%[iHeight]) \n\t" \
+ mov_line_8x4_mmi_unaligned($11, %[iStride], $f4) \
+ mov_line_8x4_mmi_unaligned($11, %[iStride], $f4) \
+ mov_line_8x4_mmi_unaligned($11, %[iStride], $f4) \
+ mov_line_end8x4_mmi_unaligned($11, %[iStride], $f4) \
+ "2: \n\t"
+
+#define exp_left_right_mmi_16_aligned \
+ "3: \n\t" \
+ "lbu %[iWidth], 0x0(%[pDst]) \n\t" \
+ MMI_Copy16Times($f0, $f2, $f28, %[iWidth]) \
+ "gssqc1 $f2, $f0, 0x0($9) \n\t" \
+ "lbu %[iWidth], 0x0(%[iHeight]) \n\t" \
+ MMI_Copy16Times($f4, $f6, $f28, %[iWidth]) \
+ "gssqc1 $f6, $f4, 0x0($11) \n\t" \
+ PTR_ADDU "%[pDst], %[pDst], %[iStride] \n\t" \
+ PTR_ADDU "$9, $9, %[iStride] \n\t" \
+ PTR_ADDU "%[iHeight], %[iHeight], %[iStride] \n\t" \
+ PTR_ADDU "$11, $11, %[iStride] \n\t" \
+ PTR_ADDIU "$8, $8, -0x1 \n\t" \
+ "bnez $8, 3b \n\t" \
+ "nop \n\t"
+
+#define exp_left_right_mmi_16_unaligned \
+ "3: \n\t" \
+ "lbu %[iWidth], 0x0(%[pDst]) \n\t" \
+ MMI_Copy16Times($f0, $f2, $f28, %[iWidth]) \
+ "gssdlc1 $f0, 0x7($9) \n\t" \
+ "gssdlc1 $f2, 0xF($9) \n\t" \
+ "gssdrc1 $f0, 0x0($9) \n\t" \
+ "gssdrc1 $f2, 0x8($9) \n\t" \
+ "lbu %[iWidth], 0x0(%[iHeight]) \n\t" \
+ MMI_Copy16Times($f4, $f6, $f28, %[iWidth]) \
+ "gssdlc1 $f4, 0x7($11) \n\t" \
+ "gssdlc1 $f6, 0xF($11) \n\t" \
+ "gssdrc1 $f4, 0x0($11) \n\t" \
+ "gssdrc1 $f6, 0x8($11) \n\t" \
+ PTR_ADDU "%[pDst], %[pDst], %[iStride] \n\t" \
+ PTR_ADDU "$9, $9, %[iStride] \n\t" \
+ PTR_ADDU "%[iHeight], %[iHeight], %[iStride] \n\t" \
+ PTR_ADDU "$11, $11, %[iStride] \n\t" \
+ PTR_ADDIU "$8, $8, -0x1 \n\t" \
+ "bnez $8, 3b \n\t" \
+ "nop \n\t"
+
+#define exp_cross_mmi_16_aligned \
+ mov_line_16x4_mmi_aligned(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_16x4_mmi_aligned(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_16x4_mmi_aligned(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_end16x4_mmi_aligned(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f16, $f18) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f16, $f18) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f16, $f18) \
+ mov_line_end16x4_mmi_aligned($11, %[iStride], $f16, $f18) \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f20, $f22) \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f20, $f22) \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f20, $f22) \
+ mov_line_end16x4_mmi_aligned($9, %[iStride], $f20, $f22) \
+ mov_line_16x4_mmi_aligned($8, %[iStride], $f24, $f26) \
+ mov_line_16x4_mmi_aligned($8, %[iStride], $f24, $f26) \
+ mov_line_16x4_mmi_aligned($8, %[iStride], $f24, $f26) \
+ mov_line_end16x4_mmi_aligned($8, %[iStride], $f24, $f26)
+
+#define exp_cross_mmi_16_unaligned \
+ mov_line_16x4_mmi_unaligned(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_16x4_mmi_unaligned(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_16x4_mmi_unaligned(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_end16x4_mmi_unaligned(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_16x4_mmi_unaligned($11, %[iStride], $f16, $f18) \
+ mov_line_16x4_mmi_unaligned($11, %[iStride], $f16, $f18) \
+ mov_line_16x4_mmi_unaligned($11, %[iStride], $f16, $f18) \
+ mov_line_end16x4_mmi_unaligned($11, %[iStride], $f16, $f18) \
+ mov_line_16x4_mmi_unaligned($9, %[iStride], $f20, $f22) \
+ mov_line_16x4_mmi_unaligned($9, %[iStride], $f20, $f22) \
+ mov_line_16x4_mmi_unaligned($9, %[iStride], $f20, $f22) \
+ mov_line_end16x4_mmi_unaligned($9, %[iStride], $f20, $f22) \
+ mov_line_16x4_mmi_unaligned($8, %[iStride], $f24, $f26) \
+ mov_line_16x4_mmi_unaligned($8, %[iStride], $f24, $f26) \
+ mov_line_16x4_mmi_unaligned($8, %[iStride], $f24, $f26) \
+ mov_line_end16x4_mmi_unaligned($8, %[iStride], $f24, $f26)
+
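+/* Border expansion below runs in three phases, matching the generic C
+ * implementation: exp_top_bottom_* replicates the first/last picture row into
+ * the padding rows above/below, exp_left_right_* replicates each row's edge
+ * pixels sideways, and exp_cross_* fills the four corner blocks from the
+ * corner pixels. Luma pads 32 pixels per side (the dsll by 0x5 and the -0x20
+ * offsets), chroma pads 16. */
+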
+void ExpandPictureLuma_mmi(uint8_t *pDst, int32_t iStride, int32_t iWidth,
+ int32_t iHeight) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "lbu $8, 0x0(%[pDst]) \n\t"
+
+ MMI_Copy16Times($f12, $f14, $f28, $8)
+
+ "dnegu %[iStride], %[iStride] \n\t"
+ PTR_ADDU "$9, %[pDst], %[iStride] \n\t"
+ "dnegu %[iStride], %[iStride] \n\t"
+ "move $10, %[iHeight] \n\t"
+ PTR_ADDU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "dmul %[iHeight], %[iHeight], %[iStride] \n\t"
+ PTR_ADDU "%[iHeight], %[iHeight], %[pDst] \n\t"
+
+ "move $8, %[iStride] \n\t"
+ "dsll $8, 0x5 \n\t"
+ PTR_ADDU "$11, %[iHeight], $8 \n\t"
+
+ "lbu $8, 0x0(%[iHeight]) \n\t"
+ MMI_Copy16Times($f20, $f22, $f28, $8)
+ PTR_ADDU "$8, %[iHeight], %[iWidth] \n\t"
+ PTR_ADDIU "$8, -0x1 \n\t"
+ "lbu $8, 0x0($8) \n\t"
+ "dmtc1 $8, $f24 \n\t"
+ "pshufh $f24, $f24, $f28 \n\t"
+ "packushb $f24, $f24, $f24 \n\t"
+ "mov.d $f26, $f24 \n\t"
+ "dnegu %[iStride], %[iStride] \n\t"
+ "move $12, %[pDst] \n\t"
+ "move $13, %[iStride] \n\t"
+ "move $14, %[iWidth] \n\t"
+ exp_top_bottom_mmi_32
+ "move %[iWidth], $14 \n\t"
+ "move %[iStride], $13 \n\t"
+ "move %[pDst], $12 \n\t"
+ PTR_ADDIU "$9, %[pDst], -0x20 \n\t"
+ PTR_ADDU "%[iHeight], %[pDst], %[iWidth] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ PTR_ADDIU "$11, %[iHeight], 0x1 \n\t"
+ "lbu $8, 0x0(%[iHeight]) \n\t"
+ MMI_Copy16Times($f16, $f18, $f28, $8)
+ "dnegu %[iStride], %[iStride] \n\t"
+ "move $8, $10 \n\t"
+ "move $10, %[pDst] \n\t"
+ "move $12, %[iStride] \n\t"
+ "move $13, %[iWidth] \n\t"
+ "move $14, $8 \n\t"
+
+ exp_left_right_mmi_32
+
+ "move $8, $14 \n\t"
+ "move %[iWidth], $13 \n\t"
+ "move %[iStride], $12 \n\t"
+ "move %[pDst], $10 \n\t"
+ "dnegu %[iStride], %[iStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[pDst], -0x20 \n\t"
+ PTR_ADDU "%[iHeight], %[iHeight], %[iStride] \n\t"
+ PTR_ADDU "$11, %[pDst], %[iWidth] \n\t"
+ PTR_ADDU "$11, $11, %[iStride] \n\t"
+ "dnegu %[iStride], %[iStride] \n\t"
+ PTR_ADDIU "$8, $8, 0x20 \n\t"
+ "dmul $8, $8, %[iStride] \n\t"
+ PTR_ADDU "$9, %[iHeight], $8 \n\t"
+ PTR_ADDU "$8, $11, $8 \n\t"
+ "dnegu %[iStride], %[iStride] \n\t"
+ exp_cross_mmi_32
+ : [pDst]"+&r"((unsigned char *)pDst), [iStride]"+&r"((int)iStride),
+ [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
+ :
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
+ "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+ "$f22", "$f24", "$f26", "$f28"
+ );
+ RECOVER_REG;
+}
+
+void ExpandPictureChromaUnalign_mmi(uint8_t *pDst, int32_t iStride, int32_t iWidth,
+ int32_t iHeight) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "lbu $8, 0x0(%[pDst]) \n\t"
+
+ MMI_Copy16Times($f12, $f14, $f28, $8)
+
+ "dnegu %[iStride], %[iStride] \n\t"
+ PTR_ADDU "$9, %[pDst], %[iStride] \n\t"
+ "dnegu %[iStride], %[iStride] \n\t"
+ "move $10, %[iHeight] \n\t"
+ PTR_ADDU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "dmul %[iHeight], %[iHeight], %[iStride] \n\t"
+ PTR_ADDU "%[iHeight], %[iHeight], %[pDst] \n\t"
+ "move $8, %[iStride] \n\t"
+ "dsll $8, 0x4 \n\t"
+ PTR_ADDU "$11, %[iHeight], $8 \n\t"
+ "lbu $8, 0x0(%[iHeight]) \n\t"
+
+ MMI_Copy16Times($f20, $f22, $f28, $8)
+
+ PTR_ADDU "$8, %[iHeight], %[iWidth] \n\t"
+ PTR_ADDIU "$8, -0x1 \n\t"
+ "lbu $8, 0x0($8) \n\t"
+
+ MMI_Copy16Times($f24, $f26, $f28, $8)
+
+ "dnegu %[iStride], %[iStride] \n\t"
+ "move $12, %[pDst] \n\t"
+ "move $13, %[iStride] \n\t"
+ "move $14, %[iWidth] \n\t"
+
+ exp_top_bottom_mmi_16_unaligned
+
+ "move %[iWidth], $14 \n\t"
+ "move %[iStride], $13 \n\t"
+ "move %[pDst], $12 \n\t"
+ PTR_ADDIU "$9, %[pDst], -0x10 \n\t"
+ PTR_ADDU "%[iHeight], %[pDst], %[iWidth] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ PTR_ADDIU "$11, %[iHeight], 0x1 \n\t"
+ "lbu $8, 0x0(%[iHeight]) \n\t"
+ MMI_Copy16Times($f16, $f18, $f28, $8)
+
+ "dnegu %[iStride], %[iStride] \n\t"
+ "move $8, $10 \n\t"
+
+ "move $10, %[pDst] \n\t"
+ "move $12, %[iStride] \n\t"
+ "move $13, %[iWidth] \n\t"
+ "move $14, $8 \n\t"
+
+ exp_left_right_mmi_16_unaligned
+
+ "move $8, $14 \n\t"
+ "move %[iWidth], $13 \n\t"
+ "move %[iStride], $12 \n\t"
+ "move %[pDst], $10 \n\t"
+
+ "dnegu %[iStride], %[iStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[pDst], -0x10 \n\t"
+ PTR_ADDU "%[iHeight], %[iHeight], %[iStride] \n\t"
+ PTR_ADDU "$11, %[pDst], %[iWidth] \n\t"
+ PTR_ADDU "$11, $11, %[iStride] \n\t"
+
+ "dnegu %[iStride], %[iStride] \n\t"
+ PTR_ADDIU "$8, $8, 0x10 \n\t"
+ "dmul $8, $8, %[iStride] \n\t"
+
+ PTR_ADDU "$9, %[iHeight], $8 \n\t"
+ PTR_ADDU "$8, $11, $8 \n\t"
+ "dnegu %[iStride], %[iStride] \n\t"
+
+ exp_cross_mmi_16_unaligned
+ : [pDst]"+&r"((unsigned char *)pDst), [iStride]"+&r"((int)iStride),
+ [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
+ :
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
+ "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+ "$f22", "$f24", "$f26", "$f28"
+ );
+ RECOVER_REG;
+}
+
+void ExpandPictureChromaAlign_mmi(uint8_t *pDst, int32_t iStride, int32_t iWidth,
+ int32_t iHeight) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "lbu $8, 0x0(%[pDst]) \n\t"
+
+ MMI_Copy16Times($f12, $f14, $f28, $8)
+
+ "dnegu %[iStride], %[iStride] \n\t"
+ PTR_ADDU "$9, %[pDst], %[iStride] \n\t"
+ "dnegu %[iStride], %[iStride] \n\t"
+ "move $10, %[iHeight] \n\t"
+ PTR_ADDU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "dmul %[iHeight], %[iHeight], %[iStride] \n\t"
+ PTR_ADDU "%[iHeight], %[iHeight], %[pDst] \n\t"
+ "move $8, %[iStride] \n\t"
+ "dsll $8, 0x4 \n\t"
+ PTR_ADDU "$11, %[iHeight], $8 \n\t"
+ "lbu $8, 0x0(%[iHeight]) \n\t"
+
+ MMI_Copy16Times($f20, $f22, $f28, $8)
+
+ PTR_ADDU "$8, %[iHeight], %[iWidth] \n\t"
+ PTR_ADDIU "$8, -0x1 \n\t"
+ "lbu $8, 0x0($8) \n\t"
+
+ MMI_Copy16Times($f24, $f26, $f28, $8)
+
+ "dnegu %[iStride], %[iStride] \n\t"
+
+ "move $12, %[pDst] \n\t"
+ "move $13, %[iStride] \n\t"
+ "move $14, %[iWidth] \n\t"
+ exp_top_bottom_mmi_16_aligned
+
+ "move %[iWidth], $14 \n\t"
+ "move %[iStride], $13 \n\t"
+ "move %[pDst], $12 \n\t"
+
+ PTR_ADDIU "$9, %[pDst], -0x10 \n\t"
+
+ PTR_ADDU "%[iHeight], %[pDst], %[iWidth] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ PTR_ADDIU "$11, %[iHeight], 0x1 \n\t"
+
+ "lbu $8, 0x0(%[iHeight]) \n\t"
+
+ MMI_Copy16Times($f16, $f18, $f28, $8)
+
+ "dnegu %[iStride], %[iStride] \n\t"
+ "move $8, $10 \n\t"
+
+ "move $10, %[pDst] \n\t"
+ "move $12, %[iStride] \n\t"
+ "move $13, %[iWidth] \n\t"
+ "move $14, $8 \n\t"
+
+ exp_left_right_mmi_16_aligned
+
+ "move $8, $14 \n\t"
+ "move %[iWidth], $13 \n\t"
+ "move %[iStride], $12 \n\t"
+ "move %[pDst], $10 \n\t"
+
+ "dnegu %[iStride], %[iStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[pDst], -0x10 \n\t"
+ PTR_ADDU "%[iHeight], %[iHeight], %[iStride] \n\t"
+ PTR_ADDU "$11, %[pDst], %[iWidth] \n\t"
+ PTR_ADDU "$11, $11, %[iStride] \n\t"
+
+ "dnegu %[iStride], %[iStride] \n\t"
+ PTR_ADDIU "$8, $8, 0x10 \n\t"
+ "dmul $8, $8, %[iStride] \n\t"
+
+ PTR_ADDU "$9, %[iHeight], $8 \n\t"
+ PTR_ADDU "$8, $11, $8 \n\t"
+ "dnegu %[iStride], %[iStride] \n\t"
+
+ exp_cross_mmi_16_aligned
+ : [pDst]"+&r"((unsigned char *)pDst), [iStride]"+&r"((int)iStride),
+ [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
+ :
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
+ "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+ "$f22", "$f24", "$f26", "$f28"
+ );
+ RECOVER_REG;
+}
--- /dev/null
+++ b/codec/common/mips/intra_pred_com_mmi.c
@@ -1,0 +1,548 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2018, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file intra_pred_com_mmi.c
+ *
+ * \brief Loongson optimization
+ *
+ * \date 23/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+#define MMI_PRED_H_16X16_ONE_LINE \
+ PTR_ADDIU "%[pPred], %[pPred], 0x10 \n\t" \
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t" \
+ "lbu $8, 0x0(%[pRef]) \n\t" \
+ MMI_Copy16Times($f0, $f2, $f4, $8) \
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+
+#define LOAD_2_LEFT_AND_ADD \
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t" \
+ "lbu $9, -0x1(%[pRef]) \n\t" \
+ PTR_ADDU "$8, $8, $9 \n\t" \
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t" \
+ "lbu $9, -0x1(%[pRef]) \n\t" \
+ PTR_ADDU "$8, $8, $9 \n\t"
+
+//For MMI_PRED_H_8X8_ONE_LINE: f2 must hold mmi_01bytes (a vector of 0x01
+//bytes), f4 the shift count 0x38 (56 bits), and f6 zero (the pshufh selector)
+#define MMI_PRED_H_8X8_ONE_LINE(f0, f2, f4, f6, r0, r1, r1_offset) \
+ PTR_ADDU ""#r0", "#r0", %[kiStride] \n\t" \
+ "gsldxc1 "#f0", -0x8("#r0", $0) \n\t" \
+ "dsrl "#f0", "#f0", "#f4" \n\t" \
+ "pmullh "#f0", "#f0", "#f2" \n\t" \
+ "pshufh "#f0", "#f0", "#f6" \n\t" \
+ "gssdxc1 "#f0", "#r1_offset"+0x0("#r1", $0) \n\t"
+
+void WelsI16x16LumaPredV_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ PTR_SUBU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pRef]) \n\t"
+
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x10(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x20(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x30(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x40(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x50(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x60(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x70(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x80(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x90(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xa0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xb0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xc0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xd0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xe0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xf0(%[pPred]) \n\t"
+ : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+ : [kiStride]"r"((int)kiStride)
+ : "memory", "$f0", "$f2"
+ );
+}
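+
+/* Equivalent behaviour in plain C (a sketch; the prediction buffer is stored
+ * contiguously with stride 16):
+ *
+ *   const uint8_t* pTop = pRef - kiStride;
+ *   for (int32_t i = 0; i < 16; i++)
+ *     memcpy (pPred + (i << 4), pTop, 16);
+ */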
+
+void WelsI16x16LumaPredH_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ PTR_ADDIU "%[pRef], %[pRef], -0x1 \n\t"
+ "lbu $8, 0x0(%[pRef]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ MMI_Copy16Times($f0, $f2, $f4, $8)
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+ : [kiStride]"r"((int)kiStride)
+ : "memory", "$8", "$f0", "$f2", "$f4"
+ );
+}
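+
+/* Equivalent behaviour in plain C (a sketch): every row is filled with its
+ * left-neighbour pixel.
+ *
+ *   for (int32_t i = 0; i < 16; i++)
+ *     memset (pPred + (i << 4), pRef[i * kiStride - 1], 16);
+ */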
+
+void WelsI16x16LumaPredDc_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+ unsigned char mmi_01bytes[16]__attribute__((aligned(16))) =
+ {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ PTR_SUBU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pRef]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "pasubub $f0, $f0, $f4 \n\t"
+ "pasubub $f2, $f2, $f4 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f0, $f0, $f2 \n\t"
+
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $8, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+
+ "dli $10, 0x5 \n\t"
+ "dmtc1 $10, $f6 \n\t"
+ PTR_ADDIU "$8, 0x10 \n\t"
+ "dmtc1 $8, $f4 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "psrlw $f0, $f0, $f6 \n\t"
+ "gsldxc1 $f6, 0x0(%[mmi_01bytes], $0) \n\t"
+ "pmuluw $f0, $f0, $f6 \n\t"
+ "punpcklwd $f0, $f0, $f0 \n\t"
+ "mov.d $f2, $f0 \n\t"
+
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x10(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x20(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x30(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x40(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x50(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x60(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x70(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x80(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x90(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xa0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xb0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xc0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xd0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xe0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xf0(%[pPred]) \n\t"
+ : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+ : [kiStride]"r"((int)kiStride), [mmi_01bytes]"r"((unsigned char *)mmi_01bytes)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6"
+ );
+}
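+
+/* The DC value above follows the both-neighbours-available rule:
+ * iDc = (sum of 16 top pixels + sum of 16 left pixels + 16) >> 5, after which
+ * the pmuluw against mmi_01bytes broadcasts the byte across the 16x16
+ * buffer. */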
+
+void WelsI16x16LumaPredPlane_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+ short mmi_plane_inc_minus[8]__attribute__((aligned(16))) = {-7, -6, -5, -4,
+ -3, -2, -1, 0};
+ short mmi_plane_inc[8]__attribute__((aligned(16))) = {1, 2, 3, 4, 5, 6, 7, 8};
+ short mmi_plane_dec[8]__attribute__((aligned(16))) = {8, 7, 6, 5, 4, 3, 2, 1};
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ PTR_ADDIU "%[pRef], %[pRef], -0x1 \n\t"
+ PTR_SUBU "%[pRef], %[pRef], %[kiStride] \n\t"
+
+ "gsldlc1 $f0, 0x7(%[pRef]) \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "gsldrc1 $f0, 0x0(%[pRef]) \n\t"
+ "gslqc1 $f22, $f20, 0x0(%[mmi_plane_dec]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "gsldlc1 $f4, 0x10(%[pRef]) \n\t"
+ "pmullh $f0, $f0, $f20 \n\t"
+ "pmullh $f2, $f2, $f22 \n\t"
+ "gsldrc1 $f4, 0x9(%[pRef]) \n\t"
+ "gslqc1 $f26, $f24, 0x0(%[mmi_plane_inc]) \n\t"
+ "punpckhbh $f6, $f4, $f28 \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "pmullh $f4, $f4, $f24 \n\t"
+ "pmullh $f6, $f6, $f26 \n\t"
+ "psubh $f4, $f4, $f0 \n\t"
+ "psubh $f6, $f6, $f2 \n\t"
+
+ "xor $f8, $f8, $f8 \n\t"
+ SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
+ "dmfc1 $8, $f4 \n\t"
+ "seh $8, $8 \n\t"
+ "mul $8, $8, 0x5 \n\t"
+ PTR_ADDIU "$8, $8, 0x20 \n\t"
+ "sra $8, $8, 0x6 \n\t"
+ MMI_Copy8Times($f4, $f6, $f28, $8)
+
+ "lbu $9, 0x10(%[pRef]) \n\t"
+ PTR_ADDIU "%[pRef], %[pRef], -0x3 \n\t"
+ LOAD_COLUMN($f0, $f2, $f8, $f10, $f12, $f14, $f16,
+ $f18, %[pRef], %[kiStride], $11)
+
+ PTR_ADDIU "%[pRef], %[pRef], 0x3 \n\t"
+ "dsll $10, %[kiStride], 0x3 \n\t"
+ PTR_ADDU "$10, $10, %[pRef] \n\t"
+ "lbu $8, 0x0($10) \n\t"
+ PTR_ADDU "$9, $9, $8 \n\t"
+ "dsll $9, $9, 0x4 \n\t"
+
+ PTR_ADDIU "%[pRef], %[pRef], -0x3 \n\t"
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ LOAD_COLUMN($f28, $f30, $f8, $f10, $f12, $f14, $f16,
+ $f18, %[pRef], %[kiStride], $11)
+ "xor $f16, $f16, $f16 \n\t"
+ "xor $f18, $f18, $f18 \n\t"
+ "punpcklbh $f0, $f2, $f18 \n\t"
+ "punpckhbh $f2, $f2, $f18 \n\t"
+ "pmullh $f0, $f0, $f20 \n\t"
+ "pmullh $f2, $f2, $f22 \n\t"
+ "punpcklbh $f28, $f30, $f18 \n\t"
+ "punpckhbh $f30, $f30, $f18 \n\t"
+ "pmullh $f28, $f28, $f24 \n\t"
+ "pmullh $f30, $f30, $f26 \n\t"
+ "psubh $f28, $f28, $f0 \n\t"
+ "psubh $f30, $f30, $f2 \n\t"
+
+ SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
+ "dmfc1 $8, $f28 \n\t"
+ "seh $8, $8 \n\t"
+ "mul $8, $8, 0x5 \n\t"
+ PTR_ADDIU "$8, $8, 0x20 \n\t"
+ "sra $8, $8, 0x6 \n\t"
+ "xor $f20, $f20, $f20 \n\t"
+ MMI_Copy8Times($f16, $f18, $f20, $8)
+
+ PTR_ADDIU "$9, $9, 0x10 \n\t"
+ "mul $8, $8, -0x7 \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ "xor $f20, $f20, $f20 \n\t"
+ MMI_Copy8Times($f0, $f2, $f20, $8)
+
+ "xor $8, $8, $8 \n\t"
+ "gslqc1 $f22, $f20, 0x0(%[mmi_plane_inc_minus]) \n\t"
+
+ "dli $10, 0x5 \n\t"
+ "dmtc1 $10, $f30 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ "1: \n\t"
+ "pmullh $f8, $f4, $f20 \n\t"
+ "pmullh $f10, $f6, $f22 \n\t"
+ "paddh $f8, $f8, $f0 \n\t"
+ "paddh $f10, $f10, $f2 \n\t"
+ "psrah $f8, $f8, $f30 \n\t"
+ "psrah $f10, $f10, $f30 \n\t"
+ "pmullh $f12, $f4, $f24 \n\t"
+ "pmullh $f14, $f6, $f26 \n\t"
+ "paddh $f12, $f12, $f0 \n\t"
+ "paddh $f14, $f14, $f2 \n\t"
+ "psrah $f12, $f12, $f30 \n\t"
+ "psrah $f14, $f14, $f30 \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f12, $f14 \n\t"
+ "gssqc1 $f10, $f8, 0x0(%[pPred]) \n\t"
+ "paddh $f0, $f0, $f16 \n\t"
+ "paddh $f2, $f2, $f18 \n\t"
+ PTR_ADDIU "%[pPred], %[pPred], 0x10 \n\t"
+ PTR_ADDIU "$8, $8, 0x1 \n\t"
+ PTR_ADDIU "$10, $8, -0x10 \n\t"
+ "bnez $10, 1b \n\t"
+ "nop \n\t"
+ : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+ : [kiStride]"r"((int)kiStride), [mmi_plane_inc_minus]"r"(mmi_plane_inc_minus),
+ [mmi_plane_inc]"r"(mmi_plane_inc), [mmi_plane_dec]"r"(mmi_plane_dec)
+ : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
+ "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
+ "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
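+
+/* Plane prediction as in the H.264 spec: with H and V the weighted sums of
+ * the top and left neighbour differences, the code above computes
+ * iB = (5*H + 32) >> 6 and iC = (5*V + 32) >> 6 (the mul-by-5, add-0x20,
+ * sra-6 sequences) and iA = 16 * (pTop[15] + pLeft[15]); each sample is then
+ * Clip1 ((iA + iB*(x-7) + iC*(y-7) + 16) >> 5), evaluated a row at a time
+ * with vectorised multiply-adds and packushb doing the final clip. */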
+
+void WelsIChromaPredPlane_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+ short mmi_plane_inc_c[4]__attribute__((aligned(16))) = {1, 2, 3, 4};
+ short mmi_plane_dec_c[4]__attribute__((aligned(16))) = {4, 3, 2, 1};
+ short mmi_plane_mul_b_c[8]__attribute__((aligned(16))) = {-3, -2, -1, 0,
+ 1, 2, 3, 4};
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ PTR_ADDIU "%[pRef], %[pRef], -0x1 \n\t"
+ PTR_SUBU "%[pRef], %[pRef], %[kiStride] \n\t"
+
+ "gsldlc1 $f0, 0x7(%[pRef]) \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "gsldrc1 $f0, 0x0(%[pRef]) \n\t"
+ "gsldxc1 $f20, 0x0(%[mmi_plane_dec_c], $0) \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "gsldlc1 $f4, 0xc(%[pRef]) \n\t"
+ "pmullh $f0, $f0, $f20 \n\t"
+ "gsldrc1 $f4, 0x5(%[pRef]) \n\t"
+ "gsldxc1 $f24, 0x0(%[mmi_plane_inc_c], $0) \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "pmullh $f4, $f4, $f24 \n\t"
+ "psubh $f4, $f4, $f0 \n\t"
+
+ "xor $f6, $f6, $f6 \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+ SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
+ "dmfc1 $8, $f4 \n\t"
+ "seh $8, $8 \n\t"
+ "mul $8, $8, 0x11 \n\t"
+ PTR_ADDIU "$8, $8, 0x10 \n\t"
+ "sra $8, $8, 0x5 \n\t"
+ MMI_Copy8Times($f4, $f6, $f28, $8)
+
+ "lbu $8, 0x8(%[pRef]) \n\t"
+ PTR_ADDIU "%[pRef], %[pRef], -0x3 \n\t"
+ LOAD_COLUMN_C($f0, $f8, $f12, $f16, %[pRef], %[kiStride], $10)
+
+ PTR_ADDIU "%[pRef], %[pRef], 0x3 \n\t"
+ "dsll $10, %[kiStride], 0x2 \n\t"
+ PTR_ADDU "$10, $10, %[pRef] \n\t"
+ "lbu $9, 0x0($10) \n\t"
+ PTR_ADDU "$9, $9, $8 \n\t"
+ "dsll $9, $9, 0x4 \n\t"
+
+ PTR_ADDIU "%[pRef], %[pRef], -0x3 \n\t"
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ LOAD_COLUMN_C($f28, $f8, $f12, $f16, %[pRef], %[kiStride], $10)
+ "xor $f16, $f16, $f16 \n\t"
+ "punpckhbh $f0, $f0, $f16 \n\t"
+ "pmullh $f0, $f0, $f20 \n\t"
+ "punpckhbh $f28, $f28, $f16 \n\t"
+ "pmullh $f28, $f28, $f24 \n\t"
+ "psubh $f28, $f28, $f0 \n\t"
+
+ "xor $f30, $f30, $f30 \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+ SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
+ "dmfc1 $8, $f28 \n\t"
+ "seh $8, $8 \n\t"
+ "mul $8, $8, 0x11 \n\t"
+ PTR_ADDIU "$8, $8, 0x10 \n\t"
+ "sra $8, $8, 0x5 \n\t"
+ MMI_Copy8Times($f16, $f18, $f8, $8)
+
+ PTR_ADDIU "$9, $9, 0x10 \n\t"
+ "mul $8, $8, -0x3 \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ MMI_Copy8Times($f0, $f2, $f8, $8)
+
+ "xor $8, $8, $8 \n\t"
+ "gslqc1 $f22, $f20, 0x0(%[mmi_plane_mul_b_c]) \n\t"
+
+ "dli $10, 0x5 \n\t"
+ "dmtc1 $10, $f30 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+
+ "1: \n\t"
+ "pmullh $f8, $f4, $f20 \n\t"
+ "pmullh $f10, $f6, $f22 \n\t"
+ "paddh $f8, $f8, $f0 \n\t"
+ "paddh $f10, $f10, $f2 \n\t"
+ "psrah $f8, $f8, $f30 \n\t"
+ "psrah $f10, $f10, $f30 \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "gssdxc1 $f8, 0x0(%[pPred], $0) \n\t"
+ "paddh $f0, $f0, $f16 \n\t"
+ "paddh $f2, $f2, $f18 \n\t"
+ PTR_ADDIU "%[pPred], %[pPred], 0x8 \n\t"
+ PTR_ADDIU "$8, $8, 0x1 \n\t"
+ PTR_ADDIU "$10, $8, -0x8 \n\t"
+ "bnez $10, 1b \n\t"
+ "nop \n\t"
+ : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+ : [kiStride]"r"((int)kiStride), [mmi_plane_mul_b_c]"r"(mmi_plane_mul_b_c),
+ [mmi_plane_inc_c]"r"(mmi_plane_inc_c), [mmi_plane_dec_c]"r"(mmi_plane_dec_c)
+ : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
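+
+/* The 8x8 chroma analogue of the luma plane case: here the scaling is
+ * iB = (17*H + 16) >> 5 and iC = (17*V + 16) >> 5 (the mul-by-0x11, add-0x10,
+ * sra-5 sequences), with iA = 16 * (pTop[7] + pLeft[7]). */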
+
+void WelsIChromaPredV_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ PTR_SUBU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "gsldxc1 $f0, 0x0(%[pRef], $0) \n\t"
+ "mov.d $f2, $f0 \n\t"
+
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x10(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x20(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x30(%[pPred]) \n\t"
+ : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+ : [kiStride]"r"((int)kiStride)
+ : "memory", "$f0", "$f2"
+ );
+}
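+
+/* Equivalent behaviour in plain C (a sketch; the 8x8 prediction buffer is
+ * contiguous):
+ *
+ *   const uint8_t* pTop = pRef - kiStride;
+ *   for (int32_t i = 0; i < 8; i++)
+ *     memcpy (pPred + (i << 3), pTop, 8);
+ */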
+
+void WelsIChromaPredDc_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+ short mmi_0x02[4]__attribute__((aligned(16))) = {2, 0, 0, 0};
+ unsigned char mmi_01bytes[16]__attribute__((aligned(16))) =
+ {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ PTR_SUBU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "gsldxc1 $f0, 0x0(%[pRef], $0) \n\t"
+
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $8, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ "dmtc1 $8, $f2 \n\t"
+
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $8, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ "dmtc1 $8, $f4 \n\t"
+
+ "xor $f8, $f8, $f8 \n\t"
+ "punpcklwd $f6, $f0, $f8 \n\t"
+ "punpckhwd $f0, $f0, $f8 \n\t"
+ "pasubub $f0, $f0, $f8 \n\t"
+ "pasubub $f6, $f6, $f8 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f6, $f6 \n\t"
+
+ "dadd $f6, $f6, $f2 \n\t"
+ "dadd $f2, $f4, $f0 \n\t"
+
+ "gsldxc1 $f8, 0x0(%[mmi_0x02], $0) \n\t"
+
+ "dli $10, 0x2 \n\t"
+ "dmtc1 $10, $f10 \n\t"
+ "dadd $f0, $f0, $f8 \n\t"
+ "dsrl $f0, $f0, $f10 \n\t"
+
+ "dadd $f4, $f4, $f8 \n\t"
+ "dsrl $f4, $f4, $f10 \n\t"
+
+ "dli $10, 0x3 \n\t"
+ "dmtc1 $10, $f10 \n\t"
+ "dadd $f6, $f6, $f8 \n\t"
+ "dadd $f6, $f6, $f8 \n\t"
+ "dsrl $f6, $f6, $f10 \n\t"
+
+ "dadd $f2, $f2, $f8 \n\t"
+ "dadd $f2, $f2, $f8 \n\t"
+ "dsrl $f2, $f2, $f10 \n\t"
+
+ "dli $10, 0x20 \n\t"
+ "dmtc1 $10, $f10 \n\t"
+ "gsldxc1 $f12, 0x0(%[mmi_01bytes], $0) \n\t"
+ "pmuluw $f0, $f0, $f12 \n\t"
+ "pmuluw $f6, $f6, $f12 \n\t"
+ "dsll $f0, $f0, $f10 \n\t"
+ "xor $f0, $f0, $f6 \n\t"
+
+ "pmuluw $f4, $f4, $f12 \n\t"
+ "pmuluw $f2, $f2, $f12 \n\t"
+ "dsll $f2, $f2, $f10 \n\t"
+ "xor $f2, $f2, $f4 \n\t"
+
+ "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
+ "gssdxc1 $f0, 0x8(%[pPred], $0) \n\t"
+ "gssdxc1 $f0, 0x10(%[pPred], $0) \n\t"
+ "gssdxc1 $f0, 0x18(%[pPred], $0) \n\t"
+
+ "gssdxc1 $f2, 0x20(%[pPred], $0) \n\t"
+ "gssdxc1 $f2, 0x28(%[pPred], $0) \n\t"
+ "gssdxc1 $f2, 0x30(%[pPred], $0) \n\t"
+ "gssdxc1 $f2, 0x38(%[pPred], $0) \n\t"
+ : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+ : [kiStride]"r"((int)kiStride), [mmi_01bytes]"r"((unsigned char *)mmi_01bytes),
+ [mmi_0x02]"r"((unsigned char *)mmi_0x02)
+ : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12"
+ );
+}
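+
+/* Scalar sketch of the 8x8 chroma DC prediction above (illustrative helper,
+ * same contiguous pPred layout assumed): each 4x4 quadrant is filled with the
+ * rounded mean of its neighboring reference pixels -- top-left and
+ * bottom-right use both edges (+4 >> 3), top-right only the top edge and
+ * bottom-left only the left edge (+2 >> 2), matching the shift constants in
+ * the MMI code. */
+static inline void IChromaPredDc_ref (uint8_t *pPred, uint8_t *pRef,
+                                      int32_t kiStride) {
+  int32_t iTopL = 0, iTopR = 0, iLeftT = 0, iLeftB = 0, i;
+  uint8_t uiDc[4];
+  for (i = 0; i < 4; i++) {
+    iTopL  += pRef[i - kiStride];            /* top edge, columns 0..3 */
+    iTopR  += pRef[i + 4 - kiStride];        /* top edge, columns 4..7 */
+    iLeftT += pRef[i * kiStride - 1];        /* left edge, rows 0..3   */
+    iLeftB += pRef[(i + 4) * kiStride - 1];  /* left edge, rows 4..7   */
+  }
+  uiDc[0] = (uint8_t) ((iTopL + iLeftT + 4) >> 3);
+  uiDc[1] = (uint8_t) ((iTopR + 2) >> 2);
+  uiDc[2] = (uint8_t) ((iLeftB + 2) >> 2);
+  uiDc[3] = (uint8_t) ((iTopR + iLeftB + 4) >> 3);
+  for (i = 0; i < 64; i++)                   /* quadrant = 2*(row>>2)+(col>>2) */
+    pPred[i] = uiDc[((i >> 5) << 1) | ((i & 7) >> 2)];
+}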
+
+void WelsIChromaPredH_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+  unsigned char mmi_01bytes[16] __attribute__((aligned(16))) =
+      {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "gsldxc1 $f2, 0x0(%[mmi_01bytes], $0) \n\t"
+ "dli $8, 0x38 \n\t"
+ "dmtc1 $8, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "gsldxc1 $f0, -0x8(%[pRef], $0) \n\t"
+ "dsrl $f0, $f0, $f4 \n\t"
+
+ "pmullh $f0, $f0, $f2 \n\t"
+ "pshufh $f0, $f0, $f6 \n\t"
+ "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
+
+ MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x8)
+ MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x10)
+ MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x18)
+ MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x20)
+ MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x28)
+ MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x30)
+ MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x38)
+ : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+ : [kiStride]"r"((int)kiStride), [mmi_01bytes]"r"((unsigned char *)mmi_01bytes)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6"
+ );
+}
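+
+/* Scalar sketch of the horizontal prediction above (illustrative helper,
+ * assuming the unseen MMI_PRED_H_8X8_ONE_LINE macro advances pRef by one
+ * stride per row): each row of the 8x8 block is filled with its left
+ * reference pixel, which the MMI code broadcasts with the pmullh/pshufh
+ * pair. */
+static inline void IChromaPredH_ref (uint8_t *pPred, uint8_t *pRef,
+                                     int32_t kiStride) {
+  int32_t i, j;
+  uint8_t uiLeft;
+  for (i = 0; i < 8; i++) {
+    uiLeft = pRef[i * kiStride - 1];         /* pixel left of row i */
+    for (j = 0; j < 8; j++)
+      pPred[(i << 3) + j] = uiLeft;
+  }
+}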
--- /dev/null
+++ b/codec/common/mips/satd_sad_mmi.c
@@ -1,0 +1,2154 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2018, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file satd_sad_mmi.c
+ *
+ * \brief SAD and SATD routines optimized with Loongson MMI instructions
+ *
+ * \date 23/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
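+/* Horizontal reduction of the packed unsigned 16-bit accumulator pair f0/f2:
+ * repeated shift-and-saturating-add folds until the total lands in the low
+ * halfword of f0; r0 is a scratch GPR used to load the shift counts. */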
+#define MMI_SumWHorizon1(f0, f2, f4, f6, f8, f10, r0) \
+ "dli "#r0", 0x10 \n\t" \
+ "dmtc1 "#r0", "#f8" \n\t" \
+ "dli "#r0", 0x20 \n\t" \
+ "dmtc1 "#r0", "#f10" \n\t" \
+ "mov.d "#f4", "#f2" \n\t" \
+ "xor "#f6", "#f6", "#f6" \n\t" \
+ "paddush "#f0", "#f0", "#f4" \n\t" \
+ "paddush "#f2", "#f2", "#f6" \n\t" \
+ "dsrl "#f6", "#f2", "#f10" \n\t" \
+ "punpcklwd "#f4", "#f2", "#f2" \n\t" \
+ "punpckhwd "#f4", "#f0", "#f4" \n\t" \
+ "paddush "#f0", "#f0", "#f4" \n\t" \
+ "paddush "#f2", "#f2", "#f6" \n\t" \
+ "dsrl "#f4", "#f0", "#f8" \n\t" \
+ "pinsrh_3 "#f4", "#f4", "#f2" \n\t" \
+ "dsrl "#f6", "#f2", "#f8" \n\t" \
+ "paddush "#f0", "#f0", "#f4" \n\t" \
+ "paddush "#f2", "#f2", "#f6" \n\t"
+
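+/* Accumulates the SAD of four 8-pixel rows into the $f24/$f26 running sums:
+ * each pasubub/biadd pair collapses one row of absolute differences, with
+ * pointer advances interleaved to hide load latency ($8/$9 are scratch row
+ * pointers).  The _End variant below is identical except that it does not
+ * advance the sample pointers past the last row. */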
+#define MMI_GetSad8x4 \
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t" \
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t" \
+ "gsldlc1 $f4, 0x7($8) \n\t" \
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t" \
+ "gsldrc1 $f4, 0x0($8) \n\t" \
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t" \
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t" \
+ "gsldlc1 $f2, 0x7(%[pSample1]) \n\t" \
+ "gsldlc1 $f6, 0x7($8) \n\t" \
+ "gsldlc1 $f8, 0x7(%[pSample2]) \n\t" \
+ "gsldrc1 $f2, 0x0(%[pSample1]) \n\t" \
+ "gsldrc1 $f6, 0x0($8) \n\t" \
+ "gsldrc1 $f8, 0x0(%[pSample2]) \n\t" \
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t" \
+ "gsldlc1 $f12, 0x7($9) \n\t" \
+ "gsldlc1 $f10, 0x7(%[pSample2]) \n\t" \
+ "gsldrc1 $f12, 0x0($9) \n\t" \
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t" \
+ "gsldrc1 $f10, 0x0(%[pSample2]) \n\t" \
+ "gsldlc1 $f14, 0x7($9) \n\t" \
+ "gsldrc1 $f14, 0x0($9) \n\t" \
+ "pasubub $f0, $f0, $f8 \n\t" \
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t" \
+ "pasubub $f2, $f2, $f10 \n\t" \
+ "biadd $f0, $f0 \n\t" \
+ "biadd $f2, $f2 \n\t" \
+ "pasubub $f4, $f4, $f12 \n\t" \
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t" \
+ "pasubub $f6, $f6, $f14 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f24, $f24, $f0 \n\t" \
+ "paddh $f26, $f26, $f2 \n\t" \
+ "paddh $f24, $f24, $f4 \n\t" \
+ "paddh $f26, $f26, $f6 \n\t"
+
+#define MMI_GetSad8x4_End \
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t" \
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t" \
+ "gsldlc1 $f4, 0x7($8) \n\t" \
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t" \
+ "gsldrc1 $f4, 0x0($8) \n\t" \
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t" \
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t" \
+ "gsldlc1 $f2, 0x7(%[pSample1]) \n\t" \
+ "gsldlc1 $f6, 0x7($8) \n\t" \
+ "gsldlc1 $f8, 0x7(%[pSample2]) \n\t" \
+ "gsldrc1 $f2, 0x0(%[pSample1]) \n\t" \
+ "gsldrc1 $f6, 0x0($8) \n\t" \
+ "gsldrc1 $f8, 0x0(%[pSample2]) \n\t" \
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t" \
+ "gsldlc1 $f12, 0x7($9) \n\t" \
+ "gsldlc1 $f10, 0x7(%[pSample2]) \n\t" \
+ "gsldrc1 $f12, 0x0($9) \n\t" \
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t" \
+ "gsldrc1 $f10, 0x0(%[pSample2]) \n\t" \
+ "gsldlc1 $f14, 0x7($9) \n\t" \
+ "gsldrc1 $f14, 0x0($9) \n\t" \
+ "pasubub $f0, $f0, $f8 \n\t" \
+ "pasubub $f2, $f2, $f10 \n\t" \
+ "biadd $f0, $f0 \n\t" \
+ "biadd $f2, $f2 \n\t" \
+ "pasubub $f4, $f4, $f12 \n\t" \
+ "pasubub $f6, $f6, $f14 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f24, $f24, $f0 \n\t" \
+ "paddh $f26, $f26, $f2 \n\t" \
+ "paddh $f24, $f24, $f4 \n\t" \
+ "paddh $f26, $f26, $f6 \n\t"
+
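+/* Alignment probe: after the two instructions, r0 = (addr & 0x1f) - 0x1f,
+ * which is never positive, so the "blez" that consumes it in
+ * WelsSampleSad8x8_mmi is always taken and the cache-split path is in
+ * effect disabled; the width and cacheline arguments are currently unused. */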
+#define CACHE_SPLIT_CHECK(r0, width, cacheline) \
+ "and "#r0", "#r0", 0x1f \n\t" \
+ PTR_ADDIU ""#r0", "#r0", -0x1f \n\t"
+
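+/* The GetSad2x16/GetSad4x16 helpers accumulate two or four 16-pixel rows per
+ * expansion; the _Aligned variants replace the unaligned gsldlc1/gsldrc1
+ * load pairs with single 128-bit gslqc1 loads, and the _End variants omit
+ * the final pointer advance. */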
+#define MMI_GetSad2x16 \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f0, $f0, $f4 \n\t" \
+ "paddh $f2, $f2, $f6 \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f0, $f0, $f4 \n\t" \
+ "paddh $f2, $f2, $f6 \n\t"
+
+#define MMI_GetSad4x16 \
+ "gsldlc1 $f0, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f2, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f0, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f2, 0x8(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f0, $f0, $f8 \n\t" \
+ "pasubub $f2, $f2, $f10 \n\t" \
+ "biadd $f0, $f0 \n\t" \
+ "biadd $f2, $f2 \n\t" \
+ "paddh $f28, $f28, $f0 \n\t" \
+ "paddh $f30, $f30, $f2 \n\t" \
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t" \
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t" \
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t"
+
+#define MMI_GetSad4x16_Aligned \
+ "gslqc1 $f2, $f0, 0x0(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f0, $f0, $f8 \n\t" \
+ "pasubub $f2, $f2, $f10 \n\t" \
+ "biadd $f0, $f0 \n\t" \
+ "biadd $f2, $f2 \n\t" \
+ "paddh $f28, $f28, $f0 \n\t" \
+ "paddh $f30, $f30, $f2 \n\t" \
+ "gslqc1 $f6, $f4, 0x0(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t" \
+ "gslqc1 $f6, $f4, 0x0(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t" \
+ "gslqc1 $f6, $f4, 0x0(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t"
+
+#define MMI_GetSad4x16_End \
+ "gsldlc1 $f0, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f2, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f0, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f2, 0x8(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f0, $f0, $f8 \n\t" \
+ "pasubub $f2, $f2, $f10 \n\t" \
+ "biadd $f0, $f0 \n\t" \
+ "biadd $f2, $f2 \n\t" \
+ "paddh $f28, $f28, $f0 \n\t" \
+ "paddh $f30, $f30, $f2 \n\t" \
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t" \
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t" \
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t"
+
+#define MMI_GetSad4x16_Aligned_End \
+ "gslqc1 $f2, $f0, 0x0(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f0, $f0, $f8 \n\t" \
+ "pasubub $f2, $f2, $f10 \n\t" \
+ "biadd $f0, $f0 \n\t" \
+ "biadd $f2, $f2 \n\t" \
+ "paddh $f28, $f28, $f0 \n\t" \
+ "paddh $f30, $f30, $f2 \n\t" \
+ "gslqc1 $f6, $f4, 0x0(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t" \
+ "gslqc1 $f6, $f4, 0x0(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t" \
+ "gslqc1 $f6, $f4, 0x0(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t"
+
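+/* One step of the four-candidate SAD: f12/f14 hold the freshly loaded
+ * reference row, which is matched against neighbouring source rows for the
+ * vertical candidates (up in $f16/$f18, down in $f20/$f22) and then reloaded
+ * from r0 at byte offsets -1/+1 for the horizontal candidates (left in
+ * $f24/$f26, right in $f28/$f30). */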
+#define MMI_Get4LW16Sad(f0, f2, f4, f6, f8, f10, f12, f14, r0) \
+ "pasubub "#f0", "#f0", "#f12" \n\t" \
+ "pasubub "#f2", "#f2", "#f14" \n\t" \
+ "pasubub "#f12", "#f12", "#f8" \n\t" \
+ "pasubub "#f14", "#f14", "#f10" \n\t" \
+ "biadd "#f0", "#f0" \n\t" \
+ "biadd "#f2", "#f2" \n\t" \
+ "biadd "#f12", "#f12" \n\t" \
+ "biadd "#f14", "#f14" \n\t" \
+ "paddh $f20, $f20, "#f0" \n\t" \
+ "paddh $f22, $f22, "#f2" \n\t" \
+ "paddh $f16, $f16, "#f12" \n\t" \
+ "paddh $f18, $f18, "#f14" \n\t" \
+ "gsldlc1 "#f12", 0x6("#r0") \n\t" \
+ "gsldlc1 "#f14", 0xE("#r0") \n\t" \
+ "gsldrc1 "#f12", -0x1("#r0") \n\t" \
+ "gsldrc1 "#f14", 0x7("#r0") \n\t" \
+ "pasubub "#f12", "#f12", "#f4" \n\t" \
+ "pasubub "#f14", "#f14", "#f6" \n\t" \
+ "biadd "#f12", "#f12" \n\t" \
+ "biadd "#f14", "#f14" \n\t" \
+ "paddh $f24, $f24, "#f12" \n\t" \
+ "paddh $f26, $f26, "#f14" \n\t" \
+ "gsldlc1 "#f12", 0x8("#r0") \n\t" \
+ "gsldlc1 "#f14", 0x10("#r0") \n\t" \
+ "gsldrc1 "#f12", 0x1("#r0") \n\t" \
+ "gsldrc1 "#f14", 0x9("#r0") \n\t" \
+ "pasubub "#f12", "#f12", "#f4" \n\t" \
+ "pasubub "#f14", "#f14", "#f6" \n\t" \
+ "biadd "#f12", "#f12" \n\t" \
+ "biadd "#f14", "#f14" \n\t" \
+ "paddh $f28, $f28, "#f12" \n\t" \
+ "paddh $f30, $f30, "#f14" \n\t"
+
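+/* One 4x4 Hadamard butterfly stage applied to two packed 4x4 blocks at once,
+ * built from four MMI_SumSub passes with f16/f18 as scratch. */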
+#define MMI_HDMTwo4x4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
+ MMI_SumSub(f0, f2, f4, f6, f16, f18) \
+ MMI_SumSub(f8, f10, f12, f14, f16, f18) \
+ MMI_SumSub(f4, f6, f12, f14, f16, f18) \
+ MMI_SumSub(f0, f2, f8, f10, f16, f18)
+
+#define MMI_SumAbs4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22, f24, f26) \
+ WELS_AbsH(f0, f2, f0, f2, f8, f10) \
+ WELS_AbsH(f4, f6, f4, f6, f8, f10) \
+ WELS_AbsH(f12, f14, f12, f14, f20, f22) \
+ WELS_AbsH(f16, f18, f16, f18, f20, f22) \
+ "paddush "#f0", "#f0", "#f4" \n\t" \
+ "paddush "#f2", "#f2", "#f6" \n\t" \
+ "paddush "#f12", "#f12", "#f16" \n\t" \
+ "paddush "#f14", "#f14", "#f18" \n\t" \
+ "paddush "#f24", "#f24", "#f0" \n\t" \
+ "paddush "#f26", "#f26", "#f2" \n\t" \
+ "paddush "#f24", "#f24", "#f12" \n\t" \
+ "paddush "#f26", "#f26", "#f14" \n\t"
+
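+/* Reduces the packed 16-bit sums in f0/f2 to a single 32-bit total in the
+ * low word of f0: the halfwords are widened against the zero register f8,
+ * added as words, and folded once more with a pshufh whose control word is
+ * preloaded into f10. */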
+#define MMI_SumWHorizon(f0, f2, f4, f6, f8, f10) \
+ "paddh "#f0", "#f0", "#f2" \n\t" \
+ "punpckhhw "#f2", "#f0", "#f8" \n\t" \
+ "punpcklhw "#f0", "#f0", "#f8" \n\t" \
+ "paddw "#f0", "#f0", "#f2" \n\t" \
+ "pshufh "#f2", "#f0", "#f10" \n\t" \
+ "paddw "#f0", "#f0", "#f2" \n\t"
+
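+/* Loads one 8-pixel row from each image, widens the bytes to 16-bit lanes
+ * against the zero register f8, and leaves the signed difference in f0/f2.
+ * The interleaved PTR_ADDU lines precompute the next row pointers: _Stride0
+ * stores them in $11/$12, _Stride1 consumes them, and _Offset8 instead steps
+ * back to the saved block start plus eight pixels. */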
+#define MMI_LoadDiff8P_Offset_Stride0(f0, f2, f4, f6, f8, r0, r1) \
+ "gsldlc1 "#f0", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r1") \n\t" \
+ PTR_ADDU "$11, %[pSample1], %[iStride1] \n\t" \
+ "gsldrc1 "#f0", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r1") \n\t" \
+ PTR_ADDU "$12, %[pSample2], %[iStride2] \n\t" \
+ "punpckhbh "#f2", "#f0", "#f8" \n\t" \
+ "punpcklbh "#f0", "#f0", "#f8" \n\t" \
+ "punpckhbh "#f6", "#f4", "#f8" \n\t" \
+ "punpcklbh "#f4", "#f4", "#f8" \n\t" \
+ "psubh "#f0", "#f0", "#f4" \n\t" \
+ "psubh "#f2", "#f2", "#f6" \n\t"
+
+#define MMI_LoadDiff8P_Offset_Stride1(f0, f2, f4, f6, f8, r0, r1) \
+ "gsldlc1 "#f0", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r1") \n\t" \
+ PTR_ADDU "%[pSample1], $11, %[iStride1] \n\t" \
+ "gsldrc1 "#f0", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r1") \n\t" \
+ PTR_ADDU "%[pSample2], $12, %[iStride2] \n\t" \
+ "punpckhbh "#f2", "#f0", "#f8" \n\t" \
+ "punpcklbh "#f0", "#f0", "#f8" \n\t" \
+ "punpckhbh "#f6", "#f4", "#f8" \n\t" \
+ "punpcklbh "#f4", "#f4", "#f8" \n\t" \
+ "psubh "#f0", "#f0", "#f4" \n\t" \
+ "psubh "#f2", "#f2", "#f6" \n\t"
+
+#define MMI_LoadDiff8P_Offset8(f0, f2, f4, f6, f8, r0, r1) \
+ "gsldlc1 "#f0", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r1") \n\t" \
+ PTR_ADDU "%[pSample1], $9, 0x8 \n\t" \
+ "gsldrc1 "#f0", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r1") \n\t" \
+ PTR_ADDU "%[pSample2], $10, 0x8 \n\t" \
+ "punpckhbh "#f2", "#f0", "#f8" \n\t" \
+ "punpcklbh "#f0", "#f0", "#f8" \n\t" \
+ "punpckhbh "#f6", "#f4", "#f8" \n\t" \
+ "punpcklbh "#f4", "#f4", "#f8" \n\t" \
+ "psubh "#f0", "#f0", "#f4" \n\t" \
+ "psubh "#f2", "#f2", "#f6" \n\t"
+
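+/* Full 8x8 SATD: differences are loaded half a block at a time, run through
+ * a horizontal Hadamard stage, transposed with MMI_TransTwo4x4H, run through
+ * the vertical stage, and their absolute values gathered in $f24/$f26.
+ * _Offset8 restarts at the saved block start plus eight pixels (for 16-wide
+ * blocks) and _End performs no trailing pointer advance. */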
+#define MMI_GetSatd8x8 \
+ MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12) \
+ MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12) \
+ MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18) \
+ MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18) \
+ MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22) \
+ MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26) \
+ MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12) \
+ MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12) \
+ MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18) \
+ MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18) \
+ MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22) \
+ MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26)
+
+#define MMI_GetSatd8x8_Offset8 \
+ MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12) \
+ MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12) \
+ MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18) \
+ MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18) \
+ MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22) \
+ MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26) \
+ MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12) \
+ MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset8($f12, $f14, $f20, $f22, $f28, $11, $12) \
+ MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18) \
+ MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18) \
+ MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22) \
+ MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26)
+
+#define MMI_GetSatd8x8_End \
+ MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12) \
+ MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12) \
+ MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18) \
+ MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18) \
+ MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22) \
+ MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26) \
+ MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12) \
+ MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P($f12, $f14, $f20, $f22, $f28, $11, $12) \
+ MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18) \
+ MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18) \
+ MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22) \
+ MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26)
+
+int32_t WelsSampleSad16x16_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSadSum = 0;
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "and $8, %[pSample2], 0xF \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "xor $f30, $f30, $f30 \n\t"
+ "bnez $8, unaligned \n\t"
+ "aligned: \n\t"
+ MMI_GetSad4x16_Aligned
+ MMI_GetSad4x16_Aligned
+ MMI_GetSad4x16_Aligned
+ MMI_GetSad4x16_Aligned_End
+ "b out \n\t"
+
+ "unaligned: \n\t"
+ MMI_GetSad4x16
+ MMI_GetSad4x16
+ MMI_GetSad4x16
+ MMI_GetSad4x16_End
+ "out: \n\t"
+ "mov.d $f0, $f30 \n\t"
+ "paddh $f0, $f0, $f28 \n\t"
+ "dmfc1 %[iSadSum], $f0 \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+ return iSadSum;
+}
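+
+/* A minimal scalar reference for the SAD routines in this file (illustrative
+ * helper with the block size parameterized; the MMI versions hard-code it
+ * and vectorize the per-row reduction with pasubub/biadd). */
+static inline int32_t SampleSad_ref (uint8_t* pSample1, int32_t iStride1,
+                                     uint8_t* pSample2, int32_t iStride2,
+                                     int32_t iWidth, int32_t iHeight) {
+  int32_t iSum = 0, iDiff, x, y;
+  for (y = 0; y < iHeight; y++) {
+    for (x = 0; x < iWidth; x++) {
+      iDiff = pSample1[x] - pSample2[x];
+      iSum += iDiff >= 0 ? iDiff : -iDiff;   /* |a - b| per pixel */
+    }
+    pSample1 += iStride1;
+    pSample2 += iStride2;
+  }
+  return iSum;
+}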
+
+int32_t WelsSampleSad16x8_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSadSum = 0;
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "gsldlc1 $f0, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f2, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f2, 0x8(%[pSample2]) \n\t"
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t"
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
+ "pasubub $f0, $f0, $f8 \n\t"
+ "pasubub $f2, $f2, $f10 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f6, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f6, 0x8(%[pSample2]) \n\t"
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t"
+ "pasubub $f4, $f4, $f8 \n\t"
+ "pasubub $f6, $f6, $f10 \n\t"
+ "biadd $f4, $f4 \n\t"
+ "biadd $f6, $f6 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+
+ MMI_GetSad2x16
+ MMI_GetSad2x16
+ MMI_GetSad2x16
+
+ "paddh $f0, $f0, $f2 \n\t"
+ "dmfc1 %[iSadSum], $f0 \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26"
+ );
+ RECOVER_REG;
+ return iSadSum;
+}
+
+int32_t WelsSampleSad8x16_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSadSum = 0;
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ MMI_GetSad8x4
+ MMI_GetSad8x4
+ MMI_GetSad8x4
+ MMI_GetSad8x4_End
+ "paddh $f0, $f26, $f24 \n\t"
+ "dmfc1 %[iSadSum], $f0 \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26"
+ );
+ RECOVER_REG;
+ return iSadSum;
+}
+
+int32_t WelsSampleSad4x4_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSadSum = 0;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ "punpcklwd $f0, $f0, $f2 \n\t"
+
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ "gsldlc1 $f6, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f8, 0x7($9) \n\t"
+ "gsldrc1 $f6, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f8, 0x0($9) \n\t"
+ "punpcklwd $f6, $f6, $f8 \n\t"
+ "pasubub $f0, $f0, $f6 \n\t"
+ "biadd $f0, $f0 \n\t"
+
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ "gsldlc1 $f2, 0x7(%[pSample1]) \n\t"
+ "gsldlc1 $f4, 0x7($8) \n\t"
+ "gsldrc1 $f2, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f4, 0x0($8) \n\t"
+ "punpcklwd $f2, $f2, $f4 \n\t"
+
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ "gsldlc1 $f6, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f8, 0x7($9) \n\t"
+ "gsldrc1 $f6, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f8, 0x0($9) \n\t"
+ "punpcklwd $f6, $f6, $f8 \n\t"
+ "pasubub $f2, $f2, $f6 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f0, $f0, $f2 \n\t"
+
+ "dmfc1 %[iSadSum], $f0 \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8"
+ );
+ return iSadSum;
+}
+
+int32_t WelsSampleSad8x8_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSadSum = 0;
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+    "move $8, %[pSample2]                       \n\t"
+    CACHE_SPLIT_CHECK($8, 8, 32)
+ "blez $8, 1f \n\t"
+ "nop \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "xor $f30, $f30, $f30 \n\t"
+
+ "move $9, %[pSample2] \n\t"
+ "and $9, $9, 0x7 \n\t"
+ PTR_SUBU "%[pSample2], %[pSample2], $9 \n\t"
+ "dli $8, 0x8 \n\t"
+ PTR_SUBU "$8, $8, $9 \n\t"
+
+ "dsll $9, $9, 0x3 \n\t"
+ "dsll $8, $8, 0x3 \n\t"
+ "dmtc1 $9, $f20 \n\t"
+ "dmtc1 $8, $f24 \n\t"
+ "dli $9, 0x8 \n\t"
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ PTR_ADDU "$9, $9, %[pSample2] \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
+ "gsldlc1 $f2, 0x7(%[pSample1]) \n\t"
+
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f8, 0x7($9) \n\t"
+ "gsldrc1 $f2, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f8, 0x0($9) \n\t"
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ "gsldlc1 $f6, 0x7(%[pSample2]) \n\t"
+ PTR_ADDU "$9, $9, %[iStride2] \n\t"
+ "gsldrc1 $f6, 0x0(%[pSample2]) \n\t"
+ "gsldlc1 $f10, 0x7($9) \n\t"
+ "dsrl $f4, $f4, $f20 \n\t"
+ "gsldrc1 $f10, 0x0($9) \n\t"
+ "dsrl $f6, $f6, $f20 \n\t"
+ "dsll $f8, $f8, $f24 \n\t"
+ "dsll $f10, $f10, $f24 \n\t"
+ "or $f4, $f4, $f8 \n\t"
+ "or $f6, $f6, $f10 \n\t"
+
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ "pasubub $f0, $f0, $f4 \n\t"
+ "pasubub $f2, $f2, $f6 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f28, $f28, $f0 \n\t"
+ "paddh $f30, $f30, $f2 \n\t"
+
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ PTR_ADDU "$9, $9, %[iStride2] \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
+ "gsldlc1 $f2, 0x7(%[pSample1]) \n\t"
+
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f8, 0x7($9) \n\t"
+ "gsldrc1 $f2, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f8, 0x0($9) \n\t"
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ PTR_ADDU "$9, $9, %[iStride2] \n\t"
+ "gsldlc1 $f6, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f10, 0x7($9) \n\t"
+ "gsldrc1 $f6, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f10, 0x0($9) \n\t"
+ "dsrl $f4, $f4, $f20 \n\t"
+ "dsrl $f6, $f6, $f20 \n\t"
+ "dsll $f8, $f8, $f24 \n\t"
+ "dsll $f10, $f10, $f24 \n\t"
+ "or $f4, $f4, $f8 \n\t"
+ "or $f6, $f6, $f10 \n\t"
+
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ PTR_ADDU "$9, $9, %[iStride2] \n\t"
+
+ "pasubub $f0, $f0, $f4 \n\t"
+ "pasubub $f2, $f2, $f6 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f28, $f28, $f0 \n\t"
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "paddh $f30, $f30, $f2 \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
+ "gsldlc1 $f2, 0x7(%[pSample1]) \n\t"
+
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f8, 0x7($9) \n\t"
+ "gsldrc1 $f2, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f8, 0x0($9) \n\t"
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ PTR_ADDU "$9, $9, %[iStride2] \n\t"
+ "gsldlc1 $f6, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f10, 0x7($9) \n\t"
+ "gsldrc1 $f6, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f10, 0x0($9) \n\t"
+ "dsrl $f4, $f4, $f20 \n\t"
+ "dsrl $f6, $f6, $f20 \n\t"
+ "dsll $f8, $f8, $f24 \n\t"
+ "dsll $f10, $f10, $f24 \n\t"
+ "or $f4, $f4, $f8 \n\t"
+ "or $f6, $f6, $f10 \n\t"
+
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ PTR_ADDU "$9, $9, %[iStride2] \n\t"
+
+ "pasubub $f0, $f0, $f4 \n\t"
+ "pasubub $f2, $f2, $f6 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f28, $f28, $f0 \n\t"
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "paddh $f30, $f30, $f2 \n\t"
+
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
+ "gsldlc1 $f2, 0x7(%[pSample1]) \n\t"
+
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f8, 0x7($9) \n\t"
+ "gsldrc1 $f2, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f8, 0x0($9) \n\t"
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ PTR_ADDU "$9, $9, %[iStride2] \n\t"
+ "gsldlc1 $f6, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f10, 0x7($9) \n\t"
+ "gsldrc1 $f6, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f10, 0x0($9) \n\t"
+ "dsrl $f4, $f4, $f20 \n\t"
+ "dsrl $f6, $f6, $f20 \n\t"
+ "dsll $f8, $f8, $f24 \n\t"
+ "dsll $f10, $f10, $f24 \n\t"
+ "or $f4, $f4, $f8 \n\t"
+ "or $f6, $f6, $f10 \n\t"
+
+ "pasubub $f0, $f0, $f4 \n\t"
+ "pasubub $f2, $f2, $f6 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f28, $f28, $f0 \n\t"
+ "paddh $f30, $f30, $f2 \n\t"
+
+ "mov.d $f0, $f30 \n\t"
+ "paddh $f0, $f0, $f28 \n\t"
+ "dmfc1 %[iSadSum], $f0 \n\t"
+ "j 2f \n\t"
+ "nop \n\t"
+
+ "1: \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ MMI_GetSad8x4
+ MMI_GetSad8x4_End
+ "paddh $f0, $f26, $f24 \n\t"
+ "dmfc1 %[iSadSum], $f0 \n\t"
+ "2: \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+ return iSadSum;
+}
+
+int32_t WelsSampleSatd4x4_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSatdSum = 0;
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "gsldlc1 $f4, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f4, 0x0($8) \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ "gsldlc1 $f8, 0x7(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7($8) \n\t"
+ "gsldrc1 $f8, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f12, 0x0($8) \n\t"
+ "punpcklwd $f0, $f0, $f8 \n\t"
+ "punpcklwd $f4, $f4, $f12 \n\t"
+
+ PTR_ADDU "$8, %[pSample2], %[iStride2] \n\t"
+ "gsldlc1 $f16, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f20, 0x7($8) \n\t"
+ "gsldrc1 $f16, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f20, 0x0($8) \n\t"
+ PTR_ADDU "%[pSample2], $8, %[iStride2] \n\t"
+ PTR_ADDU "$8, %[pSample2], %[iStride2] \n\t"
+ "gsldlc1 $f24, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f28, 0x7($8) \n\t"
+ "gsldrc1 $f24, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f28, 0x0($8) \n\t"
+ "punpcklwd $f16, $f16, $f24 \n\t"
+ "punpcklwd $f20, $f20, $f28 \n\t"
+
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "punpckhbh $f2, $f0, $f24 \n\t"
+ "punpcklbh $f0, $f0, $f24 \n\t"
+ "punpckhbh $f6, $f4, $f24 \n\t"
+ "punpcklbh $f4, $f4, $f24 \n\t"
+ "punpckhbh $f18, $f16, $f24 \n\t"
+ "punpcklbh $f16, $f16, $f24 \n\t"
+ "punpckhbh $f22, $f20, $f24 \n\t"
+ "punpcklbh $f20, $f20, $f24 \n\t"
+
+ "psubh $f0, $f0, $f16 \n\t"
+ "psubh $f2, $f2, $f18 \n\t"
+ "psubh $f4, $f4, $f20 \n\t"
+ "psubh $f6, $f6, $f22 \n\t"
+
+ "mov.d $f8, $f0 \n\t"
+ "mov.d $f10, $f2 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+ "psubh $f8, $f8, $f4 \n\t"
+ "psubh $f10, $f10, $f6 \n\t"
+ MMI_XSawp_DQ($f0, $f2, $f8, $f10, $f12, $f14)
+
+ "mov.d $f16, $f0 \n\t"
+ "mov.d $f18, $f2 \n\t"
+ "paddh $f0, $f0, $f12 \n\t"
+ "paddh $f2, $f2, $f14 \n\t"
+ "psubh $f16, $f16, $f12 \n\t"
+ "psubh $f18, $f18, $f14 \n\t"
+
+ "mov.d $f8, $f2 \n\t"
+ "punpckhhw $f2, $f0, $f16 \n\t"
+ "punpcklhw $f0, $f0, $f16 \n\t"
+ "punpcklhw $f16, $f18, $f8 \n\t"
+ "punpckhhw $f18, $f18, $f8 \n\t"
+
+ MMI_XSawp_WD($f0, $f2, $f16, $f18, $f12, $f14)
+ MMI_XSawp_DQ($f0, $f2, $f12, $f14, $f20, $f22)
+
+ "mov.d $f28, $f0 \n\t"
+ "mov.d $f30, $f2 \n\t"
+ "paddh $f0, $f0, $f20 \n\t"
+ "paddh $f2, $f2, $f22 \n\t"
+ "psubh $f28, $f28, $f20 \n\t"
+ "psubh $f30, $f30, $f22 \n\t"
+
+ MMI_XSawp_DQ($f0, $f2, $f28, $f30, $f4, $f6)
+
+ "psubh $f8, $f0, $f4 \n\t"
+ "psubh $f10, $f2, $f6 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+
+ WELS_AbsH($f0, $f2, $f0, $f2, $f12, $f14)
+ "paddush $f24, $f24, $f0 \n\t"
+ "paddush $f26, $f26, $f2 \n\t"
+ WELS_AbsH($f8, $f10, $f8, $f10, $f16, $f18)
+ "paddush $f24, $f24, $f8 \n\t"
+ "paddush $f26, $f26, $f10 \n\t"
+ MMI_SumWHorizon1($f24, $f26, $f16, $f18, $f28, $f30, $8)
+
+ "dmfc1 $8, $f24 \n\t"
+ "dli $9, 0xffff \n\t"
+ "and $8, $8, $9 \n\t"
+ "dsrl %[iSatdSum], $8, 0x1 \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+ return iSatdSum;
+}
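+
+/* Scalar sketch of the 4x4 SATD computed above (illustrative helpers): form
+ * the pixel differences, apply a 4x4 Hadamard transform along rows and then
+ * columns, and halve the sum of absolute coefficients, mirroring the final
+ * "dsrl ..., 0x1" in the MMI code. */
+static inline int32_t Abs_ref (int32_t iX) {
+  return iX >= 0 ? iX : -iX;
+}
+static inline int32_t SampleSatd4x4_ref (uint8_t* pSample1, int32_t iStride1,
+                                         uint8_t* pSample2, int32_t iStride2) {
+  int32_t iDiff[16], iMid[16], s01, d01, s23, d23, iSum = 0, i;
+  for (i = 0; i < 16; i++)
+    iDiff[i] = pSample1[(i >> 2) * iStride1 + (i & 3)] -
+               pSample2[(i >> 2) * iStride2 + (i & 3)];
+  for (i = 0; i < 4; i++) {                  /* horizontal butterflies */
+    s01 = iDiff[i * 4]     + iDiff[i * 4 + 1];
+    d01 = iDiff[i * 4]     - iDiff[i * 4 + 1];
+    s23 = iDiff[i * 4 + 2] + iDiff[i * 4 + 3];
+    d23 = iDiff[i * 4 + 2] - iDiff[i * 4 + 3];
+    iMid[i * 4]     = s01 + s23;
+    iMid[i * 4 + 1] = s01 - s23;
+    iMid[i * 4 + 2] = d01 + d23;
+    iMid[i * 4 + 3] = d01 - d23;
+  }
+  for (i = 0; i < 4; i++) {                  /* vertical butterflies */
+    s01 = iMid[i]     + iMid[i + 4];
+    d01 = iMid[i]     - iMid[i + 4];
+    s23 = iMid[i + 8] + iMid[i + 12];
+    d23 = iMid[i + 8] - iMid[i + 12];
+    iSum += Abs_ref (s01 + s23) + Abs_ref (s01 - s23) +
+            Abs_ref (d01 + d23) + Abs_ref (d01 - d23);
+  }
+  return iSum >> 1;
+}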
+
+int32_t WelsSampleSatd8x8_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSatdSum = 0;
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "dli $8, 0x1 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ MMI_GetSatd8x8_End
+ "psrlh $f24, $f24, $f30 \n\t"
+ "dli $8, 0x4e \n\t"
+ "psrlh $f26, $f26, $f30 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f30)
+ "mfc1 %[iSatdSum], $f24 \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$8", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+ "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+ return iSatdSum;
+}
+
+int32_t WelsSampleSatd8x16_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSatdSum = 0;
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "dli $8, 0x1 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ MMI_GetSatd8x8
+ MMI_GetSatd8x8_End
+ "psrlh $f24, $f24, $f30 \n\t"
+ "dli $8, 0x4e \n\t"
+ "psrlh $f26, $f26, $f30 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f30)
+ "mfc1 %[iSatdSum], $f24 \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$8", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+ "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+ return iSatdSum;
+}
+
+int32_t WelsSampleSatd16x8_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSatdSum = 0;
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "dli $8, 0x1 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ "move $9, %[pSample1] \n\t"
+ "move $10, %[pSample2] \n\t"
+ MMI_GetSatd8x8_Offset8
+
+ MMI_GetSatd8x8_End
+ "psrlh $f24, $f24, $f30 \n\t"
+ "dli $8, 0x4e \n\t"
+ "psrlh $f26, $f26, $f30 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f30)
+ "mfc1 %[iSatdSum], $f24 \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6",
+ "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
+ "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+ return iSatdSum;
+}
+
+int32_t WelsSampleSatd16x16_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSatdSum = 0;
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "dli $8, 0x1 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ "move $9, %[pSample1] \n\t"
+ "move $10, %[pSample2] \n\t"
+
+ MMI_GetSatd8x8
+ MMI_GetSatd8x8_Offset8
+
+ MMI_GetSatd8x8
+ MMI_GetSatd8x8_End
+
+ "dli $8, 0x4e \n\t"
+ "psrlh $f24, $f24, $f30 \n\t"
+ "dmtc1 $8, $f0 \n\t"
+ "psrlh $f26, $f26, $f30 \n\t"
+ MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f0)
+ "mfc1 %[iSatdSum], $f24 \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6",
+ "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
+ "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+ return iSatdSum;
+}
+
+void WelsSampleSadFour16x16_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2,
+ int32_t iStride2, int32_t* pSad) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f16, $f16, $f16 \n\t"
+ "xor $f18, $f18, $f18 \n\t"
+ "xor $f20, $f20, $f20 \n\t"
+ "xor $f22, $f22, $f22 \n\t"
+ PTR_SUBU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "xor $f30, $f30, $f30 \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f12 \n\t"
+ "paddh $f18, $f18, $f14 \n\t"
+
+ "gslqc1 $f6, $f4, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ "pasubub $f12, $f12, $f4 \n\t"
+ "pasubub $f14, $f14, $f6 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f12 \n\t"
+ "paddh $f18, $f18, $f14 \n\t"
+
+ "gsldlc1 $f8, 0x6(%[pSample2]) \n\t"
+ "gsldlc1 $f10, 0xE(%[pSample2]) \n\t"
+ "gsldrc1 $f8, -0x1(%[pSample2]) \n\t"
+ "gsldrc1 $f10, 0x7(%[pSample2]) \n\t"
+ "pasubub $f8, $f8, $f0 \n\t"
+ "pasubub $f10, $f10, $f2 \n\t"
+ "biadd $f8, $f8 \n\t"
+ "biadd $f10, $f10 \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+
+ "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0x10(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x9(%[pSample2]) \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+
+ "gslqc1 $f10, $f8, 0x0($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9)
+ "gslqc1 $f2, $f0, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2])
+ "gslqc1 $f6, $f4, 0x0($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, $9)
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, %[pSample2])
+ "gslqc1 $f2, $f0, 0x0($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, $9)
+ "gslqc1 $f6, $f4, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, %[pSample2])
+
+ "gslqc1 $f10, $f8, 0x0($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9)
+ "gslqc1 $f2, $f0, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2])
+ "gslqc1 $f6, $f4, 0x0($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, $9)
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, %[pSample2])
+ "gslqc1 $f2, $f0, 0x0($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, $9)
+ "gslqc1 $f6, $f4, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, %[pSample2])
+
+ "gslqc1 $f10, $f8, 0x0($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9)
+ "gslqc1 $f2, $f0, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2])
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ "pasubub $f8, $f8, $f12 \n\t"
+ "pasubub $f10, $f10, $f14 \n\t"
+ "biadd $f8, $f8 \n\t"
+ "biadd $f10, $f10 \n\t"
+ "paddh $f20, $f20, $f8 \n\t"
+ "paddh $f22, $f22, $f10 \n\t"
+
+ "gsldlc1 $f8, 0x6($9) \n\t"
+ "gsldlc1 $f10, 0xE($9) \n\t"
+ "gsldrc1 $f8, -0x1($9) \n\t"
+ "gsldrc1 $f10, 0x7($9) \n\t"
+ "pasubub $f8, $f8, $f0 \n\t"
+ "pasubub $f10, $f10, $f2 \n\t"
+ "biadd $f8, $f8 \n\t"
+ "biadd $f10, $f10 \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+
+ "gsldlc1 $f12, 0x8($9) \n\t"
+ "gsldlc1 $f14, 0x10($9) \n\t"
+ "gsldrc1 $f12, 0x1($9) \n\t"
+ "gsldrc1 $f14, 0x9($9) \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ "pasubub $f0, $f0, $f12 \n\t"
+ "pasubub $f2, $f2, $f14 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f20, $f20, $f0 \n\t"
+ "paddh $f22, $f22, $f2 \n\t"
+
+ "paddh $f16, $f16, $f18 \n\t"
+ "paddh $f20, $f20, $f22 \n\t"
+ "paddh $f24, $f24, $f26 \n\t"
+ "paddh $f28, $f28, $f30 \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "punpcklwd $f24, $f24, $f28 \n\t"
+ "gssqc1 $f24, $f16, 0x0(%[pSad]) \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
+ [pSad]"r"((int *)pSad)
+ : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
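+
+/* The four words packed by the final punpcklwd/gssqc1 above are the SADs of
+ * the reference block shifted one line up, one line down, one pixel left and
+ * one pixel right -- a scalar sketch in terms of the SampleSad_ref() helper
+ * above (16x16 shown; the other SadFour variants differ only in block
+ * size): */
+static inline void SampleSadFour16x16_ref (uint8_t* pSample1, int32_t iStride1,
+                                           uint8_t* pSample2, int32_t iStride2,
+                                           int32_t* pSad) {
+  pSad[0] = SampleSad_ref (pSample1, iStride1, pSample2 - iStride2, iStride2, 16, 16);
+  pSad[1] = SampleSad_ref (pSample1, iStride1, pSample2 + iStride2, iStride2, 16, 16);
+  pSad[2] = SampleSad_ref (pSample1, iStride1, pSample2 - 1, iStride2, 16, 16);
+  pSad[3] = SampleSad_ref (pSample1, iStride1, pSample2 + 1, iStride2, 16, 16);
+}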
+
+void WelsSampleSadFour16x8_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2,
+ int32_t iStride2, int32_t* pSad) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f16, $f16, $f16 \n\t"
+ "xor $f18, $f18, $f18 \n\t"
+ "xor $f20, $f20, $f20 \n\t"
+ "xor $f22, $f22, $f22 \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pSample1]) \n\t"
+ PTR_SUBU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "xor $f30, $f30, $f30 \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f12 \n\t"
+ "paddh $f18, $f18, $f14 \n\t"
+
+ "gslqc1 $f6, $f4, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ "pasubub $f12, $f12, $f4 \n\t"
+ "pasubub $f14, $f14, $f6 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f12 \n\t"
+ "paddh $f18, $f18, $f14 \n\t"
+
+ "gsldlc1 $f8, 0x6(%[pSample2]) \n\t"
+ "gsldlc1 $f10, 0xE(%[pSample2]) \n\t"
+ "gsldrc1 $f8, -0x1(%[pSample2]) \n\t"
+ "gsldrc1 $f10, 0x7(%[pSample2]) \n\t"
+ "pasubub $f8, $f8, $f0 \n\t"
+ "pasubub $f10, $f10, $f2 \n\t"
+ "biadd $f8, $f8 \n\t"
+ "biadd $f10, $f10 \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+
+ "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0x10(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x9(%[pSample2]) \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+
+ "gslqc1 $f10, $f8, 0x0($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9)
+ "gslqc1 $f2, $f0, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2])
+ "gslqc1 $f6, $f4, 0x0($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, $9)
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, %[pSample2])
+ "gslqc1 $f2, $f0, 0x0($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, $9)
+ "gslqc1 $f6, $f4, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, %[pSample2])
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ "pasubub $f0, $f0, $f12 \n\t"
+ "pasubub $f2, $f2, $f14 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f20, $f20, $f0 \n\t"
+ "paddh $f22, $f22, $f2 \n\t"
+
+ "gsldlc1 $f0, 0x6($9) \n\t"
+ "gsldlc1 $f2, 0xE($9) \n\t"
+ "gsldrc1 $f0, -0x1($9) \n\t"
+ "gsldrc1 $f2, 0x7($9) \n\t"
+ "pasubub $f0, $f0, $f4 \n\t"
+ "pasubub $f2, $f2, $f6 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f24, $f24, $f0 \n\t"
+ "paddh $f26, $f26, $f2 \n\t"
+
+ "gsldlc1 $f12, 0x8($9) \n\t"
+ "gsldlc1 $f14, 0x10($9) \n\t"
+ "gsldrc1 $f12, 0x1($9) \n\t"
+ "gsldrc1 $f14, 0x9($9) \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ "pasubub $f12, $f12, $f4 \n\t"
+ "pasubub $f14, $f14, $f6 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ "pasubub $f4, $f4, $f12 \n\t"
+ "pasubub $f6, $f6, $f14 \n\t"
+ "biadd $f4, $f4 \n\t"
+ "biadd $f6, $f6 \n\t"
+ "paddh $f20, $f20, $f4 \n\t"
+ "paddh $f22, $f22, $f6 \n\t"
+
+ "paddh $f16, $f16, $f18 \n\t"
+ "paddh $f20, $f20, $f22 \n\t"
+ "paddh $f24, $f24, $f26 \n\t"
+ "paddh $f28, $f28, $f30 \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "punpcklwd $f24, $f24, $f28 \n\t"
+ "gssqc1 $f24, $f16, 0x0(%[pSad]) \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
+ [pSad]"r"((int *)pSad)
+ : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+    "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+void WelsSampleSadFour8x16_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2,
+ int32_t iStride2, int32_t* pSad) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f16, $f16, $f16 \n\t"
+ "xor $f18, $f18, $f18 \n\t"
+ "xor $f20, $f20, $f20 \n\t"
+ "xor $f22, $f22, $f22 \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "xor $f30, $f30, $f30 \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ PTR_SUBU "$9, %[pSample2], %[iStride2] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f12 \n\t"
+ "paddh $f18, $f18, $f14 \n\t"
+
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
+ "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
+ "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
+
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ "gsldlc1 $f6, 0x6($9) \n\t"
+ "gsldlc1 $f14, 0x8($9) \n\t"
+ "gsldrc1 $f6, -0x1($9) \n\t"
+ "gsldrc1 $f14, 0x1($9) \n\t"
+ "pasubub $f4, $f4, $f0 \n\t"
+ "pasubub $f6, $f6, $f2 \n\t"
+ "biadd $f4, $f4 \n\t"
+ "biadd $f6, $f6 \n\t"
+ "paddh $f24, $f24, $f4 \n\t"
+ "paddh $f26, $f26, $f6 \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
+ "pasubub $f0, $f0, $f12 \n\t"
+ "pasubub $f2, $f2, $f14 \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f20, $f20, $f0 \n\t"
+ "paddh $f22, $f22, $f2 \n\t"
+
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f12 \n\t"
+ "paddh $f18, $f18, $f14 \n\t"
+
+ "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
+ "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
+
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ "gsldlc1 $f6, 0x6($9) \n\t"
+ "gsldlc1 $f14, 0x8($9) \n\t"
+ "gsldrc1 $f6, -0x1($9) \n\t"
+ "gsldrc1 $f14, 0x1($9) \n\t"
+
+ "pasubub $f4, $f4, $f0 \n\t"
+ "pasubub $f6, $f6, $f2 \n\t"
+ "biadd $f4, $f4 \n\t"
+ "biadd $f6, $f6 \n\t"
+ "paddh $f24, $f24, $f4 \n\t"
+ "paddh $f26, $f26, $f6 \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
+ "pasubub $f0, $f0, $f12 \n\t"
+ "pasubub $f2, $f2, $f14 \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f20, $f20, $f0 \n\t"
+ "paddh $f22, $f22, $f2 \n\t"
+
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f12 \n\t"
+ "paddh $f18, $f18, $f14 \n\t"
+
+ "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
+ "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
+
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ "gsldlc1 $f6, 0x6($9) \n\t"
+ "gsldlc1 $f14, 0x8($9) \n\t"
+ "gsldrc1 $f6, -0x1($9) \n\t"
+ "gsldrc1 $f14, 0x1($9) \n\t"
+
+ "pasubub $f4, $f4, $f0 \n\t"
+ "pasubub $f6, $f6, $f2 \n\t"
+ "biadd $f4, $f4 \n\t"
+ "biadd $f6, $f6 \n\t"
+ "paddh $f24, $f24, $f4 \n\t"
+ "paddh $f26, $f26, $f6 \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
+ "pasubub $f0, $f0, $f12 \n\t"
+ "pasubub $f2, $f2, $f14 \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f20, $f20, $f0 \n\t"
+ "paddh $f22, $f22, $f2 \n\t"
+
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f12 \n\t"
+ "paddh $f18, $f18, $f14 \n\t"
+
+ "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
+ "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
+
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ "gsldlc1 $f6, 0x6($9) \n\t"
+ "gsldlc1 $f14, 0x8($9) \n\t"
+ "gsldrc1 $f6, -0x1($9) \n\t"
+ "gsldrc1 $f14, 0x1($9) \n\t"
+ "pasubub $f4, $f4, $f0 \n\t"
+ "pasubub $f6, $f6, $f2 \n\t"
+ "biadd $f4, $f4 \n\t"
+ "biadd $f6, $f6 \n\t"
+ "paddh $f24, $f24, $f4 \n\t"
+ "paddh $f26, $f26, $f6 \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
+ "pasubub $f0, $f0, $f12 \n\t"
+ "pasubub $f2, $f2, $f14 \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f20, $f20, $f0 \n\t"
+ "paddh $f22, $f22, $f2 \n\t"
+
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f12 \n\t"
+ "paddh $f18, $f18, $f14 \n\t"
+
+ "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
+ "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
+
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ "gsldlc1 $f6, 0x6($9) \n\t"
+ "gsldlc1 $f14, 0x8($9) \n\t"
+ "gsldrc1 $f6, -0x1($9) \n\t"
+ "gsldrc1 $f14, 0x1($9) \n\t"
+
+ "pasubub $f4, $f4, $f0 \n\t"
+ "pasubub $f6, $f6, $f2 \n\t"
+ "biadd $f4, $f4 \n\t"
+ "biadd $f6, $f6 \n\t"
+ "paddh $f24, $f24, $f4 \n\t"
+ "paddh $f26, $f26, $f6 \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
+ "pasubub $f0, $f0, $f12 \n\t"
+ "pasubub $f2, $f2, $f14 \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f20, $f20, $f0 \n\t"
+ "paddh $f22, $f22, $f2 \n\t"
+
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f12 \n\t"
+ "paddh $f18, $f18, $f14 \n\t"
+
+ "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
+ "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
+
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ "gsldlc1 $f6, 0x6($9) \n\t"
+ "gsldlc1 $f14, 0x8($9) \n\t"
+ "gsldrc1 $f6, -0x1($9) \n\t"
+ "gsldrc1 $f14, 0x1($9) \n\t"
+
+ "pasubub $f4, $f4, $f0 \n\t"
+ "pasubub $f6, $f6, $f2 \n\t"
+ "biadd $f4, $f4 \n\t"
+ "biadd $f6, $f6 \n\t"
+ "paddh $f24, $f24, $f4 \n\t"
+ "paddh $f26, $f26, $f6 \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
+ "pasubub $f0, $f0, $f12 \n\t"
+ "pasubub $f2, $f2, $f14 \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f20, $f20, $f0 \n\t"
+ "paddh $f22, $f22, $f2 \n\t"
+
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f12 \n\t"
+ "paddh $f18, $f18, $f14 \n\t"
+
+ "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
+ "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
+
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ "gsldlc1 $f6, 0x6($9) \n\t"
+ "gsldlc1 $f14, 0x8($9) \n\t"
+ "gsldrc1 $f6, -0x1($9) \n\t"
+ "gsldrc1 $f14, 0x1($9) \n\t"
+
+ "pasubub $f4, $f4, $f0 \n\t"
+ "pasubub $f6, $f6, $f2 \n\t"
+ "biadd $f4, $f4 \n\t"
+ "biadd $f6, $f6 \n\t"
+ "paddh $f24, $f24, $f4 \n\t"
+ "paddh $f26, $f26, $f6 \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
+ "pasubub $f0, $f0, $f12 \n\t"
+ "pasubub $f2, $f2, $f14 \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f20, $f20, $f0 \n\t"
+ "paddh $f22, $f22, $f2 \n\t"
+
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f12 \n\t"
+ "paddh $f18, $f18, $f14 \n\t"
+
+ "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
+ "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
+
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ "gsldlc1 $f6, 0x6($9) \n\t"
+ "gsldlc1 $f14, 0x8($9) \n\t"
+ "gsldrc1 $f6, -0x1($9) \n\t"
+ "gsldrc1 $f14, 0x1($9) \n\t"
+
+ "pasubub $f4, $f4, $f0 \n\t"
+ "pasubub $f6, $f6, $f2 \n\t"
+ "biadd $f4, $f4 \n\t"
+ "biadd $f6, $f6 \n\t"
+ "paddh $f24, $f24, $f4 \n\t"
+ "paddh $f26, $f26, $f6 \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
+ "pasubub $f0, $f0, $f12 \n\t"
+ "pasubub $f2, $f2, $f14 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f20, $f20, $f0 \n\t"
+ "paddh $f22, $f22, $f2 \n\t"
+
+ "paddh $f16, $f16, $f18 \n\t"
+ "paddh $f20, $f20, $f22 \n\t"
+ "paddh $f24, $f24, $f26 \n\t"
+ "paddh $f28, $f28, $f30 \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "punpcklwd $f24, $f24, $f28 \n\t"
+ "gssqc1 $f24, $f16, 0x0(%[pSad]) \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
+ [pSad]"r"((int *)pSad)
+ : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+void WelsSampleSadFour8x8_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2,
+ int32_t iStride2, int32_t* pSad) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f16, $f16, $f16 \n\t"
+ "xor $f18, $f18, $f18 \n\t"
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "xor $f20, $f20, $f20 \n\t"
+ "xor $f22, $f22, $f22 \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ PTR_SUBU "$9, %[pSample2], %[iStride2] \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "xor $f30, $f30, $f30 \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f12 \n\t"
+ "paddh $f18, $f18, $f14 \n\t"
+
+ "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
+ "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
+
+ "gsldlc1 $f6, 0x6($9) \n\t"
+ "gsldlc1 $f14, 0x8($9) \n\t"
+ "gsldrc1 $f6, -0x1($9) \n\t"
+ "gsldrc1 $f14, 0x1($9) \n\t"
+ "pasubub $f4, $f4, $f0 \n\t"
+ "pasubub $f6, $f6, $f2 \n\t"
+ "biadd $f4, $f4 \n\t"
+ "biadd $f6, $f6 \n\t"
+ "paddh $f24, $f24, $f4 \n\t"
+ "paddh $f26, $f26, $f6 \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
+ "pasubub $f0, $f0, $f12 \n\t"
+ "pasubub $f2, $f2, $f14 \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f20, $f20, $f0 \n\t"
+ "paddh $f22, $f22, $f2 \n\t"
+
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f12 \n\t"
+ "paddh $f18, $f18, $f14 \n\t"
+
+ "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
+ "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
+
+ "gsldlc1 $f6, 0x6($9) \n\t"
+ "gsldlc1 $f14, 0x8($9) \n\t"
+ "gsldrc1 $f6, -0x1($9) \n\t"
+ "gsldrc1 $f14, 0x1($9) \n\t"
+
+ "pasubub $f4, $f4, $f0 \n\t"
+ "pasubub $f6, $f6, $f2 \n\t"
+ "biadd $f4, $f4 \n\t"
+ "biadd $f6, $f6 \n\t"
+ "paddh $f24, $f24, $f4 \n\t"
+ "paddh $f26, $f26, $f6 \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
+ "pasubub $f0, $f0, $f12 \n\t"
+ "pasubub $f2, $f2, $f14 \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f20, $f20, $f0 \n\t"
+ "paddh $f22, $f22, $f2 \n\t"
+
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f12 \n\t"
+ "paddh $f18, $f18, $f14 \n\t"
+
+ "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
+ "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
+
+ "gsldlc1 $f6, 0x6($9) \n\t"
+ "gsldlc1 $f14, 0x8($9) \n\t"
+ "gsldrc1 $f6, -0x1($9) \n\t"
+ "gsldrc1 $f14, 0x1($9) \n\t"
+
+ "pasubub $f4, $f4, $f0 \n\t"
+ "pasubub $f6, $f6, $f2 \n\t"
+ "biadd $f4, $f4 \n\t"
+ "biadd $f6, $f6 \n\t"
+ "paddh $f24, $f24, $f4 \n\t"
+ "paddh $f26, $f26, $f6 \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
+ "pasubub $f0, $f0, $f12 \n\t"
+ "pasubub $f2, $f2, $f14 \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f20, $f20, $f0 \n\t"
+ "paddh $f22, $f22, $f2 \n\t"
+
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f12 \n\t"
+ "paddh $f18, $f18, $f14 \n\t"
+
+ "gsldlc1 $f4, 0x6(%[pSample2]) \n\t"
+ "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ "gsldrc1 $f4, -0x1(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
+
+ "gsldlc1 $f6, 0x6($9) \n\t"
+ "gsldlc1 $f14, 0x8($9) \n\t"
+ "gsldrc1 $f6, -0x1($9) \n\t"
+ "gsldrc1 $f14, 0x1($9) \n\t"
+
+ "pasubub $f4, $f4, $f0 \n\t"
+ "pasubub $f6, $f6, $f2 \n\t"
+ "biadd $f4, $f4 \n\t"
+ "biadd $f6, $f6 \n\t"
+ "paddh $f24, $f24, $f4 \n\t"
+ "paddh $f26, $f26, $f6 \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0x7(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSample2]) \n\t"
+ "pasubub $f0, $f0, $f12 \n\t"
+ "pasubub $f2, $f2, $f14 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f20, $f20, $f0 \n\t"
+ "paddh $f22, $f22, $f2 \n\t"
+
+ "paddh $f16, $f16, $f18 \n\t"
+ "paddh $f20, $f20, $f22 \n\t"
+ "paddh $f24, $f24, $f26 \n\t"
+ "paddh $f28, $f28, $f30 \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "punpcklwd $f24, $f24, $f28 \n\t"
+ "gssqc1 $f24, $f16, 0x0(%[pSad]) \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
+ [pSad]"r"((int *)pSad)
+ : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28","$f30"
+ );
+ RECOVER_REG;
+}
--- a/codec/common/src/cpu.cpp
+++ b/codec/common/src/cpu.cpp
@@ -307,7 +307,17 @@
WELS_CPU_NEON;
}
-#else /* Neither X86_ASM, HAVE_NEON nor HAVE_NEON_AARCH64 */
+#elif defined(mips)
+/* for loongson */
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
+#if defined(HAVE_MMI)
+ return WELS_CPU_MMI;
+#else
+ return 0;
+#endif
+}
+
+#else /* Neither X86_ASM, HAVE_NEON, HAVE_NEON_AARCH64 nor mips */
uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
return 0;
--- a/codec/common/src/deblocking_common.cpp
+++ b/codec/common/src/deblocking_common.cpp
@@ -274,3 +274,22 @@
#endif
+#ifdef HAVE_MMI
+extern "C" {
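+ // The horizontal (H) luma deblocking filters reuse the vertical (V) MMI
+ // kernels: transpose the pixels around the edge into a 16-byte-aligned
+ // scratch buffer, filter it as a vertical edge, then transpose back.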
+ void DeblockLumaLt4H_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, uiBuf, 16 * 8, 16);
+
+ DeblockLumaTransposeH2V_mmi (pPixY - 4, iStride, &uiBuf[0]);
+ DeblockLumaLt4V_mmi (&uiBuf[4 * 16], 16, iAlpha, iBeta, pTc);
+ DeblockLumaTransposeV2H_mmi (pPixY - 4, iStride, &uiBuf[0]);
+ }
+
+ void DeblockLumaEq4H_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, uiBuf, 16 * 8, 16);
+
+ DeblockLumaTransposeH2V_mmi (pPixY - 4, iStride, &uiBuf[0]);
+ DeblockLumaEq4V_mmi (&uiBuf[4 * 16], 16, iAlpha, iBeta);
+ DeblockLumaTransposeV2H_mmi (pPixY - 4, iStride, &uiBuf[0]);
+ }
+}
+#endif//HAVE_MMI
--- a/codec/common/src/expand_pic.cpp
+++ b/codec/common/src/expand_pic.cpp
@@ -140,6 +140,13 @@
pExpandPicFunc->pfExpandChromaPicture[1] = ExpandPictureChroma_AArch64_neon;
}
#endif//HAVE_NEON_AARCH64
+#if defined(HAVE_MMI)
+ if (kuiCPUFlag & WELS_CPU_MMI) {
+ pExpandPicFunc->pfExpandLumaPicture = ExpandPictureLuma_mmi;
+ pExpandPicFunc->pfExpandChromaPicture[0] = ExpandPictureChromaUnalign_mmi;
+ pExpandPicFunc->pfExpandChromaPicture[1] = ExpandPictureChromaAlign_mmi;
+ }
+#endif//HAVE_MMI
}
--- a/codec/common/src/mc.cpp
+++ b/codec/common/src/mc.cpp
@@ -43,6 +43,7 @@
#include "cpu_core.h"
#include "ls_defines.h"
#include "macros.h"
+#include "asmdefs_mmi.h"
namespace {
@@ -1659,6 +1660,2541 @@
}
#endif
+#if defined(HAVE_MMI)
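+// MMI_LOAD_8P: load 8 (possibly unaligned) bytes at r0 into f0 and widen
+// them to two 4x16-bit halves (low four bytes in f0, high four in f2);
+// f4 must already hold zero.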
+#define MMI_LOAD_8P(f0, f2, f4, r0) \
+ "gsldlc1 "#f0", 0x7("#r0") \n\t" \
+ "gsldrc1 "#f0", 0x0("#r0") \n\t" \
+ "punpckhbh "#f2", "#f0", "#f4" \n\t" \
+ "punpcklbh "#f0", "#f0", "#f4" \n\t"
+
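+// FILTER_HV_W4/W8 apply the H.264 six-tap filter (1, -5, 20, 20, -5, 1) to
+// six rows of 16-bit samples and store 4 (resp. 8) clipped bytes at r0:
+//   out = clip255(((p0+p5) - 5*(p1+p4) + 20*(p2+p3) + 16) >> 5)
+// The 20*c - 5*b term is built with shifts only: t = 4*c - b; acc += t;
+// acc += 4*t. f8 is spilled to r2 around the body because it doubles as the
+// rounding-constant and shift-count register.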
+#define FILTER_HV_W4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
+ f20, f22, f24, f26, f28, f30, r0, r1, r2) \
+ "paddh "#f0", "#f0", "#f20" \n\t" \
+ "paddh "#f2", "#f2", "#f22" \n\t" \
+ "mov.d "#f28", "#f8" \n\t" \
+ "mov.d "#f30", "#f10" \n\t" \
+ "mov.d "#f24", "#f4" \n\t" \
+ "mov.d "#f26", "#f6" \n\t" \
+ "dmfc1 "#r2", "#f8" \n\t" \
+ "dli "#r1", 0x0010001000100010 \n\t" \
+ "dmtc1 "#r1", "#f8" \n\t" \
+ "paddh "#f0", "#f0", "#f8" \n\t" \
+ "paddh "#f2", "#f2", "#f8" \n\t" \
+ "paddh "#f28", "#f28", "#f12" \n\t" \
+ "paddh "#f30", "#f30", "#f14" \n\t" \
+ "paddh "#f24", "#f24", "#f16" \n\t" \
+ "paddh "#f26", "#f26", "#f18" \n\t" \
+ "dli "#r1", 0x2 \n\t" \
+ "dmtc1 "#r1", "#f8" \n\t" \
+ "psllh "#f28", "#f28", "#f8" \n\t" \
+ "psllh "#f30", "#f30", "#f8" \n\t" \
+ "psubh "#f28", "#f28", "#f24" \n\t" \
+ "psubh "#f30", "#f30", "#f26" \n\t" \
+ "paddh "#f0", "#f0", "#f28" \n\t" \
+ "paddh "#f2", "#f2", "#f30" \n\t" \
+ "psllh "#f28", "#f28", "#f8" \n\t" \
+ "psllh "#f30", "#f30", "#f8" \n\t" \
+ "paddh "#f0", "#f0", "#f28" \n\t" \
+ "paddh "#f2", "#f2", "#f30" \n\t" \
+ "dli "#r1", 0x5 \n\t" \
+ "dmtc1 "#r1", "#f8" \n\t" \
+ "psrah "#f0", "#f0", "#f8" \n\t" \
+ "psrah "#f2", "#f2", "#f8" \n\t" \
+ "xor "#f28", "#f28", "#f28" \n\t" \
+ "packushb "#f0", "#f0", "#f2" \n\t" \
+ "gsswlc1 "#f0", 0x3("#r0") \n\t" \
+ "gsswrc1 "#f0", 0x0("#r0") \n\t" \
+ "dmtc1 "#r2", "#f8" \n\t"
+
+#define FILTER_HV_W8(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
+ f20, f22, f24, f26, f28, f30, r0, r1, r2) \
+ "paddh "#f0", "#f0", "#f20" \n\t" \
+ "paddh "#f2", "#f2", "#f22" \n\t" \
+ "mov.d "#f28", "#f8" \n\t" \
+ "mov.d "#f30", "#f10" \n\t" \
+ "mov.d "#f24", "#f4" \n\t" \
+ "mov.d "#f26", "#f6" \n\t" \
+ "dmfc1 "#r2", "#f8" \n\t" \
+ "dli "#r1", 0x0010001000100010 \n\t" \
+ "dmtc1 "#r1", "#f8" \n\t" \
+ "paddh "#f0", "#f0", "#f8" \n\t" \
+ "paddh "#f2", "#f2", "#f8" \n\t" \
+ "paddh "#f28", "#f28", "#f12" \n\t" \
+ "paddh "#f30", "#f30", "#f14" \n\t" \
+ "paddh "#f24", "#f24", "#f16" \n\t" \
+ "paddh "#f26", "#f26", "#f18" \n\t" \
+ "dli "#r1", 0x2 \n\t" \
+ "dmtc1 "#r1", "#f8" \n\t" \
+ "psllh "#f28", "#f28", "#f8" \n\t" \
+ "psllh "#f30", "#f30", "#f8" \n\t" \
+ "psubh "#f28", "#f28", "#f24" \n\t" \
+ "psubh "#f30", "#f30", "#f26" \n\t" \
+ "paddh "#f0", "#f0", "#f28" \n\t" \
+ "paddh "#f2", "#f2", "#f30" \n\t" \
+ "psllh "#f28", "#f28", "#f8" \n\t" \
+ "psllh "#f30", "#f30", "#f8" \n\t" \
+ "paddh "#f0", "#f0", "#f28" \n\t" \
+ "paddh "#f2", "#f2", "#f30" \n\t" \
+ "dli "#r1", 0x5 \n\t" \
+ "dmtc1 "#r1", "#f8" \n\t" \
+ "psrah "#f0", "#f0", "#f8" \n\t" \
+ "psrah "#f2", "#f2", "#f8" \n\t" \
+ "xor "#f28", "#f28", "#f28" \n\t" \
+ "packushb "#f0", "#f0", "#f2" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ "dmtc1 "#r2", "#f8" \n\t"
+
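+// FILTER_VER_ALIGN/UNALIGN form the second (vertical) pass of the (2, 2)
+// position, combining six rows of raw 16-bit horizontal taps as
+//   out = clip255((a - 5*b + 20*c + 512) >> 10), a = t0+t5, b = t1+t4, c = t2+t3,
+// factored as c + ((((a-b) >> 2) + (c-b)) >> 2), then + 32 and >> 6, to stay
+// within 16-bit lanes (the arithmetic shifts make the rounding approximate).
+// ALIGN stores 8 bytes with gssdxc1 to the aligned address r0+r1; UNALIGN
+// stores with the gssdlc1/gssdrc1 pair.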
+#define FILTER_VER_ALIGN(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
+ f20, f22, f24, f26, f28, f30, r0, r1, r2, r3, r4) \
+ "paddh "#f0", "#f0", "#f20" \n\t" \
+ "paddh "#f2", "#f2", "#f22" \n\t" \
+ "mov.d "#f24", "#f4" \n\t" \
+ "mov.d "#f26", "#f6" \n\t" \
+ "mov.d "#f28", "#f8" \n\t" \
+ "mov.d "#f30", "#f10" \n\t" \
+ "dli "#r2", 0x2 \n\t" \
+ "paddh "#f24", "#f24", "#f16" \n\t" \
+ "paddh "#f26", "#f26", "#f18" \n\t" \
+ "dmfc1 "#r3", "#f8" \n\t" \
+ "paddh "#f28", "#f28", "#f12" \n\t" \
+ "paddh "#f30", "#f30", "#f14" \n\t" \
+ "dmtc1 "#r2", "#f8" \n\t" \
+ "psubh "#f0", "#f0", "#f24" \n\t" \
+ "psubh "#f2", "#f2", "#f26" \n\t" \
+ "psrah "#f0", "#f0", "#f8" \n\t" \
+ "psrah "#f2", "#f2", "#f8" \n\t" \
+ "paddh "#f0", "#f0", "#f28" \n\t" \
+ "paddh "#f2", "#f2", "#f30" \n\t" \
+ "psubh "#f0", "#f0", "#f24" \n\t" \
+ "psubh "#f2", "#f2", "#f26" \n\t" \
+ "psrah "#f0", "#f0", "#f8" \n\t" \
+ "psrah "#f2", "#f2", "#f8" \n\t" \
+ "dmtc1 "#r4", "#f8" \n\t" \
+ "paddh "#f28", "#f28", "#f0" \n\t" \
+ "paddh "#f30", "#f30", "#f2" \n\t" \
+ "dli "#r2", 0x6 \n\t" \
+ "paddh "#f28", "#f28", "#f8" \n\t" \
+ "paddh "#f30", "#f30", "#f8" \n\t" \
+ "dmtc1 "#r2", "#f8" \n\t" \
+ "psrah "#f28", "#f28", "#f8" \n\t" \
+ "psrah "#f30", "#f30", "#f8" \n\t" \
+ "packushb "#f28", "#f28", "#f30" \n\t" \
+ "gssdxc1 "#f28", 0x0("#r0", "#r1") \n\t" \
+ "dmtc1 "#r3", "#f8" \n\t"
+
+#define FILTER_VER_UNALIGN(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \
+ f20, f22, f24, f26, f28, f30, r0, r1, r2, r3) \
+ "paddh "#f0", "#f0", "#f20" \n\t" \
+ "paddh "#f2", "#f2", "#f22" \n\t" \
+ "mov.d "#f24", "#f4" \n\t" \
+ "mov.d "#f26", "#f6" \n\t" \
+ "mov.d "#f28", "#f8" \n\t" \
+ "mov.d "#f30", "#f10" \n\t" \
+ "dli "#r1", 0x2 \n\t" \
+ "paddh "#f24", "#f24", "#f16" \n\t" \
+ "paddh "#f26", "#f26", "#f18" \n\t" \
+ "dmfc1 "#r2", "#f8" \n\t" \
+ "paddh "#f28", "#f28", "#f12" \n\t" \
+ "paddh "#f30", "#f30", "#f14" \n\t" \
+ "dmtc1 "#r1", "#f8" \n\t" \
+ "psubh "#f0", "#f0", "#f24" \n\t" \
+ "psubh "#f2", "#f2", "#f26" \n\t" \
+ "psrah "#f0", "#f0", "#f8" \n\t" \
+ "psrah "#f2", "#f2", "#f8" \n\t" \
+ "paddh "#f0", "#f0", "#f28" \n\t" \
+ "paddh "#f2", "#f2", "#f30" \n\t" \
+ "psubh "#f0", "#f0", "#f24" \n\t" \
+ "psubh "#f2", "#f2", "#f26" \n\t" \
+ "psrah "#f0", "#f0", "#f8" \n\t" \
+ "psrah "#f2", "#f2", "#f8" \n\t" \
+ "dmtc1 "#r3", "#f8" \n\t" \
+ "paddh "#f28", "#f28", "#f0" \n\t" \
+ "paddh "#f30", "#f30", "#f2" \n\t" \
+ "dli "#r1", 0x6 \n\t" \
+ "paddh "#f28", "#f28", "#f8" \n\t" \
+ "paddh "#f30", "#f30", "#f8" \n\t" \
+ "dmtc1 "#r1", "#f8" \n\t" \
+ "psrah "#f28", "#f28", "#f8" \n\t" \
+ "psrah "#f30", "#f30", "#f8" \n\t" \
+ "packushb "#f28", "#f28", "#f30" \n\t" \
+ "gssdlc1 "#f28", 0x7("#r0") \n\t" \
+ "gssdrc1 "#f28", 0x0("#r0") \n\t" \
+ "dmtc1 "#r2", "#f8" \n\t"
+
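+// For reference, a scalar sketch of one output pixel of the horizontal
+// half-pel pass (illustration only, not part of this patch; assumes
+// WELS_CLIP1 from macros.h):
+//   static inline uint8_t HorFilterOnePixel (const uint8_t* p) {
+//     int32_t v = (p[0] + p[5]) - 5 * (p[1] + p[4]) + 20 * (p[2] + p[3]);
+//     return (uint8_t) WELS_CLIP1 ((v + 16) >> 5);
+//   }
+// McHorVer20Width5_mmi vectorizes this over rows of five output pixels.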
+void McHorVer20Width5_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
+ int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ PTR_ADDIU "%[pSrc], %[pSrc], -0x2 \n\t"
+ "dli $8, 0x2 \n\t"
+ "dli $10, 0x0010001000100010 \n\t"
+ "dli $11, 0x5 \n\t"
+ "1: \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f4, 0xc(%[pSrc]) \n\t"
+ "gsldlc1 $f8, 0x8(%[pSrc]) \n\t"
+ "gsldlc1 $f12, 0xb(%[pSrc]) \n\t"
+ "gsldlc1 $f16, 0x9(%[pSrc]) \n\t"
+ "gsldlc1 $f20, 0xa(%[pSrc]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f4, 0x5(%[pSrc]) \n\t"
+ "gsldrc1 $f8, 0x1(%[pSrc]) \n\t"
+ "gsldrc1 $f12, 0x4(%[pSrc]) \n\t"
+ "gsldrc1 $f16, 0x2(%[pSrc]) \n\t"
+ "gsldrc1 $f20, 0x3(%[pSrc]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "punpckhbh $f6, $f4, $f28 \n\t"
+ "punpckhbh $f10, $f8, $f28 \n\t"
+ "punpckhbh $f14, $f12, $f28 \n\t"
+ "punpckhbh $f18, $f16, $f28 \n\t"
+ "punpckhbh $f22, $f20, $f28 \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "punpcklbh $f8, $f8, $f28 \n\t"
+ "punpcklbh $f12, $f12, $f28 \n\t"
+ "punpcklbh $f16, $f16, $f28 \n\t"
+ "punpcklbh $f20, $f20, $f28 \n\t"
+
+ "mov.d $f28, $f8 \n\t"
+ "mov.d $f30, $f10 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+ "mov.d $f24, $f16 \n\t"
+ "mov.d $f26, $f18 \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "dmfc1 $9, $f12 \n\t"
+ "dmtc1 $8, $f12 \n\t"
+ "psllh $f24, $f24, $f12 \n\t"
+ "psllh $f26, $f26, $f12 \n\t"
+ "psubh $f24, $f24, $f28 \n\t"
+ "psubh $f26, $f26, $f30 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+ "paddh $f0, $f0, $f24 \n\t"
+ "paddh $f2, $f2, $f26 \n\t"
+ "psllh $f24, $f24, $f12 \n\t"
+ "psllh $f26, $f26, $f12 \n\t"
+ "paddh $f0, $f0, $f24 \n\t"
+ "paddh $f2, $f2, $f26 \n\t"
+
+ "dmtc1 $10, $f12 \n\t"
+ "paddh $f0, $f0, $f12 \n\t"
+ "paddh $f2, $f2, $f12 \n\t"
+ "dmtc1 $11, $f12 \n\t"
+ "psrah $f0, $f0, $f12 \n\t"
+ "psrah $f2, $f2, $f12 \n\t"
+ "packushb $f0, $f0, $f2 \n\t"
+
+ "gsswlc1 $f0, 0x3(%[pDst]) \n\t"
+ "gsswrc1 $f0, 0x0(%[pDst]) \n\t"
+
+ "gsldlc1 $f0, 0xd(%[pSrc]) \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "gsldrc1 $f0, 0x6(%[pSrc]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "dmtc1 $9, $f12 \n\t"
+ "dmtc1 $8, $f24 \n\t"
+
+ "paddh $f16, $f16, $f4 \n\t"
+ "paddh $f18, $f18, $f6 \n\t"
+ "paddh $f20, $f20, $f12 \n\t"
+ "paddh $f22, $f22, $f14 \n\t"
+ "psllh $f20, $f20, $f24 \n\t"
+ "psllh $f22, $f22, $f24 \n\t"
+ "psubh $f20, $f20, $f16 \n\t"
+ "psubh $f22, $f22, $f18 \n\t"
+ "paddh $f8, $f8, $f0 \n\t"
+ "paddh $f10, $f10, $f2 \n\t"
+ "paddh $f8, $f8, $f20 \n\t"
+ "paddh $f10, $f10, $f22 \n\t"
+ "psllh $f20, $f20, $f24 \n\t"
+ "psllh $f22, $f22, $f24 \n\t"
+ "paddh $f8, $f8, $f20 \n\t"
+ "paddh $f10, $f10, $f22 \n\t"
+
+ "dmtc1 $10, $f24 \n\t"
+ "paddh $f8, $f8, $f24 \n\t"
+ "paddh $f10, $f10, $f24 \n\t"
+ "dmtc1 $11, $f24 \n\t"
+ "psrah $f8, $f8, $f24 \n\t"
+ "psrah $f10, $f10, $f24 \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "gsswlc1 $f8, 0x4(%[pDst]) \n\t"
+ "gsswrc1 $f8, 0x1(%[pDst]) \n\t"
+
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "bnez %[iHeight], 1b \n\t"
+ : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
+ [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
+ : [iSrcStride]"r"((int)iSrcStride), [iDstStride]"r"((int)iDstStride)
+ : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
+ "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
+ "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
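+// Same horizontal filter for the wider cases: the branch on $9 (== 9) sends
+// iWidth == 9 rows through label 1 and iWidth == 17 rows through label 2,
+// which repeats the pattern once more for the upper half of each row.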
+void McHorVer20Width9Or17_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
+ int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ PTR_ADDIU "%[pSrc], %[pSrc], -0x2 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "dli $8, 0x2 \n\t"
+ "dli $9, 0x9 \n\t"
+ "dli $10, 0x0010001000100010 \n\t"
+ "dli $11, 0x5 \n\t"
+ "bne %[iWidth], $9, 2f \n\t"
+ "1: \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f4, 0xc(%[pSrc]) \n\t"
+ "gsldlc1 $f8, 0x8(%[pSrc]) \n\t"
+ "gsldlc1 $f12, 0xb(%[pSrc]) \n\t"
+ "gsldlc1 $f16, 0x9(%[pSrc]) \n\t"
+ "gsldlc1 $f20, 0xa(%[pSrc]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f4, 0x5(%[pSrc]) \n\t"
+ "gsldrc1 $f8, 0x1(%[pSrc]) \n\t"
+ "gsldrc1 $f12, 0x4(%[pSrc]) \n\t"
+ "gsldrc1 $f16, 0x2(%[pSrc]) \n\t"
+ "gsldrc1 $f20, 0x3(%[pSrc]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "punpckhbh $f6, $f4, $f28 \n\t"
+ "punpckhbh $f10, $f8, $f28 \n\t"
+ "punpckhbh $f14, $f12, $f28 \n\t"
+ "punpckhbh $f18, $f16, $f28 \n\t"
+ "punpckhbh $f22, $f20, $f28 \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "punpcklbh $f8, $f8, $f28 \n\t"
+ "punpcklbh $f12, $f12, $f28 \n\t"
+ "punpcklbh $f16, $f16, $f28 \n\t"
+ "punpcklbh $f20, $f20, $f28 \n\t"
+
+ "mov.d $f28, $f8 \n\t"
+ "mov.d $f30, $f10 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+ "mov.d $f24, $f16 \n\t"
+ "mov.d $f26, $f18 \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "dmfc1 $9, $f12 \n\t"
+ "dmtc1 $8, $f12 \n\t"
+ "psllh $f24, $f24, $f12 \n\t"
+ "psllh $f26, $f26, $f12 \n\t"
+ "psubh $f24, $f24, $f28 \n\t"
+ "psubh $f26, $f26, $f30 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+ "paddh $f0, $f0, $f24 \n\t"
+ "paddh $f2, $f2, $f26 \n\t"
+ "psllh $f24, $f24, $f12 \n\t"
+ "psllh $f26, $f26, $f12 \n\t"
+ "paddh $f0, $f0, $f24 \n\t"
+ "paddh $f2, $f2, $f26 \n\t"
+
+ "dmtc1 $10, $f12 \n\t"
+ "paddh $f0, $f0, $f12 \n\t"
+ "paddh $f2, $f2, $f12 \n\t"
+ "dmtc1 $11, $f12 \n\t"
+ "psrah $f0, $f0, $f12 \n\t"
+ "psrah $f2, $f2, $f12 \n\t"
+ "packushb $f0, $f0, $f2 \n\t"
+
+ "gsswlc1 $f0, 0x3(%[pDst]) \n\t"
+ "gsswrc1 $f0, 0x0(%[pDst]) \n\t"
+
+ "gsldlc1 $f0, 0xd(%[pSrc]) \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "gsldrc1 $f0, 0x6(%[pSrc]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "dmtc1 $9, $f12 \n\t"
+ "dmtc1 $8, $f24 \n\t"
+
+ "paddh $f16, $f16, $f4 \n\t"
+ "paddh $f18, $f18, $f6 \n\t"
+ "paddh $f20, $f20, $f12 \n\t"
+ "paddh $f22, $f22, $f14 \n\t"
+ "psllh $f20, $f20, $f24 \n\t"
+ "psllh $f22, $f22, $f24 \n\t"
+ "psubh $f20, $f20, $f16 \n\t"
+ "psubh $f22, $f22, $f18 \n\t"
+ "paddh $f8, $f8, $f0 \n\t"
+ "paddh $f10, $f10, $f2 \n\t"
+ "paddh $f8, $f8, $f20 \n\t"
+ "paddh $f10, $f10, $f22 \n\t"
+ "psllh $f20, $f20, $f24 \n\t"
+ "psllh $f22, $f22, $f24 \n\t"
+ "paddh $f8, $f8, $f20 \n\t"
+ "paddh $f10, $f10, $f22 \n\t"
+
+ "dmtc1 $10, $f24 \n\t"
+ "paddh $f8, $f8, $f24 \n\t"
+ "paddh $f10, $f10, $f24 \n\t"
+ "dmtc1 $11, $f24 \n\t"
+ "psrah $f8, $f8, $f24 \n\t"
+ "psrah $f10, $f10, $f24 \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "gssdlc1 $f8, 0x8(%[pDst]) \n\t"
+ "gssdrc1 $f8, 0x1(%[pDst]) \n\t"
+
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "bnez %[iHeight], 1b \n\t"
+ "j 3f \n\t"
+
+ "2: \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f4, 0xc(%[pSrc]) \n\t"
+ "gsldlc1 $f8, 0x8(%[pSrc]) \n\t"
+ "gsldlc1 $f12, 0xb(%[pSrc]) \n\t"
+ "gsldlc1 $f16, 0x9(%[pSrc]) \n\t"
+ "gsldlc1 $f20, 0xa(%[pSrc]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f4, 0x5(%[pSrc]) \n\t"
+ "gsldrc1 $f8, 0x1(%[pSrc]) \n\t"
+ "gsldrc1 $f12, 0x4(%[pSrc]) \n\t"
+ "gsldrc1 $f16, 0x2(%[pSrc]) \n\t"
+ "gsldrc1 $f20, 0x3(%[pSrc]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "punpckhbh $f6, $f4, $f28 \n\t"
+ "punpckhbh $f10, $f8, $f28 \n\t"
+ "punpckhbh $f14, $f12, $f28 \n\t"
+ "punpckhbh $f18, $f16, $f28 \n\t"
+ "punpckhbh $f22, $f20, $f28 \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "punpcklbh $f8, $f8, $f28 \n\t"
+ "punpcklbh $f12, $f12, $f28 \n\t"
+ "punpcklbh $f16, $f16, $f28 \n\t"
+ "punpcklbh $f20, $f20, $f28 \n\t"
+
+ "dmtc1 $8, $f30 \n\t"
+ "paddh $f8, $f8, $f12 \n\t"
+ "paddh $f10, $f10, $f14 \n\t"
+ "paddh $f16, $f16, $f20 \n\t"
+ "paddh $f18, $f18, $f22 \n\t"
+ "psllh $f16, $f16, $f30 \n\t"
+ "psllh $f18, $f18, $f30 \n\t"
+ "psubh $f16, $f16, $f8 \n\t"
+ "psubh $f18, $f18, $f10 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+ "paddh $f0, $f0, $f16 \n\t"
+ "paddh $f2, $f2, $f18 \n\t"
+ "psllh $f16, $f16, $f30 \n\t"
+ "psllh $f18, $f18, $f30 \n\t"
+ "paddh $f0, $f0, $f16 \n\t"
+ "paddh $f2, $f2, $f18 \n\t"
+
+ "dmtc1 $10, $f30 \n\t"
+ "paddh $f0, $f0, $f30 \n\t"
+ "paddh $f2, $f2, $f30 \n\t"
+ "dmtc1 $11, $f30 \n\t"
+ "psrah $f0, $f0, $f30 \n\t"
+ "psrah $f2, $f2, $f30 \n\t"
+ "packushb $f0, $f0, $f2 \n\t"
+ "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
+ "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
+
+ "gsldlc1 $f0, 15(%[pSrc]) \n\t"
+ "gsldlc1 $f4, 0x14(%[pSrc]) \n\t"
+ "gsldlc1 $f8, 0x10(%[pSrc]) \n\t"
+ "gsldlc1 $f12, 0x13(%[pSrc]) \n\t"
+ "gsldlc1 $f16, 0x11(%[pSrc]) \n\t"
+ "gsldlc1 $f20, 0x12(%[pSrc]) \n\t"
+ "gsldrc1 $f0, 8(%[pSrc]) \n\t"
+ "gsldrc1 $f4, 0xd(%[pSrc]) \n\t"
+ "gsldrc1 $f8, 0x9(%[pSrc]) \n\t"
+ "gsldrc1 $f12, 0xc(%[pSrc]) \n\t"
+ "gsldrc1 $f16, 0xa(%[pSrc]) \n\t"
+ "gsldrc1 $f20, 0xb(%[pSrc]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "punpckhbh $f6, $f4, $f28 \n\t"
+ "punpckhbh $f10, $f8, $f28 \n\t"
+ "punpckhbh $f14, $f12, $f28 \n\t"
+ "punpckhbh $f18, $f16, $f28 \n\t"
+ "punpckhbh $f22, $f20, $f28 \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "punpcklbh $f8, $f8, $f28 \n\t"
+ "punpcklbh $f12, $f12, $f28 \n\t"
+ "punpcklbh $f16, $f16, $f28 \n\t"
+ "punpcklbh $f20, $f20, $f28 \n\t"
+
+ "mov.d $f28, $f8 \n\t"
+ "mov.d $f30, $f10 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+ "mov.d $f24, $f16 \n\t"
+ "mov.d $f26, $f18 \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "dmfc1 $9, $f12 \n\t"
+ "dmtc1 $8, $f12 \n\t"
+ "psllh $f24, $f24, $f12 \n\t"
+ "psllh $f26, $f26, $f12 \n\t"
+ "psubh $f24, $f24, $f28 \n\t"
+ "psubh $f26, $f26, $f30 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+ "paddh $f0, $f0, $f24 \n\t"
+ "paddh $f2, $f2, $f26 \n\t"
+ "psllh $f24, $f24, $f12 \n\t"
+ "psllh $f26, $f26, $f12 \n\t"
+ "paddh $f0, $f0, $f24 \n\t"
+ "paddh $f2, $f2, $f26 \n\t"
+
+ "dmtc1 $10, $f30 \n\t"
+ "paddh $f0, $f0, $f30 \n\t"
+ "paddh $f2, $f2, $f30 \n\t"
+ "dmtc1 $11, $f30 \n\t"
+ "psrah $f0, $f0, $f30 \n\t"
+ "psrah $f2, $f2, $f30 \n\t"
+ "packushb $f0, $f0, $f2 \n\t"
+ "gsswlc1 $f0, 0xb(%[pDst]) \n\t"
+ "gsswrc1 $f0, 0x8(%[pDst]) \n\t"
+
+ "dmtc1 $9, $f12 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "dli $9, 0x20 \n\t"
+ "gsldlc1 $f0, 0x15(%[pSrc]) \n\t"
+ "dmtc1 $9, $f30 \n\t"
+ "gsldrc1 $f0, 0xE(%[pSrc]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "dmtc1 $8, $f24 \n\t"
+
+ "paddh $f16, $f16, $f4 \n\t"
+ "paddh $f18, $f18, $f6 \n\t"
+ "paddh $f20, $f20, $f12 \n\t"
+ "paddh $f22, $f22, $f14 \n\t"
+ "psllh $f20, $f20, $f24 \n\t"
+ "psllh $f22, $f22, $f24 \n\t"
+ "psubh $f20, $f20, $f16 \n\t"
+ "psubh $f22, $f22, $f18 \n\t"
+ "paddh $f8, $f8, $f0 \n\t"
+ "paddh $f10, $f10, $f2 \n\t"
+ "paddh $f8, $f8, $f20 \n\t"
+ "paddh $f10, $f10, $f22 \n\t"
+ "psllh $f20, $f20, $f24 \n\t"
+ "psllh $f22, $f22, $f24 \n\t"
+ "paddh $f8, $f8, $f20 \n\t"
+ "paddh $f10, $f10, $f22 \n\t"
+
+ "dmtc1 $10, $f24 \n\t"
+ "paddh $f8, $f8, $f24 \n\t"
+ "paddh $f10, $f10, $f24 \n\t"
+ "dmtc1 $11, $f24 \n\t"
+ "psrah $f8, $f8, $f24 \n\t"
+ "psrah $f10, $f10, $f24 \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "gssdlc1 $f8, 0x10(%[pDst]) \n\t"
+ "gssdrc1 $f8, 0x9(%[pDst]) \n\t"
+
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "bnez %[iHeight], 2b \n\t"
+ "3: \n\t"
+ : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
+ [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
+ : [iSrcStride]"r"((int)iSrcStride), [iDstStride]"r"((int)iDstStride)
+ : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
+ "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
+ "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+// horizontal filter to gain the half-pel sample, i.e. the (2, 0) location in
+// quarter-pel units; iWidth is the block width plus one (5, 9 or 17), the
+// extra column being needed by the quarter-pel positions
+static inline void McHorVer20Width5Or9Or17_mmi(const uint8_t* pSrc, int32_t iSrcStride,
+ uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) {
+ if (iWidth == 17 || iWidth == 9)
+ McHorVer20Width9Or17_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+ else //if (iWidth == 5)
+ McHorVer20Width5_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+}
+
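+// Vertical half-pel filter over 4-pixel-wide stripes ($12/$13/$14 save the
+// origin so label 3 can restart at the next stripe, iWidth >> 2 in total);
+// the inner loop at label 2 is unrolled eight rows deep, rotating the six
+// source rows through the FPU registers instead of reloading them.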
+void McHorVer02Height5_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
+ int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "move $12, %[pSrc] \n\t"
+ "move $13, %[pDst] \n\t"
+ "move $14, %[iHeight] \n\t"
+
+ "dsrl %[iWidth], %[iWidth], 0x2 \n\t"
+ PTR_ADDU "$10, %[iSrcStride], %[iSrcStride] \n\t"
+ PTR_SUBU "%[pSrc], %[pSrc], $10 \n\t"
+
+ "1: \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ MMI_LOAD_8P($f0, $f2, $f28, %[pSrc])
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f4, $f6, $f28, $8)
+
+ PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
+ MMI_LOAD_8P($f8, $f10, $f28, %[pSrc])
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f12, $f14, $f28, $8)
+ PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
+ MMI_LOAD_8P($f16, $f18, $f28, %[pSrc])
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f20, $f22, $f28, $8)
+ FILTER_HV_W4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
+ $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
+ MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
+ "mov.d $f0, $f4 \n\t"
+ "mov.d $f2, $f6 \n\t"
+ "mov.d $f4, $f8 \n\t"
+ "mov.d $f6, $f10 \n\t"
+ "mov.d $f8, $f12 \n\t"
+ "mov.d $f10, $f14 \n\t"
+ "mov.d $f12, $f16 \n\t"
+ "mov.d $f14, $f18 \n\t"
+ "mov.d $f16, $f20 \n\t"
+ "mov.d $f18, $f22 \n\t"
+ "mov.d $f20, $f24 \n\t"
+ "mov.d $f22, $f26 \n\t"
+
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+
+ "2: \n\t"
+ FILTER_HV_W4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
+ $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 3f \n\t"
+
+ PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
+ MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ FILTER_HV_W4($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
+ $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 3f \n\t"
+
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f28, $f30, $f0, $8)
+ FILTER_HV_W4($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
+ $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 3f \n\t"
+
+ PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
+ MMI_LOAD_8P($f0, $f2, $f4, %[pSrc])
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ FILTER_HV_W4($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
+ $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 3f \n\t"
+
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f4, $f6, $f8, $8)
+ FILTER_HV_W4($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6,
+ $f8, $f10, $f12, $f14, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 3f \n\t"
+
+ PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
+ MMI_LOAD_8P($f8, $f10, $f12, %[pSrc])
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ FILTER_HV_W4($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10,
+ $f12, $f14, $f16, $f18, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 3f \n\t"
+
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f12, $f14, $f16, $8)
+ FILTER_HV_W4($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14,
+ $f16, $f18, $f20, $f22, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 3f \n\t"
+
+ PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
+ MMI_LOAD_8P($f16, $f18, $f20, %[pSrc])
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ FILTER_HV_W4($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18,
+ $f20, $f22, $f24, $f26, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 3f \n\t"
+
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f20, $f22, $f24, $8)
+ "j 2b \n\t"
+
+ "3: \n\t"
+ PTR_ADDIU "%[iWidth], %[iWidth], -0x1 \n\t"
+ "beqz %[iWidth], 4f \n\t"
+ "move %[pSrc], $12 \n\t"
+ "move %[pDst], $13 \n\t"
+ "move %[iHeight], $14 \n\t"
+ PTR_SUBU "%[pSrc], %[pSrc], $10 \n\t"
+ PTR_ADDIU "%[pSrc], %[pSrc], 0x4 \n\t"
+ PTR_ADDIU "%[pDst], %[pDst], 0x4 \n\t"
+ "j 1b \n\t"
+ "4: \n\t"
+ : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
+ [iWidth]"+&r"(iWidth), [iHeight]"+&r"(iHeight)
+ : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
+ : "memory", "$8", "$9", "$10", "$12", "$13", "$14", "$f0", "$f2", "$f4",
+ "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
+ "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
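+// The 8-pixel-wide counterpart (iWidth >> 3 stripes) using FILTER_HV_W8 and
+// its 8-byte stores; the loop structure mirrors McHorVer02Height5_mmi above.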
+void McHorVer02Height9Or17_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
+ int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "move $12, %[pSrc] \n\t"
+ "move $13, %[pDst] \n\t"
+ "move $14, %[iHeight] \n\t"
+
+ "dsrl %[iWidth], %[iWidth], 0x3 \n\t"
+ PTR_ADDU "$10, %[iSrcStride], %[iSrcStride] \n\t"
+ PTR_SUBU "%[pSrc], %[pSrc], $10 \n\t"
+
+ "1: \n\t"
+ "dli $8, 0x20 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+
+ MMI_LOAD_8P($f0, $f2, $f28, %[pSrc])
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f4, $f6, $f28, $8)
+ PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
+ MMI_LOAD_8P($f8, $f10, $f28, %[pSrc])
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f12, $f14, $f28, $8)
+ PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
+ MMI_LOAD_8P($f16, $f18, $f28, %[pSrc])
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f20, $f22, $f28, $8)
+ FILTER_HV_W8($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
+ $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
+ MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
+ "mov.d $f0, $f4 \n\t"
+ "mov.d $f2, $f6 \n\t"
+ "mov.d $f4, $f8 \n\t"
+ "mov.d $f6, $f10 \n\t"
+ "mov.d $f8, $f12 \n\t"
+ "mov.d $f10, $f14 \n\t"
+ "mov.d $f12, $f16 \n\t"
+ "mov.d $f14, $f18 \n\t"
+ "mov.d $f16, $f20 \n\t"
+ "mov.d $f18, $f22 \n\t"
+ "mov.d $f20, $f24 \n\t"
+ "mov.d $f22, $f26 \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+
+ "2: \n\t"
+ FILTER_HV_W8($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
+ $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
+ "dmtc1 $9, $f8 \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 3f \n\t"
+
+ PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
+ MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ FILTER_HV_W8($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
+ $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 3f \n\t"
+
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f28, $f30, $f0, $8)
+ FILTER_HV_W8($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
+ $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 3f \n\t"
+
+ PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
+ MMI_LOAD_8P($f0, $f2, $f4, %[pSrc])
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ FILTER_HV_W8($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
+ $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 3f \n\t"
+
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f4, $f6, $f8, $8)
+ FILTER_HV_W8($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4,
+ $f6, $f8, $f10, $f12, $f14, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 3f \n\t"
+
+ PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
+ MMI_LOAD_8P($f8, $f10, $f12, %[pSrc])
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ FILTER_HV_W8($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8,
+ $f10, $f12, $f14, $f16, $f18, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 3f \n\t"
+
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f12, $f14, $f16, $8)
+ FILTER_HV_W8($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12,
+ $f14, $f16, $f18, $f20, $f22, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 3f \n\t"
+
+ PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t"
+ MMI_LOAD_8P($f16, $f18, $f20, %[pSrc])
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ FILTER_HV_W8($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16,
+ $f18, $f20, $f22, $f24, $f26, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 3f \n\t"
+
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f20, $f22, $f24, $8)
+ "j 2b \n\t"
+
+ "3: \n\t"
+ PTR_ADDIU "%[iWidth], %[iWidth], -0x1 \n\t"
+ "beqz %[iWidth], 4f \n\t"
+
+ "move %[pSrc], $12 \n\t"
+ "move %[pDst], $13 \n\t"
+ "move %[iHeight], $14 \n\t"
+ PTR_SUBU "%[pSrc], %[pSrc], $10 \n\t"
+ PTR_ADDIU "%[pSrc], %[pSrc], 0x8 \n\t"
+ PTR_ADDIU "%[pDst], %[pDst], 0x8 \n\t"
+ "j 1b \n\t"
+ "4: \n\t"
+ : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
+ [iWidth]"+&r"(iWidth), [iHeight]"+&r"(iHeight)
+ : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
+ : "memory", "$8", "$9", "$10", "$12", "$13", "$14", "$f0", "$f2", "$f4",
+ "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
+ "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+// vertical filter to gain the half-pel sample, i.e. the (0, 2) location in
+// quarter-pel units; the heights handled are 5, 9 or 17 (block height plus
+// one), so the block width below identifies the case
+static inline void McHorVer02Height5Or9Or17_mmi(const uint8_t* pSrc, int32_t iSrcStride,
+ uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) {
+  if (iWidth == 16 || iWidth == 8)
+    McHorVer02Height9Or17_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+  else //if (iWidth == 4)
+    McHorVer02Height5_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+}
+
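+// First pass of the (2, 2) centre position: the horizontal six-tap filter is
+// run without the +16 rounding and >> 5, and the raw 16-bit sums are stored
+// to the pTap scratch buffer for the vertical pass below.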
+static inline void McHorVer22HorFirst_mmi(const uint8_t *pSrc, int32_t iSrcStride,
+ uint8_t* pTap, int32_t iTapStride,
+ int32_t iWidth, int32_t iHeight) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "dli $8, 0x9 \n\t"
+ PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "bne %[iWidth], $8, 2f \n\t"
+
+ "1: \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "gsldlc1 $f4, 0xc(%[pSrc]) \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "gsldrc1 $f4, 0x5(%[pSrc]) \n\t"
+ "punpckhbh $f6, $f4, $f28 \n\t"
+ "gsldlc1 $f8, 0x8(%[pSrc]) \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "gsldrc1 $f8, 0x1(%[pSrc]) \n\t"
+ "punpckhbh $f10, $f8, $f28 \n\t"
+ "gsldlc1 $f12, 0xb(%[pSrc]) \n\t"
+ "punpcklbh $f8, $f8, $f28 \n\t"
+ "gsldrc1 $f12, 0x4(%[pSrc]) \n\t"
+ "punpckhbh $f14, $f12, $f28 \n\t"
+ "gsldlc1 $f16, 0x9(%[pSrc]) \n\t"
+ "punpcklbh $f12, $f12, $f28 \n\t"
+ "gsldrc1 $f16, 0x2(%[pSrc]) \n\t"
+ "punpckhbh $f18, $f16, $f28 \n\t"
+ "gsldlc1 $f20, 0xa(%[pSrc]) \n\t"
+ "punpcklbh $f16, $f16, $f28 \n\t"
+ "gsldrc1 $f20, 0x3(%[pSrc]) \n\t"
+ "punpckhbh $f22, $f20, $f28 \n\t"
+ "punpcklbh $f20, $f20, $f28 \n\t"
+
+ "mov.d $f28, $f8 \n\t"
+ "mov.d $f30, $f10 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+ "mov.d $f24, $f16 \n\t"
+ "mov.d $f26, $f18 \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "dli $8, 0x2 \n\t"
+ "dmfc1 $9, $f12 \n\t"
+ "dmtc1 $8, $f12 \n\t"
+ "psllh $f24, $f24, $f12 \n\t"
+ "psllh $f26, $f26, $f12 \n\t"
+ "psubh $f24, $f24, $f28 \n\t"
+ "psubh $f26, $f26, $f30 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+ "paddh $f0, $f0, $f24 \n\t"
+ "paddh $f2, $f2, $f26 \n\t"
+ "psllh $f24, $f24, $f12 \n\t"
+ "psllh $f26, $f26, $f12 \n\t"
+ "paddh $f0, $f0, $f24 \n\t"
+ "paddh $f2, $f2, $f26 \n\t"
+ "gsswlc1 $f0, 0x3(%[pTap]) \n\t"
+ "gsswrc1 $f0, 0x0(%[pTap]) \n\t"
+
+ "gsldlc1 $f0, 0xd(%[pSrc]) \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "gsldrc1 $f0, 0x6(%[pSrc]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "dli $8, 0x2 \n\t"
+ "dmtc1 $9, $f12 \n\t"
+ "dmtc1 $8, $f24 \n\t"
+
+ "paddh $f16, $f16, $f4 \n\t"
+ "paddh $f18, $f18, $f6 \n\t"
+ "paddh $f20, $f20, $f12 \n\t"
+ "paddh $f22, $f22, $f14 \n\t"
+ "psllh $f20, $f20, $f24 \n\t"
+ "psllh $f22, $f22, $f24 \n\t"
+ "psubh $f20, $f20, $f16 \n\t"
+ "psubh $f22, $f22, $f18 \n\t"
+ "paddh $f8, $f8, $f0 \n\t"
+ "paddh $f10, $f10, $f2 \n\t"
+ "paddh $f8, $f8, $f20 \n\t"
+ "paddh $f10, $f10, $f22 \n\t"
+ "psllh $f20, $f20, $f24 \n\t"
+ "psllh $f22, $f22, $f24 \n\t"
+ "paddh $f8, $f8, $f20 \n\t"
+ "paddh $f10, $f10, $f22 \n\t"
+ "gssdlc1 $f8, 0x9(%[pTap]) \n\t"
+ "gssdlc1 $f10, 0x11(%[pTap]) \n\t"
+ "gssdrc1 $f8, 0x2(%[pTap]) \n\t"
+ "gssdrc1 $f10, 0xa(%[pTap]) \n\t"
+
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_ADDU "%[pTap], %[pTap], %[iTapStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "bnez %[iHeight], 1b \n\t"
+ "j 3f \n\t"
+
+ "2: \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "gsldlc1 $f4, 0xc(%[pSrc]) \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "gsldrc1 $f4, 0x5(%[pSrc]) \n\t"
+ "punpckhbh $f6, $f4, $f28 \n\t"
+ "gsldlc1 $f8, 0x8(%[pSrc]) \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "gsldrc1 $f8, 0x1(%[pSrc]) \n\t"
+ "punpckhbh $f10, $f8, $f28 \n\t"
+ "gsldlc1 $f12, 0xb(%[pSrc]) \n\t"
+ "punpcklbh $f8, $f8, $f28 \n\t"
+ "gsldrc1 $f12, 0x4(%[pSrc]) \n\t"
+ "punpckhbh $f14, $f12, $f28 \n\t"
+ "gsldlc1 $f16, 0x9(%[pSrc]) \n\t"
+ "punpcklbh $f12, $f12, $f28 \n\t"
+ "gsldrc1 $f16, 0x2(%[pSrc]) \n\t"
+ "punpckhbh $f18, $f16, $f28 \n\t"
+ "gsldlc1 $f20, 0xa(%[pSrc]) \n\t"
+ "punpcklbh $f16, $f16, $f28 \n\t"
+ "gsldrc1 $f20, 0x3(%[pSrc]) \n\t"
+ "punpckhbh $f22, $f20, $f28 \n\t"
+ "dli $8, 0x2 \n\t"
+ "punpcklbh $f20, $f20, $f28 \n\t"
+
+ "dmtc1 $8, $f30 \n\t"
+ "paddh $f8, $f8, $f12 \n\t"
+ "paddh $f10, $f10, $f14 \n\t"
+ "paddh $f16, $f16, $f20 \n\t"
+ "paddh $f18, $f18, $f22 \n\t"
+ "psllh $f16, $f16, $f30 \n\t"
+ "psllh $f18, $f18, $f30 \n\t"
+ "psubh $f16, $f16, $f8 \n\t"
+ "psubh $f18, $f18, $f10 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+ "paddh $f0, $f0, $f16 \n\t"
+ "paddh $f2, $f2, $f18 \n\t"
+ "psllh $f16, $f16, $f30 \n\t"
+ "psllh $f18, $f18, $f30 \n\t"
+ "paddh $f0, $f0, $f16 \n\t"
+ "paddh $f2, $f2, $f18 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pTap]) \n\t"
+
+ "gsldlc1 $f0, 15(%[pSrc]) \n\t"
+ "gsldrc1 $f0, 8(%[pSrc]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "gsldlc1 $f4, 0x14(%[pSrc]) \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "gsldrc1 $f4, 0xd(%[pSrc]) \n\t"
+ "punpckhbh $f6, $f4, $f28 \n\t"
+ "gsldlc1 $f8, 0x10(%[pSrc]) \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "gsldrc1 $f8, 0x9(%[pSrc]) \n\t"
+ "punpckhbh $f10, $f8, $f28 \n\t"
+ "gsldlc1 $f12, 0x13(%[pSrc]) \n\t"
+ "punpcklbh $f8, $f8, $f28 \n\t"
+ "gsldrc1 $f12, 0xc(%[pSrc]) \n\t"
+ "punpckhbh $f14, $f12, $f28 \n\t"
+ "gsldlc1 $f16, 0x11(%[pSrc]) \n\t"
+ "punpcklbh $f12, $f12, $f28 \n\t"
+ "gsldrc1 $f16, 0xa(%[pSrc]) \n\t"
+ "punpckhbh $f18, $f16, $f28 \n\t"
+ "gsldlc1 $f20, 0x12(%[pSrc]) \n\t"
+ "punpcklbh $f16, $f16, $f28 \n\t"
+ "gsldrc1 $f20, 0xb(%[pSrc]) \n\t"
+ "punpckhbh $f22, $f20, $f28 \n\t"
+ "punpcklbh $f20, $f20, $f28 \n\t"
+
+ "mov.d $f28, $f8 \n\t"
+ "mov.d $f30, $f10 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+ "mov.d $f24, $f16 \n\t"
+ "mov.d $f26, $f18 \n\t"
+ "dli $8, 0x2 \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "dmfc1 $9, $f12 \n\t"
+ "dmtc1 $8, $f12 \n\t"
+ "psllh $f24, $f24, $f12 \n\t"
+ "psllh $f26, $f26, $f12 \n\t"
+ "psubh $f24, $f24, $f28 \n\t"
+ "psubh $f26, $f26, $f30 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+ "paddh $f0, $f0, $f24 \n\t"
+ "paddh $f2, $f2, $f26 \n\t"
+ "psllh $f24, $f24, $f12 \n\t"
+ "psllh $f26, $f26, $f12 \n\t"
+ "paddh $f0, $f0, $f24 \n\t"
+ "paddh $f2, $f2, $f26 \n\t"
+ "gsswlc1 $f0, 0x13(%[pTap]) \n\t"
+ "gsswrc1 $f0, 0x10(%[pTap]) \n\t"
+
+ "gsldlc1 $f0, 0x15(%[pSrc]) \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "gsldrc1 $f0, 0xE(%[pSrc]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "dli $8, 0x2 \n\t"
+ "dmtc1 $9, $f12 \n\t"
+ "dmtc1 $8, $f24 \n\t"
+
+ "paddh $f16, $f16, $f4 \n\t"
+ "paddh $f18, $f18, $f6 \n\t"
+ "paddh $f20, $f20, $f12 \n\t"
+ "paddh $f22, $f22, $f14 \n\t"
+ "psllh $f20, $f20, $f24 \n\t"
+ "psllh $f22, $f22, $f24 \n\t"
+ "psubh $f20, $f20, $f16 \n\t"
+ "psubh $f22, $f22, $f18 \n\t"
+ "paddh $f8, $f8, $f0 \n\t"
+ "paddh $f10, $f10, $f2 \n\t"
+ "paddh $f8, $f8, $f20 \n\t"
+ "paddh $f10, $f10, $f22 \n\t"
+ "psllh $f20, $f20, $f24 \n\t"
+ "psllh $f22, $f22, $f24 \n\t"
+ "paddh $f8, $f8, $f20 \n\t"
+ "paddh $f10, $f10, $f22 \n\t"
+ "gssdlc1 $f8, 0x19(%[pTap]) \n\t"
+ "gssdlc1 $f10, 0x21(%[pTap]) \n\t"
+ "gssdrc1 $f8, 0x12(%[pTap]) \n\t"
+ "gssdrc1 $f10, 0x1a(%[pTap]) \n\t"
+
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_ADDU "%[pTap], %[pTap], %[iTapStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "bnez %[iHeight], 2b \n\t"
+ "3: \n\t"
+ : [pSrc]"+&r"(pSrc), [pTap]"+&r"(pTap), [iWidth]"+&r"(iWidth),
+ [iHeight]"+&r"(iHeight)
+ : [iSrcStride]"r"(iSrcStride), [iTapStride]"r"(iTapStride)
+ : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
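+//vertical pass of the center (2, 2) filter for 16-byte-aligned tap rows:
+//six rows of 16-bit horizontal sums stay live in $f0-$f22 and are rotated
+//through FILTER_VER_ALIGN (a macro defined earlier in this file); the loop
+//is unrolled eight-deep so each tap row is loaded exactly once per 8-pixel
+//column, with $15 carrying a per-lane constant (0x0020) for the macro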
+static inline void McHorVer22Width8VerLastAlign_mmi(const uint8_t *pTap,
+ int32_t iTapStride, uint8_t *pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "move $10, %[pTap] \n\t"
+ "move $11, %[pDst] \n\t"
+ "move $12, %[iHeight] \n\t"
+ "dsrl %[iWidth], 0x3 \n\t"
+ PTR_ADDU "$13, %[iTapStride], %[iTapStride] \n\t"
+ PTR_ADDU "$14, %[iDstStride], %[iDstStride] \n\t"
+ "dli $15, 0x0020002000200020 \n\t"
+
+ "4: \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pTap]) \n\t"
+ PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
+ "gslqc1 $f6, $f4, 0x0($8) \n\t"
+ PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
+ "gslqc1 $f10, $f8, 0x0(%[pTap]) \n\t"
+ PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
+ "gslqc1 $f14, $f12, 0x0($8) \n\t"
+ PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
+ "gslqc1 $f18, $f16, 0x0(%[pTap]) \n\t"
+ PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
+ "gslqc1 $f22, $f20, 0x0($8) \n\t"
+
+ FILTER_VER_ALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
+ $f22, $f24, $f26, $f28, $f30, %[pDst], $0, $8, $9, $15)
+
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
+ "gslqc1 $f26, $f24, 0x0(%[pTap]) \n\t"
+ "mov.d $f0, $f4 \n\t"
+ "mov.d $f2, $f6 \n\t"
+ "mov.d $f4, $f8 \n\t"
+ "mov.d $f6, $f10 \n\t"
+ "mov.d $f8, $f12 \n\t"
+ "mov.d $f10, $f14 \n\t"
+ "mov.d $f12, $f16 \n\t"
+ "mov.d $f14, $f18 \n\t"
+ "mov.d $f16, $f20 \n\t"
+ "mov.d $f18, $f22 \n\t"
+ "mov.d $f20, $f24 \n\t"
+ "mov.d $f22, $f26 \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_SUBU "%[pTap], %[pTap], %[iTapStride] \n\t"
+
+ "5: \n\t"
+ FILTER_VER_ALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
+ $f22, $f24, $f26, $f28, $f30, %[pDst], $0, $8, $9, $15)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 6f \n\t"
+ PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
+ "gslqc1 $f26, $f24, 0x0(%[pTap]) \n\t"
+
+ FILTER_VER_ALIGN($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
+ $f26, $f28, $f30, $f0, $f2, %[pDst], %[iDstStride], $8, $9, $15)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 6f \n\t"
+ PTR_ADDU "%[pDst], %[pDst], $14 \n\t"
+ PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
+ "gslqc1 $f30, $f28, 0x0($8) \n\t"
+
+ FILTER_VER_ALIGN($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
+ $f30, $f0, $f2, $f4, $f6, %[pDst], $0, $8, $9, $15)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 6f \n\t"
+ PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pTap]) \n\t"
+
+ FILTER_VER_ALIGN($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
+ $f2, $f4, $f6, $f8, $f10, %[pDst], %[iDstStride], $8, $9, $15)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 6f \n\t"
+ PTR_ADDU "%[pDst], %[pDst], $14 \n\t"
+ PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
+ "gslqc1 $f6, $f4, 0x0($8) \n\t"
+
+ FILTER_VER_ALIGN($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4,
+ $f6, $f8, $f10, $f12, $f14, %[pDst], $0, $8, $9, $15)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 6f \n\t"
+ PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
+ "gslqc1 $f10, $f8, 0x0(%[pTap]) \n\t"
+
+ FILTER_VER_ALIGN($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8,
+ $f10, $f12, $f14, $f16, $f18, %[pDst], %[iDstStride], $8, $9, $15)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 6f \n\t"
+ PTR_ADDU "%[pDst], %[pDst], $14 \n\t"
+ PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
+ "gslqc1 $f14, $f12, 0x0($8) \n\t"
+
+ FILTER_VER_ALIGN($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12,
+ $f14, $f16, $f18, $f20, $f22, %[pDst], $0, $8, $9, $15)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 6f \n\t"
+ PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
+ "gslqc1 $f18, $f16, 0x0(%[pTap]) \n\t"
+
+ FILTER_VER_ALIGN($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16,
+ $f18, $f20, $f22, $f24, $f26, %[pDst], %[iDstStride], $8, $9, $15)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 6f \n\t"
+ PTR_ADDU "%[pDst], %[pDst], $14 \n\t"
+ PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
+ "gslqc1 $f22, $f20, 0x0($8) \n\t"
+ "j 5b \n\t"
+
+ "6: \n\t"
+ PTR_ADDIU "%[iWidth], %[iWidth], -0x1 \n\t"
+ "beqz %[iWidth], 7f \n\t"
+ "move %[pTap], $10 \n\t"
+ "move %[pDst], $11 \n\t"
+ "move %[iHeight], $12 \n\t"
+ PTR_ADDIU "%[pTap], %[pTap], 0x10 \n\t"
+ PTR_ADDIU "%[pDst], %[pDst], 0x8 \n\t"
+ "j 4b \n\t"
+ "7: \n\t"
+ : [pTap]"+&r"((unsigned char *)pTap), [pDst]"+&r"((unsigned char *)pDst),
+ [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
+ : [iTapStride]"r"((int)iTapStride), [iDstStride]"r"((int)iDstStride)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0",
+ "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18",
+ "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
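+//same vertical pass for tap columns that are not 16-byte aligned: each
+//128-bit row is fetched as two gsldlc1/gsldrc1 unaligned doubleword pairs
+//instead of one gslqc1, and FILTER_VER_UNALIGN replaces FILTER_VER_ALIGN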
+static inline void McHorVer22Width8VerLastUnAlign_mmi(const uint8_t *pTap,
+ int32_t iTapStride, uint8_t *pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "move $10, %[pTap] \n\t"
+ "move $11, %[pDst] \n\t"
+ "move $12, %[iHeight] \n\t"
+ "dsrl %[iWidth], 0x3 \n\t"
+ PTR_ADDU "$13, %[iTapStride], %[iTapStride] \n\t"
+ "dli $14, 0x0020002000200020 \n\t"
+
+ "4: \n\t"
+ PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
+ "gsldlc1 $f0, 0x7(%[pTap]) \n\t"
+ "gsldlc1 $f2, 0xF(%[pTap]) \n\t"
+ "gsldlc1 $f4, 0x7($8) \n\t"
+ "gsldlc1 $f6, 0xF($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pTap]) \n\t"
+ "gsldrc1 $f2, 0x8(%[pTap]) \n\t"
+ "gsldrc1 $f4, 0x0($8) \n\t"
+ "gsldrc1 $f6, 0x8($8) \n\t"
+ PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
+ PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
+ "gsldlc1 $f8, 0x7(%[pTap]) \n\t"
+ "gsldlc1 $f10, 0xF(%[pTap]) \n\t"
+ "gsldlc1 $f12, 0x7($8) \n\t"
+ "gsldlc1 $f14, 0xF($8) \n\t"
+ "gsldrc1 $f8, 0x0(%[pTap]) \n\t"
+ "gsldrc1 $f10, 0x8(%[pTap]) \n\t"
+ "gsldrc1 $f12, 0x0($8) \n\t"
+ "gsldrc1 $f14, 0x8($8) \n\t"
+ PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
+ PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
+ "gsldlc1 $f16, 0x7(%[pTap]) \n\t"
+ "gsldlc1 $f18, 0xF(%[pTap]) \n\t"
+ "gsldlc1 $f20, 0x7($8) \n\t"
+ "gsldlc1 $f22, 0xF($8) \n\t"
+ "gsldrc1 $f16, 0x0(%[pTap]) \n\t"
+ "gsldrc1 $f18, 0x8(%[pTap]) \n\t"
+ "gsldrc1 $f20, 0x0($8) \n\t"
+ "gsldrc1 $f22, 0x8($8) \n\t"
+
+ FILTER_VER_UNALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18,
+ $f20, $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9, $14)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
+ "gsldlc1 $f24, 0x7(%[pTap]) \n\t"
+ "gsldlc1 $f26, 0xF(%[pTap]) \n\t"
+ "gsldrc1 $f24, 0x0(%[pTap]) \n\t"
+ "gsldrc1 $f26, 0x8(%[pTap]) \n\t"
+ "mov.d $f0, $f4 \n\t"
+ "mov.d $f2, $f6 \n\t"
+ "mov.d $f4, $f8 \n\t"
+ "mov.d $f6, $f10 \n\t"
+ "mov.d $f8, $f12 \n\t"
+ "mov.d $f10, $f14 \n\t"
+ "mov.d $f12, $f16 \n\t"
+ "mov.d $f14, $f18 \n\t"
+ "mov.d $f16, $f20 \n\t"
+ "mov.d $f18, $f22 \n\t"
+ "mov.d $f20, $f24 \n\t"
+ "mov.d $f22, $f26 \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_SUBU "%[pTap], %[pTap], %[iTapStride] \n\t"
+
+ "5: \n\t"
+ FILTER_VER_UNALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18,
+ $f20, $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9, $14)
+
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 6f \n\t"
+ PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
+ "gsldlc1 $f24, 0x7(%[pTap]) \n\t"
+ "gsldlc1 $f26, 0xF(%[pTap]) \n\t"
+ "gsldrc1 $f24, 0x0(%[pTap]) \n\t"
+ "gsldrc1 $f26, 0x8(%[pTap]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+
+ FILTER_VER_UNALIGN($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22,
+ $f24, $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9, $14)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 6f \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
+ "gsldlc1 $f28, 0x7($8) \n\t"
+ "gsldlc1 $f30, 0xF($8) \n\t"
+ "gsldrc1 $f28, 0x0($8) \n\t"
+ "gsldrc1 $f30, 0x8($8) \n\t"
+
+ FILTER_VER_UNALIGN($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26,
+ $f28, $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9, $14)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 6f \n\t"
+ PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
+ "gsldlc1 $f0, 0x7(%[pTap]) \n\t"
+ "gsldlc1 $f2, 0xF(%[pTap]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pTap]) \n\t"
+ "gsldrc1 $f2, 0x8(%[pTap]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+
+ FILTER_VER_UNALIGN($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
+ $f30, $f0, $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9, $14)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 6f \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
+ "gsldlc1 $f4, 0x7($8) \n\t"
+ "gsldlc1 $f6, 0xF($8) \n\t"
+ "gsldrc1 $f4, 0x0($8) \n\t"
+ "gsldrc1 $f6, 0x8($8) \n\t"
+
+ FILTER_VER_UNALIGN($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2,
+ $f4, $f6, $f8, $f10, $f12, $f14, %[pDst], $8, $9, $14)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 6f \n\t"
+ PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
+ "gsldlc1 $f8, 0x7(%[pTap]) \n\t"
+ "gsldlc1 $f10, 0xF(%[pTap]) \n\t"
+ "gsldrc1 $f8, 0x0(%[pTap]) \n\t"
+ "gsldrc1 $f10, 0x8(%[pTap]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+
+ FILTER_VER_UNALIGN($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6,
+ $f8, $f10, $f12, $f14, $f16, $f18, %[pDst], $8, $9, $14)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 6f \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
+ "gsldlc1 $f12, 0x7($8) \n\t"
+ "gsldlc1 $f14, 0xF($8) \n\t"
+ "gsldrc1 $f12, 0x0($8) \n\t"
+ "gsldrc1 $f14, 0x8($8) \n\t"
+
+ FILTER_VER_UNALIGN($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10,
+ $f12, $f14, $f16, $f18, $f20, $f22, %[pDst], $8, $9, $14)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 6f \n\t"
+ PTR_ADDU "%[pTap], %[pTap], $13 \n\t"
+ "gsldlc1 $f16, 0x7(%[pTap]) \n\t"
+ "gsldlc1 $f18, 0xF(%[pTap]) \n\t"
+ "gsldrc1 $f16, 0x0(%[pTap]) \n\t"
+ "gsldrc1 $f18, 0x8(%[pTap]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+
+ FILTER_VER_UNALIGN($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14,
+ $f16, $f18, $f20, $f22, $f24, $f26, %[pDst], $8, $9, $14)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 6f \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t"
+ "gsldlc1 $f20, 0x7($8) \n\t"
+ "gsldlc1 $f22, 0xF($8) \n\t"
+ "gsldrc1 $f20, 0x0($8) \n\t"
+ "gsldrc1 $f22, 0x8($8) \n\t"
+ "j 5b \n\t"
+
+ "6: \n\t"
+ PTR_ADDIU "%[iWidth], %[iWidth], -0x1 \n\t"
+ "beqz %[iWidth], 7f \n\t"
+ "move %[pTap], $10 \n\t"
+ "move %[pDst], $11 \n\t"
+ "move %[iHeight], $12 \n\t"
+ PTR_ADDIU "%[pTap], %[pTap], 0x10 \n\t"
+ PTR_ADDIU "%[pDst], %[pDst], 0x8 \n\t"
+ "j 4b \n\t"
+
+ "7: \n\t"
+ : [pTap]"+&r"((unsigned char *)pTap), [pDst]"+&r"((unsigned char *)pDst),
+ [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
+ : [iTapStride]"r"((int)iTapStride), [iDstStride]"r"((int)iDstStride)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
+ "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+ "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+//horizontal and vertical filter to gain the half-pel sample at the (2, 2) location in quarter-pel units
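+//for width 9 or 17 the MMI two-pass path runs: a horizontal pass writes
+//16-bit tap sums into pTap (48-byte rows), the aligned vertical pass covers
+//the first iWidth - 1 output columns, and the unaligned pass covers the
+//last 8 (overlapping the aligned output); width 5 falls back to the scalar
+//C loop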
+static inline void McHorVer22Width5Or9Or17Height5Or9Or17_mmi(const uint8_t* pSrc,
+ int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int32_t iWidth, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16)
+
+ if (iWidth == 17 || iWidth == 9) {
+ int32_t tmp1 = 2 * (iWidth - 8);
+ McHorVer22HorFirst_mmi(pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5);
+
+ McHorVer22Width8VerLastAlign_mmi((uint8_t*)pTap, 48, pDst, iDstStride, iWidth - 1, iHeight);
+
+ McHorVer22Width8VerLastUnAlign_mmi((uint8_t*)pTap + tmp1, 48, pDst + iWidth - 8,
+ iDstStride, 8, iHeight);
+ } else {
+ int16_t iTmp[17 + 5];
+ int32_t i, j, k;
+
+ for (i = 0; i < iHeight; i++) {
+ for (j = 0; j < iWidth + 5; j++) {
+ iTmp[j] = FilterInput8bitWithStride_c (pSrc - 2 + j, iSrcStride);
+ }
+ for (k = 0; k < iWidth; k++) {
+ pDst[k] = WelsClip1 ((HorFilterInput16bit_c (&iTmp[k]) + 512) >> 10);
+ }
+ pSrc += iSrcStride;
+ pDst += iDstStride;
+ }
+ }
+}
+
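+//plain block copies, one row per iteration, using the MIPS unaligned
+//load/store pairs (lwl/lwr for 4 bytes, ldl/ldr with sdl/sdr for 8 and 16),
+//so neither pSrc nor pDst needs any particular alignment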
+void McCopyWidthEq4_mmi(const uint8_t *pSrc, int iSrcStride,
+ uint8_t *pDst, int iDstStride, int iHeight) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "1: \n\t"
+ "lwl $8, 0x3(%[pSrc]) \n\t"
+ "lwr $8, 0x0(%[pSrc]) \n\t"
+ "swl $8, 0x3(%[pDst]) \n\t"
+ "swr $8, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -1 \n\t"
+ "bnez %[iHeight], 1b \n\t"
+ : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
+ : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
+ : "memory", "$8"
+ );
+}
+
+void McCopyWidthEq8_mmi(const uint8_t *pSrc, int iSrcStride,
+ uint8_t *pDst, int iDstStride, int iHeight) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "1: \n\t"
+ "ldl $8, 0x7(%[pSrc]) \n\t"
+ "ldr $8, 0x0(%[pSrc]) \n\t"
+ "sdl $8, 0x7(%[pDst]) \n\t"
+ "sdr $8, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -1 \n\t"
+ "bnez %[iHeight], 1b \n\t"
+ : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
+ : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
+ : "memory", "$8"
+ );
+}
+
+void McCopyWidthEq16_mmi(const uint8_t *pSrc, int iSrcStride,
+ uint8_t *pDst, int iDstStride, int iHeight) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "1: \n\t"
+ "ldl $8, 0x7(%[pSrc]) \n\t"
+ "ldl $9, 0xF(%[pSrc]) \n\t"
+ "ldr $8, 0x0(%[pSrc]) \n\t"
+ "ldr $9, 0x8(%[pSrc]) \n\t"
+ "sdl $8, 0x7(%[pDst]) \n\t"
+ "sdl $9, 0xF(%[pDst]) \n\t"
+ "sdr $8, 0x0(%[pDst]) \n\t"
+ "sdr $9, 0x8(%[pDst]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -1 \n\t"
+ "bnez %[iHeight], 1b \n\t"
+ : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
+ : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
+ : "memory", "$8", "$9"
+ );
+}
+
+static inline void McCopy_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+ int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+ if (iWidth == 16)
+ McCopyWidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ else if (iWidth == 8)
+ McCopyWidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ else if (iWidth == 4)
+ McCopyWidthEq4_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ else
+ McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+
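+//chroma MC with eighth-pel bilinear weights: pABCD holds the four taps
+//A, B, C, D and, for each output x, the SIMD below computes the scalar
+//equivalent of
+//  pDst[x] = (A * pSrc[x] + B * pSrc[x + 1] +
+//             C * pSrc[x + iSrcStride] + D * pSrc[x + iSrcStride + 1] +
+//             32) >> 6;
+//once the weights are expanded, pABCD is reused as the next-row pointer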
+void McChromaWidthEq4_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
+ int32_t iDstStride, const uint8_t *pABCD, int32_t iHeight) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "gsldlc1 $f6, 0x7(%[pABCD]) \n\t"
+ "gsldrc1 $f6, 0x0(%[pABCD]) \n\t"
+ "xor $f14, $f14, $f14 \n\t"
+ "punpcklbh $f6, $f6, $f6 \n\t"
+ "mov.d $f8, $f6 \n\t"
+ "punpcklhw $f6, $f6, $f6 \n\t"
+ "punpckhhw $f8, $f8, $f8 \n\t"
+ "mov.d $f10, $f6 \n\t"
+ "punpcklbh $f6, $f6, $f14 \n\t"
+ "punpckhbh $f10, $f10, $f14 \n\t"
+
+ "mov.d $f12, $f8 \n\t"
+ "punpcklbh $f8, $f8, $f14 \n\t"
+ "punpckhbh $f12, $f12, $f14 \n\t"
+ PTR_ADDU "%[pABCD], %[pSrc], %[iSrcStride] \n\t"
+ "dli $8, 0x6 \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f2, 0x8(%[pSrc]) \n\t"
+ "dmtc1 $8, $f16 \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f2, 0x1(%[pSrc]) \n\t"
+ "dli $8, 0x0020002000200020 \n\t"
+ "punpcklbh $f0, $f0, $f14 \n\t"
+ "punpcklbh $f2, $f2, $f14 \n\t"
+
+ "dmtc1 $8, $f18 \n\t"
+ "1: \n\t"
+ "pmullh $f0, $f0, $f6 \n\t"
+ "pmullh $f2, $f2, $f10 \n\t"
+ "paddh $f0, $f0, $f2 \n\t"
+
+ "gsldlc1 $f2, 0x7(%[pABCD]) \n\t"
+ "gsldrc1 $f2, 0x0(%[pABCD]) \n\t"
+ "punpcklbh $f2, $f2, $f14 \n\t"
+ "mov.d $f4, $f2 \n\t"
+ "pmullh $f2, $f2, $f8 \n\t"
+ "paddh $f0, $f0, $f2 \n\t"
+ "gsldlc1 $f2, 0x8(%[pABCD]) \n\t"
+ "gsldrc1 $f2, 0x1(%[pABCD]) \n\t"
+ "punpcklbh $f2, $f2, $f14 \n\t"
+ "mov.d $f14, $f2 \n\t"
+ "pmullh $f2, $f2, $f12 \n\t"
+ "paddh $f0, $f0, $f2 \n\t"
+ "mov.d $f2, $f14 \n\t"
+ "paddh $f0, $f0, $f18 \n\t"
+ "psrlh $f0, $f0, $f16 \n\t"
+ "xor $f14, $f14, $f14 \n\t"
+ "packushb $f0, $f0, $f14 \n\t"
+ "gsswlc1 $f0, 0x3(%[pDst]) \n\t"
+ "gsswrc1 $f0, 0x0(%[pDst]) \n\t"
+ "mov.d $f0, $f4 \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "%[pABCD], %[pABCD], %[iSrcStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -1 \n\t"
+ "bnez %[iHeight], 1b \n\t"
+ : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst),
+ [pABCD]"+&r"((unsigned char *)pABCD), [iHeight]"+&r"((int)iHeight)
+ : [iSrcStride]"r"((int)iSrcStride), [iDstStride]"r"((int)iDstStride)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18"
+ );
+}
+
+void McChromaWidthEq8_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst,
+ int32_t iDstStride, const uint8_t *pABCD, int32_t iHeight) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "gsldlc1 $f12, 0x7(%[pABCD]) \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "gsldrc1 $f12, 0x0(%[pABCD]) \n\t"
+ "punpcklbh $f12, $f12, $f12 \n\t"
+ "punpckhhw $f14, $f12, $f12 \n\t"
+ "punpcklhw $f12, $f12, $f12 \n\t"
+
+ "mov.d $f16, $f14 \n\t"
+ "punpckhwd $f14, $f12, $f12 \n\t"
+ "punpcklwd $f12, $f12, $f12 \n\t"
+ "punpckhwd $f18, $f16, $f16 \n\t"
+ "punpcklwd $f16, $f16, $f16 \n\t"
+ "mov.d $f20, $f14 \n\t"
+ "mov.d $f24, $f18 \n\t"
+
+ "punpckhbh $f14, $f12, $f28 \n\t"
+ "punpcklbh $f12, $f12, $f28 \n\t"
+ "punpckhbh $f22, $f20, $f28 \n\t"
+ "punpcklbh $f20, $f20, $f28 \n\t"
+ "punpckhbh $f18, $f16, $f28 \n\t"
+ "punpcklbh $f16, $f16, $f28 \n\t"
+ "punpckhbh $f26, $f24, $f28 \n\t"
+ "punpcklbh $f24, $f24, $f28 \n\t"
+
+ PTR_ADDU "%[pABCD], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f4, 0x8(%[pSrc]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f4, 0x1(%[pSrc]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "punpckhbh $f6, $f4, $f28 \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "1: \n\t"
+ "dli $8, 0x20 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+
+ "pmullh $f0, $f0, $f12 \n\t"
+ "pmullh $f2, $f2, $f14 \n\t"
+ "pmullh $f4, $f4, $f20 \n\t"
+ "pmullh $f6, $f6, $f22 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+
+ "gsldlc1 $f4, 0x7(%[pABCD]) \n\t"
+ "gsldrc1 $f4, 0x0(%[pABCD]) \n\t"
+ "punpckhbh $f6, $f4, $f28 \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "mov.d $f8, $f4 \n\t"
+ "mov.d $f10, $f6 \n\t"
+ "pmullh $f4, $f4, $f16 \n\t"
+ "pmullh $f6, $f6, $f18 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+
+ "gsldlc1 $f4, 0x8(%[pABCD]) \n\t"
+ "gsldrc1 $f4, 0x1(%[pABCD]) \n\t"
+ "punpckhbh $f6, $f4, $f28 \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "mov.d $f28, $f4 \n\t"
+ "mov.d $f30, $f6 \n\t"
+ "pmullh $f4, $f4, $f24 \n\t"
+ "pmullh $f6, $f6, $f26 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+ "mov.d $f4, $f28 \n\t"
+ "mov.d $f6, $f30 \n\t"
+
+ "dli $8, 0x0020002000200020 \n\t"
+ "dmfc1 $9, $f20 \n\t"
+ "dmtc1 $8, $f20 \n\t"
+ "dli $8, 0x6 \n\t"
+ "paddh $f0, $f0, $f20 \n\t"
+ "paddh $f2, $f2, $f20 \n\t"
+ "dmtc1 $8, $f20 \n\t"
+ "psrlh $f0, $f0, $f20 \n\t"
+ "psrlh $f2, $f2, $f20 \n\t"
+
+ "xor $f28, $f28, $f28 \n\t"
+ "packushb $f0, $f0, $f2 \n\t"
+ "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
+ "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
+
+ "mov.d $f0, $f8 \n\t"
+ "mov.d $f2, $f10 \n\t"
+ "dmtc1 $9, $f20 \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "%[pABCD], %[pABCD], %[iSrcStride] \n\t"
+
+ PTR_ADDIU "%[iHeight], %[iHeight], -1 \n\t"
+ "bnez %[iHeight], 1b \n\t"
+ : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [pABCD]"+&r"(pABCD),
+ [iHeight]"+&r"(iHeight)
+ : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
+ : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
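+//chroma dispatcher: when both eighth-pel MV fractions are zero this is a
+//plain copy; widths 4 and 8 index the MMI kernels above via iWidth >> 3,
+//and width 2 falls back to the scalar C path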
+void McChroma_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+ int32_t iDstStride, int16_t iMvX, int16_t iMvY,
+ int32_t iWidth, int32_t iHeight) {
+ static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
+ McChromaWidthEq4_mmi,
+ McChromaWidthEq8_mmi
+ };
+ const int32_t kiD8x = iMvX & 0x07;
+ const int32_t kiD8y = iMvY & 0x07;
+ if (kiD8x == 0 && kiD8y == 0) {
+ McCopy_mmi (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+ return;
+ }
+ if (iWidth != 2) {
+ kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride,
+ g_kuiABCD[kiD8y][kiD8x], iHeight);
+ } else
+ McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY,
+ iWidth, iHeight);
+}
+
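+//horizontal half-pel (2, 0) kernels: the standard H.264 six-tap filter
+//(1, -5, 20, 20, -5, 1); for each output x the SIMD below computes the
+//scalar equivalent of
+//  int32_t v = pSrc[x - 2] + pSrc[x + 3] - 5 * (pSrc[x - 1] + pSrc[x + 2]) +
+//              20 * (pSrc[x] + pSrc[x + 1]);
+//  pDst[x] = WelsClip1 ((v + 16) >> 5);
+//the -5/+20 taps come from two shift-by-2 passes instead of multiplies,
+//and packushb provides the final unsigned saturation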
+void McHorVer20WidthEq8_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
+ int iDstStride, int iHeight) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ PTR_ADDIU "%[pSrc], %[pSrc], -0x2 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "dli $8, 0x0010001000100010 \n\t"
+ "dmtc1 $8, $f24 \n\t"
+ "dli $8, 0x2 \n\t"
+ "dmtc1 $8, $f26 \n\t"
+ "dli $8, 0x5 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ "1: \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f4, 0xc(%[pSrc]) \n\t"
+ "gsldlc1 $f8, 0x8(%[pSrc]) \n\t"
+ "gsldlc1 $f12, 0xb(%[pSrc]) \n\t"
+ "gsldlc1 $f16, 0x9(%[pSrc]) \n\t"
+ "gsldlc1 $f20, 0xa(%[pSrc]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f4, 0x5(%[pSrc]) \n\t"
+ "gsldrc1 $f8, 0x1(%[pSrc]) \n\t"
+ "gsldrc1 $f12, 0x4(%[pSrc]) \n\t"
+ "gsldrc1 $f16, 0x2(%[pSrc]) \n\t"
+ "gsldrc1 $f20, 0x3(%[pSrc]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "punpckhbh $f6, $f4, $f28 \n\t"
+ "punpckhbh $f10, $f8, $f28 \n\t"
+ "punpckhbh $f14, $f12, $f28 \n\t"
+ "punpckhbh $f18, $f16, $f28 \n\t"
+ "punpckhbh $f22, $f20, $f28 \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "punpcklbh $f8, $f8, $f28 \n\t"
+ "punpcklbh $f12, $f12, $f28 \n\t"
+ "punpcklbh $f16, $f16, $f28 \n\t"
+ "punpcklbh $f20, $f20, $f28 \n\t"
+ "paddh $f8, $f8, $f12 \n\t"
+ "paddh $f10, $f10, $f14 \n\t"
+ "paddh $f16, $f16, $f20 \n\t"
+ "paddh $f18, $f18, $f22 \n\t"
+ "psllh $f16, $f16, $f26 \n\t"
+ "psllh $f18, $f18, $f26 \n\t"
+ "psubh $f16, $f16, $f8 \n\t"
+ "psubh $f18, $f18, $f10 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+ "paddh $f0, $f0, $f16 \n\t"
+ "paddh $f2, $f2, $f18 \n\t"
+ "psllh $f16, $f16, $f26 \n\t"
+ "psllh $f18, $f18, $f26 \n\t"
+ "paddh $f0, $f0, $f16 \n\t"
+ "paddh $f2, $f2, $f18 \n\t"
+ "paddh $f0, $f0, $f24 \n\t"
+ "paddh $f2, $f2, $f24 \n\t"
+ "psrah $f0, $f0, $f30 \n\t"
+ "psrah $f2, $f2, $f30 \n\t"
+ "packushb $f0, $f0, $f2 \n\t"
+ "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
+ "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "bnez %[iHeight], 1b \n\t"
+ : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
+ : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+void McHorVer20WidthEq16_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
+ int iDstStride, int iHeight) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ PTR_ADDIU "%[pSrc], %[pSrc], -0x2 \n\t"
+ "dli $8, 0x0010001000100010 \n\t"
+ "dmtc1 $8, $f24 \n\t"
+ "dli $8, 0x2 \n\t"
+ "dmtc1 $8, $f26 \n\t"
+ "dli $8, 0x5 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ "1: \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f4, 0xc(%[pSrc]) \n\t"
+ "gsldlc1 $f8, 0x8(%[pSrc]) \n\t"
+ "gsldlc1 $f12, 0xb(%[pSrc]) \n\t"
+ "gsldlc1 $f16, 0x9(%[pSrc]) \n\t"
+ "gsldlc1 $f20, 0xa(%[pSrc]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f4, 0x5(%[pSrc]) \n\t"
+ "gsldrc1 $f8, 0x1(%[pSrc]) \n\t"
+ "gsldrc1 $f12, 0x4(%[pSrc]) \n\t"
+ "gsldrc1 $f16, 0x2(%[pSrc]) \n\t"
+ "gsldrc1 $f20, 0x3(%[pSrc]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "punpckhbh $f6, $f4, $f28 \n\t"
+ "punpckhbh $f10, $f8, $f28 \n\t"
+ "punpckhbh $f14, $f12, $f28 \n\t"
+ "punpckhbh $f18, $f16, $f28 \n\t"
+ "punpckhbh $f22, $f20, $f28 \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "punpcklbh $f8, $f8, $f28 \n\t"
+ "punpcklbh $f12, $f12, $f28 \n\t"
+ "punpcklbh $f16, $f16, $f28 \n\t"
+ "punpcklbh $f20, $f20, $f28 \n\t"
+ "paddh $f8, $f8, $f12 \n\t"
+ "paddh $f10, $f10, $f14 \n\t"
+ "paddh $f16, $f16, $f20 \n\t"
+ "paddh $f18, $f18, $f22 \n\t"
+ "psllh $f16, $f16, $f26 \n\t"
+ "psllh $f18, $f18, $f26 \n\t"
+ "psubh $f16, $f16, $f8 \n\t"
+ "psubh $f18, $f18, $f10 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+ "paddh $f0, $f0, $f16 \n\t"
+ "paddh $f2, $f2, $f18 \n\t"
+ "psllh $f16, $f16, $f26 \n\t"
+ "psllh $f18, $f18, $f26 \n\t"
+ "paddh $f0, $f0, $f16 \n\t"
+ "paddh $f2, $f2, $f18 \n\t"
+ "paddh $f0, $f0, $f24 \n\t"
+ "paddh $f2, $f2, $f24 \n\t"
+ "psrah $f0, $f0, $f30 \n\t"
+ "psrah $f2, $f2, $f30 \n\t"
+ "packushb $f0, $f0, $f2 \n\t"
+ "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
+ "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
+ "gsldlc1 $f0, 0xF(%[pSrc]) \n\t"
+ "gsldlc1 $f4, 0x14(%[pSrc]) \n\t"
+ "gsldlc1 $f8, 0x10(%[pSrc]) \n\t"
+ "gsldlc1 $f12, 0x13(%[pSrc]) \n\t"
+ "gsldlc1 $f16, 0x11(%[pSrc]) \n\t"
+ "gsldlc1 $f20, 0x12(%[pSrc]) \n\t"
+ "gsldrc1 $f0, 0x8(%[pSrc]) \n\t"
+ "gsldrc1 $f4, 0xd(%[pSrc]) \n\t"
+ "gsldrc1 $f8, 0x9(%[pSrc]) \n\t"
+ "gsldrc1 $f12, 0xc(%[pSrc]) \n\t"
+ "gsldrc1 $f16, 0xa(%[pSrc]) \n\t"
+ "gsldrc1 $f20, 0xb(%[pSrc]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "punpckhbh $f6, $f4, $f28 \n\t"
+ "punpckhbh $f10, $f8, $f28 \n\t"
+ "punpckhbh $f14, $f12, $f28 \n\t"
+ "punpckhbh $f18, $f16, $f28 \n\t"
+ "punpckhbh $f22, $f20, $f28 \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "punpcklbh $f8, $f8, $f28 \n\t"
+ "punpcklbh $f12, $f12, $f28 \n\t"
+ "punpcklbh $f16, $f16, $f28 \n\t"
+ "punpcklbh $f20, $f20, $f28 \n\t"
+ "paddh $f8, $f8, $f12 \n\t"
+ "paddh $f10, $f10, $f14 \n\t"
+ "paddh $f16, $f16, $f20 \n\t"
+ "paddh $f18, $f18, $f22 \n\t"
+ "psllh $f16, $f16, $f26 \n\t"
+ "psllh $f18, $f18, $f26 \n\t"
+ "psubh $f16, $f16, $f8 \n\t"
+ "psubh $f18, $f18, $f10 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+ "paddh $f0, $f0, $f16 \n\t"
+ "paddh $f2, $f2, $f18 \n\t"
+ "psllh $f16, $f16, $f26 \n\t"
+ "psllh $f18, $f18, $f26 \n\t"
+ "paddh $f0, $f0, $f16 \n\t"
+ "paddh $f2, $f2, $f18 \n\t"
+ "paddh $f0, $f0, $f24 \n\t"
+ "paddh $f2, $f2, $f24 \n\t"
+ "psrah $f0, $f0, $f30 \n\t"
+ "psrah $f2, $f2, $f30 \n\t"
+ "packushb $f0, $f0, $f2 \n\t"
+ "gssdlc1 $f0, 0xF(%[pDst]) \n\t"
+ "gssdrc1 $f0, 0x8(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "bnez %[iHeight], 1b \n\t"
+ : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
+ : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+void McHorVer20WidthEq4_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
+ int iDstStride, int iHeight) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "1: \n\t"
+ PTR_ADDIU "%[pSrc], %[pSrc], -0x2 \n\t"
+ "xor $f14, $f14, $f14 \n\t"
+ "dli $8, 0x0010001000100010 \n\t"
+ "dmtc1 $8, $f12 \n\t"
+ "1: \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f2, 0xc(%[pSrc]) \n\t"
+ "gsldlc1 $f4, 0x8(%[pSrc]) \n\t"
+ "gsldlc1 $f6, 0xb(%[pSrc]) \n\t"
+ "gsldlc1 $f8, 0x9(%[pSrc]) \n\t"
+ "gsldlc1 $f10, 0xa(%[pSrc]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f2, 0x5(%[pSrc]) \n\t"
+ "gsldrc1 $f4, 0x1(%[pSrc]) \n\t"
+ "gsldrc1 $f6, 0x4(%[pSrc]) \n\t"
+ "gsldrc1 $f8, 0x2(%[pSrc]) \n\t"
+ "gsldrc1 $f10, 0x3(%[pSrc]) \n\t"
+ "dli $8, 0x2 \n\t"
+ "punpcklbh $f0, $f0, $f14 \n\t"
+ "punpcklbh $f2, $f2, $f14 \n\t"
+ "punpcklbh $f4, $f4, $f14 \n\t"
+ "punpcklbh $f6, $f6, $f14 \n\t"
+ "punpcklbh $f8, $f8, $f14 \n\t"
+ "punpcklbh $f10, $f10, $f14 \n\t"
+ "dmtc1 $8, $f16 \n\t"
+ "paddh $f4, $f4, $f6 \n\t"
+ "paddh $f8, $f8, $f10 \n\t"
+ "psllh $f8, $f8, $f16 \n\t"
+ "psubh $f8, $f8, $f4 \n\t"
+ "paddh $f0, $f0, $f2 \n\t"
+ "paddh $f0, $f0, $f8 \n\t"
+ "dli $8, 0x5 \n\t"
+ "psllh $f8, $f8, $f16 \n\t"
+ "paddh $f0, $f0, $f8 \n\t"
+ "paddh $f0, $f0, $f12 \n\t"
+ "dmtc1 $8, $f16 \n\t"
+ "psrah $f0, $f0, $f16 \n\t"
+ "packushb $f0, $f0, $f14 \n\t"
+ "gsswlc1 $f0, 0x3(%[pDst]) \n\t"
+ "gsswrc1 $f0, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "bnez %[iHeight], 1b \n\t"
+ : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
+ : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16"
+ );
+}
+
+static inline void McHorVer20_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+ int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+ if (iWidth == 16)
+ McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ else if (iWidth == 8)
+ McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ else
+ McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+}
+
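+//vertical half-pel (0, 2) kernel: six source rows are kept widened to
+//halfwords in $f0-$f22 (MMI_LOAD_8P loads 8 pixels and unpacks them) and
+//FILTER_HV_W8, a macro defined earlier in this file, presumably applies the
+//same six-tap filter down the columns; the loop is unrolled eight-deep with
+//register rotation so each row is loaded only once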
+void McHorVer02WidthEq8_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst,
+ int iDstStride, int iHeight) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ MMI_LOAD_8P($f0, $f2, $f28, %[pSrc])
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f4, $f6, $f28, $8)
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f8, $f10, $f28, %[pSrc])
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f12, $f14, $f28, $8)
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f16, $f18, $f28, %[pSrc])
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f20, $f22, $f28, $8)
+
+ "1: \n\t"
+ FILTER_HV_W8($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20,
+ $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 2f \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f24, $f26, $f28, %[pSrc])
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ FILTER_HV_W8($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24,
+ $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 2f \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f28, $f30, $f0, $8)
+ FILTER_HV_W8($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28,
+ $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 2f \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f0, $f2, $f4, %[pSrc])
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ FILTER_HV_W8($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0,
+ $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 2f \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f4, $f6, $f8, $8)
+ FILTER_HV_W8($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4,
+ $f6, $f8, $f10, $f12, $f14, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 2f \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f8, $f10, $f12, %[pSrc])
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ FILTER_HV_W8($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8,
+ $f10, $f12, $f14, $f16, $f18, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 2f \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f12, $f14, $f16, $8)
+ FILTER_HV_W8($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12,
+ $f14, $f16, $f18, $f20, $f22, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 2f \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f16, $f18, $f20, %[pSrc])
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ FILTER_HV_W8($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16,
+ $f18, $f20, $f22, $f24, $f26, %[pDst], $8, $9)
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "beqz %[iHeight], 2f \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t"
+ MMI_LOAD_8P($f20, $f22, $f24, $8)
+ "j 1b \n\t"
+ "2: \n\t"
+ : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
+ : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
+ : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+static inline void McHorVer02WidthEq16_mmi(const uint8_t* pSrc, int32_t iSrcStride,
+ uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+ McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ McHorVer02WidthEq8_mmi (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
+}
+
+static inline void McHorVer02_mmi(const uint8_t* pSrc, int32_t iSrcStride,
+ uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+ int32_t iHeight) {
+ if (iWidth == 16)
+ McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ else if (iWidth == 8)
+ McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ else
+ McHorVer02_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
+}
+
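+//first (horizontal) pass of the center (2, 2) filter: the same tap math as
+//the (2, 0) kernel but with no rounding, shift or clamp; the raw 16-bit
+//sums are stored to the intermediate buffer for the vertical pass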
+void McHorVer22Width8HorFirst_mmi(const uint8_t *pSrc, int16_t iSrcStride,
+ uint8_t *pDst, int32_t iDstStride, int32_t iHeight) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "dli $8, 0x2 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ "1: \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f4, 0xc(%[pSrc]) \n\t"
+ "gsldlc1 $f8, 0x8(%[pSrc]) \n\t"
+ "gsldlc1 $f12, 0xb(%[pSrc]) \n\t"
+ "gsldlc1 $f16, 0x9(%[pSrc]) \n\t"
+ "gsldlc1 $f20, 0xa(%[pSrc]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f4, 0x5(%[pSrc]) \n\t"
+ "gsldrc1 $f8, 0x1(%[pSrc]) \n\t"
+ "gsldrc1 $f12, 0x4(%[pSrc]) \n\t"
+ "gsldrc1 $f16, 0x2(%[pSrc]) \n\t"
+ "gsldrc1 $f20, 0x3(%[pSrc]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "punpckhbh $f6, $f4, $f28 \n\t"
+ "punpckhbh $f10, $f8, $f28 \n\t"
+ "punpckhbh $f14, $f12, $f28 \n\t"
+ "punpckhbh $f18, $f16, $f28 \n\t"
+ "punpckhbh $f22, $f20, $f28 \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "punpcklbh $f8, $f8, $f28 \n\t"
+ "punpcklbh $f12, $f12, $f28 \n\t"
+ "punpcklbh $f16, $f16, $f28 \n\t"
+ "punpcklbh $f20, $f20, $f28 \n\t"
+ "paddh $f8, $f8, $f12 \n\t"
+ "paddh $f10, $f10, $f14 \n\t"
+ "paddh $f16, $f16, $f20 \n\t"
+ "paddh $f18, $f18, $f22 \n\t"
+ "psllh $f16, $f16, $f30 \n\t"
+ "psllh $f18, $f18, $f30 \n\t"
+ "psubh $f16, $f16, $f8 \n\t"
+ "psubh $f18, $f18, $f10 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+ "paddh $f0, $f0, $f16 \n\t"
+ "paddh $f2, $f2, $f18 \n\t"
+ "psllh $f16, $f16, $f30 \n\t"
+ "psllh $f18, $f18, $f30 \n\t"
+ "paddh $f0, $f0, $f16 \n\t"
+ "paddh $f2, $f2, $f18 \n\t"
+ "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f2, 0xF(%[pDst]) \n\t"
+ "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f2, 0x8(%[pDst]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ "bnez %[iHeight], 1b \n\t"
+ : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight)
+ : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+static inline void McHorVer22WidthEq8_mmi(const uint8_t* pSrc, int32_t iSrcStride,
+ uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_2D (int16_t, iTap, 21, 8, 16)
+ McHorVer22Width8HorFirst_mmi (pSrc - 2, iSrcStride, (uint8_t*)iTap, 16, iHeight + 5);
+ McHorVer22Width8VerLastAlign_mmi ((uint8_t*)iTap, 16, pDst, iDstStride, 8, iHeight);
+}
+
+static inline void McHorVer22WidthEq16_mmi(const uint8_t* pSrc, int32_t iSrcStride,
+ uint8_t* pDst, int32_t iDstStride, int32_t iHeight) {
+ McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ McHorVer22WidthEq8_mmi (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
+}
+
+static inline void McHorVer22_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+ int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+ if (iWidth == 16)
+ McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ else if (iWidth == 8)
+ McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight);
+ else
+ McHorVer22_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
+}
+
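+//rounding byte averages via pavgb, used below to synthesize the quarter-pel
+//positions; the Eq8 and Eq16 variants process two and four rows per
+//iteration, so iHeight is expected to be a multiple of 2 and 4 respectively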
+void PixelAvgWidthEq4_mmi(uint8_t *pDst, int iDstStride, const uint8_t *pSrcA,
+ int iSrcAStride, const uint8_t *pSrcB, int iSrcBStride, int iHeight ) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "1: \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrcB]) \n\t"
+ "gsldlc1 $f2, 0x7(%[pSrcA]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrcB]) \n\t"
+ "gsldrc1 $f2, 0x0(%[pSrcA]) \n\t"
+ "pavgb $f0, $f0, $f2 \n\t"
+ "gsswlc1 $f0, 0x3(%[pDst]) \n\t"
+ "gsswrc1 $f0, 0x0(%[pDst]) \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ PTR_ADDU "%[pSrcA], %[pSrcA], %[iSrcAStride] \n\t"
+ PTR_ADDU "%[pSrcB], %[pSrcB], %[iSrcBStride] \n\t"
+ "bnez %[iHeight], 1b \n\t"
+ : [pDst]"+&r"((unsigned char *)pDst), [pSrcA]"+&r"((unsigned char *)pSrcA),
+ [pSrcB]"+&r"((unsigned char *)pSrcB), [iHeight]"+&r"((int)iHeight)
+ : [iDstStride]"r"((int)iDstStride), [iSrcAStride]"r"((int)iSrcAStride),
+ [iSrcBStride]"r"((int)iSrcBStride)
+ : "memory", "$8", "$9", "$10", "$f0", "$f2"
+ );
+}
+
+void PixelAvgWidthEq8_mmi(uint8_t *pDst, int iDstStride, const uint8_t *pSrcA,
+ int iSrcAStride, const uint8_t *pSrcB, int iSrcBStride, int iHeight ) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "1: \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrcA]) \n\t"
+ "gsldlc1 $f2, 0x7(%[pSrcB]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrcA]) \n\t"
+ "gsldrc1 $f2, 0x0(%[pSrcB]) \n\t"
+ "pavgb $f0, $f0, $f2 \n\t"
+ PTR_ADDU "$8, %[pSrcA], %[iSrcAStride] \n\t"
+ "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
+ PTR_ADDU "$9, %[pSrcB], %[iSrcBStride] \n\t"
+ "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
+ "gsldlc1 $f0, 0x7($8) \n\t"
+ "gsldlc1 $f2, 0x7($9) \n\t"
+ "gsldrc1 $f0, 0x0($8) \n\t"
+ "gsldrc1 $f2, 0x0($9) \n\t"
+ "pavgb $f0, $f0, $f2 \n\t"
+ PTR_ADDU "$10, %[pDst], %[iDstStride] \n\t"
+ "gssdlc1 $f0, 0x7($10) \n\t"
+ PTR_ADDU "%[pSrcA], $8, %[iSrcAStride] \n\t"
+ "gssdrc1 $f0, 0x0($10) \n\t"
+ PTR_ADDU "%[pSrcB], $9, %[iSrcBStride] \n\t"
+ PTR_ADDU "%[pDst], $10, %[iDstStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x2 \n\t"
+ "bnez %[iHeight], 1b \n\t"
+ : [pDst]"+&r"((unsigned char *)pDst), [pSrcA]"+&r"((unsigned char *)pSrcA),
+ [pSrcB]"+&r"((unsigned char *)pSrcB), [iHeight]"+&r"((int)iHeight)
+ : [iDstStride]"r"((int)iDstStride), [iSrcAStride]"r"((int)iSrcAStride),
+ [iSrcBStride]"r"((int)iSrcBStride)
+ : "memory", "$8", "$9", "$10", "$f0", "$f2"
+ );
+}
+
+void PixelAvgWidthEq16_mmi(uint8_t *pDst, int iDstStride, const uint8_t *pSrcA,
+ int iSrcAStride, const uint8_t *pSrcB, int iSrcBStride, int iHeight ) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "1: \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrcA]) \n\t"
+ "gsldlc1 $f2, 0xF(%[pSrcA]) \n\t"
+ "gsldlc1 $f4, 0x7(%[pSrcB]) \n\t"
+ "gsldlc1 $f6, 0xF(%[pSrcB]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrcA]) \n\t"
+ "gsldrc1 $f2, 0x8(%[pSrcA]) \n\t"
+ "gsldrc1 $f4, 0x0(%[pSrcB]) \n\t"
+ "gsldrc1 $f6, 0x8(%[pSrcB]) \n\t"
+ "pavgb $f0, $f0, $f4 \n\t"
+ "pavgb $f2, $f2, $f6 \n\t"
+ PTR_ADDU "$8, %[pSrcA], %[iSrcAStride] \n\t"
+ "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f2, 0xF(%[pDst]) \n\t"
+ "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f2, 0x8(%[pDst]) \n\t"
+ PTR_ADDU "$9, %[pSrcB], %[iSrcBStride] \n\t"
+ "gsldlc1 $f0, 0x7($8) \n\t"
+ "gsldlc1 $f2, 0xF($8) \n\t"
+ "gsldrc1 $f0, 0x0($8) \n\t"
+ "gsldrc1 $f2, 0x8($8) \n\t"
+ PTR_ADDU "$10, %[pDst], %[iDstStride] \n\t"
+ "gsldlc1 $f4, 0x7($9) \n\t"
+ "gsldlc1 $f6, 0xF($9) \n\t"
+ "gsldrc1 $f4, 0x0($9) \n\t"
+ "gsldrc1 $f6, 0x8($9) \n\t"
+ "pavgb $f0, $f0, $f4 \n\t"
+ "pavgb $f2, $f2, $f6 \n\t"
+ "gssdlc1 $f0, 0x7($10) \n\t"
+ "gssdlc1 $f2, 0xF($10) \n\t"
+ "gssdrc1 $f0, 0x0($10) \n\t"
+ "gssdrc1 $f2, 0x8($10) \n\t"
+
+ PTR_ADDU "%[pSrcA], $8, %[iSrcAStride] \n\t"
+ PTR_ADDU "%[pSrcB], $9, %[iSrcBStride] \n\t"
+ PTR_ADDU "%[pDst], $10, %[iDstStride] \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrcA]) \n\t"
+ "gsldlc1 $f2, 0xF(%[pSrcA]) \n\t"
+ "gsldlc1 $f4, 0x7(%[pSrcB]) \n\t"
+ "gsldlc1 $f6, 0xF(%[pSrcB]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrcA]) \n\t"
+ "gsldrc1 $f2, 0x8(%[pSrcA]) \n\t"
+ "gsldrc1 $f4, 0x0(%[pSrcB]) \n\t"
+ "gsldrc1 $f6, 0x8(%[pSrcB]) \n\t"
+ "pavgb $f0, $f0, $f4 \n\t"
+ "pavgb $f2, $f2, $f6 \n\t"
+ PTR_ADDU "$8, %[pSrcA], %[iSrcAStride] \n\t"
+ PTR_ADDU "$9, %[pSrcB], %[iSrcBStride] \n\t"
+ "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f2, 0xF(%[pDst]) \n\t"
+ "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f2, 0x8(%[pDst]) \n\t"
+ "gsldlc1 $f0, 0x7($8) \n\t"
+ "gsldlc1 $f2, 0xF($8) \n\t"
+ "gsldlc1 $f4, 0x7($9) \n\t"
+ "gsldlc1 $f6, 0xF($9) \n\t"
+ "gsldrc1 $f0, 0x0($8) \n\t"
+ "gsldrc1 $f2, 0x8($8) \n\t"
+ "gsldrc1 $f4, 0x0($9) \n\t"
+ "gsldrc1 $f6, 0x8($9) \n\t"
+ PTR_ADDU "$10, %[pDst], %[iDstStride] \n\t"
+ "pavgb $f0, $f0, $f4 \n\t"
+ "pavgb $f2, $f2, $f6 \n\t"
+ "gssdlc1 $f0, 0x7($10) \n\t"
+ "gssdlc1 $f2, 0xF($10) \n\t"
+ "gssdrc1 $f0, 0x0($10) \n\t"
+ "gssdrc1 $f2, 0x8($10) \n\t"
+ PTR_ADDU "%[pSrcA], $8, %[iSrcAStride] \n\t"
+ PTR_ADDU "%[pSrcB], $9, %[iSrcBStride] \n\t"
+ PTR_ADDU "%[pDst], $10, %[iDstStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x4 \n\t"
+ "bnez %[iHeight], 1b \n\t"
+ : [pDst]"+&r"((unsigned char *)pDst), [pSrcA]"+&r"((unsigned char *)pSrcA),
+ [pSrcB]"+&r"((unsigned char *)pSrcB), [iHeight]"+&r"((int)iHeight)
+ : [iDstStride]"r"((int)iDstStride), [iSrcAStride]"r"((int)iSrcAStride),
+ [iSrcBStride]"r"((int)iSrcBStride)
+ : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6"
+ );
+}
+
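+//the remaining quarter-pel positions are built, per the H.264 spec, by
+//averaging two integer/half-pel results staged through 16-byte-stride stack
+//buffers: e.g. (0, 1) averages the full-pel block with the vertical
+//half-pel block, and (1, 1) averages the horizontal and vertical half-pel
+//blocks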
+static inline void McHorVer01_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+ int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
+ if (iWidth == 16) {
+ McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
+ PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+ } else if (iWidth == 8) {
+ McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
+ PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+ } else {
+ McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
+ PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+ }
+}
+
+static inline void McHorVer03_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+ int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
+ if (iWidth == 16) {
+ McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
+ PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
+ } else if (iWidth == 8) {
+ McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
+ PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
+ } else {
+ McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight);
+ PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight);
+ }
+}
+
+static inline void McHorVer10_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+ int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16);
+ if (iWidth == 16) {
+ McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
+ PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+ } else if (iWidth == 8) {
+ McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
+ PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+ } else {
+ McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pTmp, 16, iHeight);
+ PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+ }
+}
+
+static inline void McHorVer11_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+ int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+ if (iWidth == 16) {
+ McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+ McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+ PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+ } else if (iWidth == 8) {
+ McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+ McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+ PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+ } else {
+ McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+ McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
+ PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+ }
+}
+
+static inline void McHorVer12_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+ int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+ if (iWidth == 16) {
+ McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+ McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+ PixelAvgWidthEq16_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+ } else if (iWidth == 8) {
+ McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+ McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+ PixelAvgWidthEq8_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+ } else {
+ McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
+ McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
+ PixelAvgWidthEq4_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+ }
+}
+
+static inline void McHorVer13_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+ int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+ if (iWidth == 16) {
+ McHorVer20WidthEq16_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+ McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+ PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+ } else if (iWidth == 8) {
+ McHorVer20WidthEq8_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+ McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight);
+ PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+ } else {
+ McHorVer20WidthEq4_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+ McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
+ PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+ }
+}
+
+static inline void McHorVer21_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+ int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+ if (iWidth == 16) {
+ McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+ McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+ PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+ } else if (iWidth == 8) {
+ McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+ McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+ PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+ } else {
+ McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+ McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
+ PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+ }
+}
+
+static inline void McHorVer23_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+ int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+ if (iWidth == 16) {
+ McHorVer20WidthEq16_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+ McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+ PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+ } else if (iWidth == 8) {
+ McHorVer20WidthEq8_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+ McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+ PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+ } else {
+ McHorVer20WidthEq4_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+ McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
+ PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+ }
+}
+
+static inline void McHorVer30_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+ int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+ if (iWidth == 16) {
+ McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+ PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
+ } else if (iWidth == 8) {
+ McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+ PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
+ } else {
+ McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+ PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight);
+ }
+}
+
+static inline void McHorVer31_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+ int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+ if (iWidth == 16) {
+ McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+ McHorVer02WidthEq16_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+ PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+ } else if (iWidth == 8) {
+ McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+ McHorVer02WidthEq8_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+ PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+ } else {
+ McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight);
+ McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
+ PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+ }
+}
+
+static inline void McHorVer32_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+ int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16);
+ if (iWidth == 16) {
+ McHorVer02WidthEq16_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+ McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+ PixelAvgWidthEq16_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+ } else if (iWidth == 8) {
+ McHorVer02WidthEq8_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+ McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+ PixelAvgWidthEq8_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+ } else {
+ McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
+ McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
+ PixelAvgWidthEq4_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+ }
+}
+
+static inline void McHorVer33_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
+ int32_t iDstStride, int32_t iWidth, int32_t iHeight) {
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16);
+ ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16);
+ if (iWidth == 16) {
+ McHorVer20WidthEq16_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+ McHorVer02WidthEq16_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+ PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+ } else if (iWidth == 8) {
+ McHorVer20WidthEq8_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+ McHorVer02WidthEq8_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight);
+ PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+ } else {
+ McHorVer20WidthEq4_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+ McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight);
+ PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+ }
+}
+
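+//quarter-pel dispatch: the low two bits of each MV component (its
+//fractional part in quarter-pel units) select one of the 16 kernels above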
+void McLuma_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+ int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
+ static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y]
+ {McCopy_mmi, McHorVer01_mmi, McHorVer02_mmi, McHorVer03_mmi},
+ {McHorVer10_mmi, McHorVer11_mmi, McHorVer12_mmi, McHorVer13_mmi},
+ {McHorVer20_mmi, McHorVer21_mmi, McHorVer22_mmi, McHorVer23_mmi},
+ {McHorVer30_mmi, McHorVer31_mmi, McHorVer32_mmi, McHorVer33_mmi},
+ };
+
+ pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+}
+
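+//iWidth is either 8 or 16 here, so iWidth >> 4 indexes the table below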
+void PixelAvg_mmi(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+ const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
+ static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = {
+ PixelAvgWidthEq8_mmi,
+ PixelAvgWidthEq16_mmi
+ };
+ kpfFuncs[iWidth >> 4] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight);
+}
+#endif//HAVE_MMI
} // anon ns.
void WelsCommon::InitMcFunc (SMcFunc* pMcFuncs, uint32_t uiCpuFlag) {
@@ -1716,4 +4252,15 @@
pMcFuncs->pfLumaHalfpelCen = McHorVer22Width5Or9Or17Height5Or9Or17_AArch64_neon;//iWidth+1/height+1
}
#endif
+
+#if defined(HAVE_MMI)
+ if (uiCpuFlag & WELS_CPU_MMI) {
+ pMcFuncs->pfLumaHalfpelHor = McHorVer20Width5Or9Or17_mmi;
+ pMcFuncs->pfLumaHalfpelVer = McHorVer02Height5Or9Or17_mmi;
+ pMcFuncs->pfLumaHalfpelCen = McHorVer22Width5Or9Or17Height5Or9Or17_mmi;
+ pMcFuncs->pfSampleAveraging = PixelAvg_mmi;
+ pMcFuncs->pMcChromaFunc = McChroma_mmi;
+ pMcFuncs->pMcLumaFunc = McLuma_mmi;
+ }
+#endif//HAVE_MMI
}
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -66,10 +66,26 @@
endif
OBJS += $(COMMON_OBJSARM64)
+COMMON_ASM_MIPS_SRCS=\
+ $(COMMON_SRCDIR)/mips/copy_mb_mmi.c\
+ $(COMMON_SRCDIR)/mips/deblock_mmi.c\
+ $(COMMON_SRCDIR)/mips/expand_picture_mmi.c\
+ $(COMMON_SRCDIR)/mips/intra_pred_com_mmi.c\
+ $(COMMON_SRCDIR)/mips/satd_sad_mmi.c\
+
+COMMON_OBJSMIPS += $(COMMON_ASM_MIPS_SRCS:.c=.$(OBJ))
+ifeq ($(ASM_ARCH), mips)
+COMMON_OBJS += $(COMMON_OBJSMIPS)
+endif
+OBJS += $(COMMON_OBJSMIPS)
+
OBJS += $(COMMON_OBJS)
$(COMMON_SRCDIR)/%.$(OBJ): $(COMMON_SRCDIR)/%.cpp
$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(COMMON_CFLAGS) $(COMMON_INCLUDES) -c $(CXX_O) $<
+
+$(COMMON_SRCDIR)/%.$(OBJ): $(COMMON_SRCDIR)/%.c
+ $(QUIET_CC)$(CC) $(CFLAGS) $(INCLUDES) $(COMMON_CFLAGS) $(COMMON_INCLUDES) -c $(CXX_O) $<
$(COMMON_SRCDIR)/%.$(OBJ): $(COMMON_SRCDIR)/%.asm
$(QUIET_ASM)$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(COMMON_ASMFLAGS) $(COMMON_ASM_INCLUDES) -o $@ $<
--- a/codec/decoder/core/inc/decode_mb_aux.h
+++ b/codec/decoder/core/inc/decode_mb_aux.h
@@ -63,6 +63,10 @@
#endif
+#if defined(HAVE_MMI)
+void IdctResAddPred_mmi (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
+#endif//HAVE_MMI
+
#if defined(__cplusplus)
}
#endif//__cplusplus
--- a/codec/decoder/core/inc/get_intra_predictor.h
+++ b/codec/decoder/core/inc/get_intra_predictor.h
@@ -166,6 +166,20 @@
void WelsDecoderIChromaPredPlane_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
void WelsDecoderIChromaPredDcTop_AArch64_neon (uint8_t* pPred, const int32_t kiStride);
#endif//HAVE_NEON_AARCH64
+
+#if defined(HAVE_MMI)
+void WelsDecoderI16x16LumaPredDc_mmi (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredPlane_mmi (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredH_mmi (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredV_mmi (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredDcTop_mmi (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI16x16LumaPredDcNA_mmi (uint8_t* pPred, const int32_t kiStride);
+
+void WelsDecoderIChromaPredDcTop_mmi (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderIChromaPredPlane_mmi (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderIChromaPredDc_mmi (uint8_t* pPred, const int32_t kiStride);
+void WelsDecoderI4x4LumaPredH_mmi (uint8_t* pPred, const int32_t kiStride);
+#endif//HAVE_MMI
#if defined(__cplusplus)
}
#endif//__cplusplus
--- /dev/null
+++ b/codec/decoder/core/mips/dct_mmi.c
@@ -1,0 +1,786 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2018, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file dct_mmi.c
+ *
+ * \brief Loongson optimization
+ *
+ * \date 17/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
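+/* Step down two rows and accumulate the two left-neighbour pixels
+ * (pPred[-1] on each of the next two rows) into $8. */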
+#define LOAD_2_LEFT_AND_ADD \
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t" \
+ "lbu $9, -0x1(%[pPred]) \n\t" \
+ PTR_ADDU "$8, $8, $9 \n\t" \
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t" \
+ "lbu $9, -0x1(%[pPred]) \n\t" \
+ PTR_ADDU "$8, $8, $9 \n\t"
+
+unsigned char mmi_dc_0x80[16] __attribute__((aligned(16))) = {
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+
+short mmi_wd_0x02[8] __attribute__((aligned(16))) = {2, 2, 2, 2, 2, 2, 2, 2};
+short mmi_plane_inc_minus[8] __attribute__((aligned(16))) = {-7, -6, -5, -4, -3, -2, -1, 0};
+short mmi_plane_inc[8] __attribute__((aligned(16))) = {1, 2, 3, 4, 5, 6, 7, 8};
+short mmi_plane_dec[8] __attribute__((aligned(16))) = {8, 7, 6, 5, 4, 3, 2, 1};
+
+short mmi_plane_inc_c[4] __attribute__((aligned(16))) = {1, 2, 3, 4};
+short mmi_plane_dec_c[4] __attribute__((aligned(16))) = {4, 3, 2, 1};
+short mmi_plane_mul_b_c[8] __attribute__((aligned(16))) = {-3, -2, -1, 0, 1, 2, 3, 4};
+
+unsigned char mmi_01bytes[16] __attribute__((aligned(16))) = {1, 1, 1, 1, 1, 1, 1, 1,
+                                                              1, 1, 1, 1, 1, 1, 1, 1};
+
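+/* 4x4 inverse integer transform: two transpose + IDCT passes over the
+ * residual in pRs, then each result is rounded ((x + 32) >> 6) and added
+ * to the prediction in place. */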
+void IdctResAddPred_mmi(uint8_t *pPred, const int32_t kiStride, int16_t *pRs) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "dli $8, 0x1 \n\t"
+ "gsldxc1 $f0, 0x0(%[pRs], $0) \n\t"
+ "gsldxc1 $f2, 0x8(%[pRs], $0) \n\t"
+ "gsldxc1 $f4, 0x10(%[pRs], $0) \n\t"
+ "gsldxc1 $f6, 0x18(%[pRs], $0) \n\t"
+ "dmtc1 $8, $f14 \n\t"
+
+ MMI_Trans4x4H_SINGLE($f0, $f2, $f4, $f6, $f8)
+ MMI_IDCT_SINGLE($f2, $f4, $f6, $f8, $f0, $f12, $f14)
+ MMI_Trans4x4H_SINGLE($f2, $f6, $f0, $f8, $f4)
+ MMI_IDCT_SINGLE($f6, $f0, $f8, $f4, $f2, $f12, $f14)
+
+ "dli $8, 0x20 \n\t"
+ "xor $f14, $f14, $f14 \n\t"
+ "dmtc1 $8, $f12 \n\t"
+ "pshufh $f12, $f12, $f14 \n\t"
+ "dli $8, 0x6 \n\t"
+ "dmtc1 $8, $f16 \n\t"
+
+ MMI_StoreDiff4P_SINGLE($f6, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ MMI_StoreDiff4P_SINGLE($f8, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ MMI_StoreDiff4P_SINGLE($f2, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ MMI_StoreDiff4P_SINGLE($f4, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
+ : [pPred]"+&r"((unsigned char *)pPred)
+ : [pRs]"r"((unsigned char *)pRs), [kiStride]"r"((int)kiStride)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16"
+ );
+}
+
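+/* 16x16 DC prediction: sum the 16 top and 16 left neighbours, compute
+ * (sum + 16) >> 5 and broadcast that byte over the whole macroblock. */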
+void WelsDecoderI16x16LumaPredDc_mmi(uint8_t *pPred, const int32_t kiStride) {
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "dli $8, 0x5 \n\t"
+ "gsldxc1 $f10, 0x0(%[mmi_01bytes], $0) \n\t"
+ "dmtc1 $8, $f8 \n\t"
+
+ "move $10, %[pPred] \n\t"
+ PTR_SUBU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "pasubub $f0, $f0, $f4 \n\t"
+ "pasubub $f2, $f2, $f4 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f0, $f0, $f2 \n\t"
+
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "lbu $8, -0x1(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pPred]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+
+ PTR_ADDIU "$8, $8, 0x10 \n\t"
+ "dmtc1 $8, $f4 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "psrlw $f0, $f0, $f8 \n\t"
+ "pmuluw $f0, $f0, $f10 \n\t"
+ "punpcklwd $f0, $f0, $f0 \n\t"
+ "mov.d $f2, $f0 \n\t"
+
+ "gssqc1 $f2, $f0, 0x0($10) \n\t"
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0($10) \n\t"
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0($10) \n\t"
+
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0($10) \n\t"
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0($10) \n\t"
+
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0($10) \n\t"
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0($10) \n\t"
+
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0($10) \n\t"
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0($10) \n\t"
+
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0($10) \n\t"
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0($10) \n\t"
+
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0($10) \n\t"
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0($10) \n\t"
+
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0($10) \n\t"
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0($10) \n\t"
+
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0($10) \n\t"
+ : [pPred] "+&r"((unsigned char *)pPred)
+ : [kiStride] "r"((int)kiStride),
+ [mmi_01bytes] "r"((unsigned char *)mmi_01bytes)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10"
+ );
+}
+
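+/* 16x16 plane prediction: the gradients come from weighted sums of the
+ * top row (b = (5*H + 32) >> 6, H = sum of i*(top[7+i] - top[7-i])) and of
+ * the left column (c, likewise), with a = 16*(top[15] + left[15]); each
+ * pixel is (a + b*(x-7) + c*(y-7) + 16) >> 5, clipped to a byte by
+ * packushb. */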
+void WelsDecoderI16x16LumaPredPlane_mmi(uint8_t *pPred, const int32_t kiStride) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "move $10, %[pPred] \n\t"
+ PTR_ADDIU "%[pPred], %[pPred], -0x1 \n\t"
+ PTR_SUBU "%[pPred], %[pPred], %[kiStride] \n\t"
+
+ "gsldlc1 $f0, 0x7(%[pPred]) \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "gsldrc1 $f0, 0x0(%[pPred]) \n\t"
+ "gslqc1 $f22, $f20, 0x0(%[mmi_plane_dec]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "pmullh $f0, $f0, $f20 \n\t"
+ "gsldlc1 $f4, 0x10(%[pPred]) \n\t"
+ "pmullh $f2, $f2, $f22 \n\t"
+ "gsldrc1 $f4, 0x9(%[pPred]) \n\t"
+ "gslqc1 $f26, $f24, 0x0(%[mmi_plane_inc]) \n\t"
+ "punpckhbh $f6, $f4, $f28 \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "pmullh $f4, $f4, $f24 \n\t"
+ "pmullh $f6, $f6, $f26 \n\t"
+ "psubh $f4, $f4, $f0 \n\t"
+ "psubh $f6, $f6, $f2 \n\t"
+
+ SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
+ "dmfc1 $8, $f4 \n\t"
+ "seh $8, $8 \n\t"
+ "mul $8, $8, 0x5 \n\t"
+ PTR_ADDIU "$8, $8, 0x20 \n\t"
+ "sra $8, $8, 0x6 \n\t"
+ MMI_Copy8Times($f4, $f6, $f28, $8)
+
+ "lbu $9, 0x10(%[pPred]) \n\t"
+ PTR_ADDIU "%[pPred], %[pPred], -0x3 \n\t"
+ LOAD_COLUMN($f0, $f2, $f8, $f10, $f12, $f14, $f16, $f18, %[pPred],
+ %[kiStride], $11)
+
+ PTR_ADDIU "%[pPred], %[pPred], 0x3 \n\t"
+ "dsll $11, %[kiStride], 0x3 \n\t"
+ PTR_ADDU "$11, $11, %[pPred] \n\t"
+ "lbu $8, 0x0($11) \n\t"
+ PTR_ADDU "$9, $9, $8 \n\t"
+ "dsll $9, $9, 0x4 \n\t"
+
+ PTR_ADDIU "%[pPred], %[pPred], -0x3 \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ LOAD_COLUMN($f28, $f30, $f8, $f10, $f12, $f14, $f16, $f18, %[pPred],
+ %[kiStride], $11)
+
+ "xor $f16, $f16, $f16 \n\t"
+ "punpcklbh $f0, $f2, $f16 \n\t"
+ "punpckhbh $f2, $f2, $f16 \n\t"
+ "pmullh $f0, $f0, $f20 \n\t"
+ "pmullh $f2, $f2, $f22 \n\t"
+ "punpcklbh $f28, $f30, $f16 \n\t"
+ "punpckhbh $f30, $f30, $f16 \n\t"
+ "pmullh $f28, $f28, $f24 \n\t"
+ "pmullh $f30, $f30, $f26 \n\t"
+ "psubh $f28, $f28, $f0 \n\t"
+ "psubh $f30, $f30, $f2 \n\t"
+
+ "xor $f8, $f8, $f8 \n\t"
+
+ SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
+ "dmfc1 $8, $f28 \n\t"
+ "seh $8, $8 \n\t"
+
+ "mul $8, $8, 0x5 \n\t"
+ PTR_ADDIU "$8, $8, 0x20 \n\t"
+ "sra $8, $8, 0x6 \n\t"
+ MMI_Copy8Times($f16, $f18, $f8, $8)
+
+ "move %[pPred], $10 \n\t"
+ PTR_ADDIU "$9, $9, 0x10 \n\t"
+ "mul $8, $8, -0x7 \n\t"
+ PTR_ADDU "$9, $9, $8 \n\t"
+ MMI_Copy8Times($f0, $f2, $f8, $9)
+
+ "xor $8, $8, $8 \n\t"
+ "gslqc1 $f22, $f20, 0x0(%[mmi_plane_inc_minus]) \n\t"
+
+ "dli $11, 0x5 \n\t"
+ "dmtc1 $11, $f30 \n\t"
+ "1: \n\t"
+ "pmullh $f8, $f4, $f20 \n\t"
+ "pmullh $f10, $f6, $f22 \n\t"
+ "paddh $f8, $f8, $f0 \n\t"
+ "paddh $f10, $f10, $f2 \n\t"
+ "psrah $f8, $f8, $f30 \n\t"
+ "psrah $f10, $f10, $f30 \n\t"
+ "pmullh $f12, $f4, $f24 \n\t"
+ "pmullh $f14, $f6, $f26 \n\t"
+ "paddh $f12, $f12, $f0 \n\t"
+ "paddh $f14, $f14, $f2 \n\t"
+ "psrah $f12, $f12, $f30 \n\t"
+ "psrah $f14, $f14, $f30 \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f12, $f14 \n\t"
+ "gssqc1 $f10, $f8, 0x0(%[pPred]) \n\t"
+ "paddh $f0, $f0, $f16 \n\t"
+ "paddh $f2, $f2, $f18 \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ PTR_ADDIU "$8, $8, 0x1 \n\t"
+ PTR_ADDIU "$11, $8, -0x10 \n\t"
+ "bnez $11, 1b \n\t"
+ "nop \n\t"
+ : [pPred]"+&r"((unsigned char *)pPred)
+ : [kiStride]"r"((int)kiStride), [mmi_plane_inc_minus]"r"(mmi_plane_inc_minus),
+ [mmi_plane_inc]"r"(mmi_plane_inc), [mmi_plane_dec]"r"(mmi_plane_dec)
+ : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
+ "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
+ "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
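+/* Broadcast the left neighbour r0[-1] across a 16-byte row: the 56-bit
+ * shift (count in f4) isolates it in the low byte and the multiply by the
+ * 0x01..01 constant (f6) replicates it. */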
+#define COPY_16_TIMES(r0, f0, f2, f4, f6, f8) \
+ "gslqc1 "#f2", "#f0", -0x10("#r0") \n\t" \
+ "dsrl "#f0", "#f2", "#f4" \n\t" \
+ "pmuluw "#f0", "#f0", "#f6" \n\t" \
+ "punpcklwd "#f0", "#f0", "#f0" \n\t" \
+ "mov.d "#f2", "#f0" \n\t"
+
+#define MMI_PRED_H_16X16_TWO_LINE_DEC \
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t" \
+ COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8) \
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t" \
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t" \
+ COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8) \
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+
+void WelsDecoderI16x16LumaPredH_mmi(uint8_t *pPred, const int32_t kiStride) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "dli $8, 56 \n\t"
+ "dmtc1 $8, $f4 \n\t"
+ "gsldxc1 $f6, 0x0(%[mmi_01bytes], $0) \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+
+ COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8)
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8)
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+
+ MMI_PRED_H_16X16_TWO_LINE_DEC
+ MMI_PRED_H_16X16_TWO_LINE_DEC
+ MMI_PRED_H_16X16_TWO_LINE_DEC
+ MMI_PRED_H_16X16_TWO_LINE_DEC
+ MMI_PRED_H_16X16_TWO_LINE_DEC
+ MMI_PRED_H_16X16_TWO_LINE_DEC
+ MMI_PRED_H_16X16_TWO_LINE_DEC
+ : [pPred]"+&r"((unsigned char *)pPred)
+ : [kiStride]"r"((int)kiStride),
+ [mmi_01bytes]"r"((unsigned char *)mmi_01bytes)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8"
+ );
+}
+
+void WelsDecoderI16x16LumaPredV_mmi(uint8_t *pPred, const int32_t kiStride) {
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ PTR_SUBU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ : [pPred] "+&r"((unsigned char *)pPred)
+ : [kiStride] "r"((int)kiStride)
+ : "memory", "$f0", "$f2"
+ );
+}
+
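+/* Only the top row is available: DC is (sum of 16 top neighbours + 8) >> 4. */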
+void WelsDecoderI16x16LumaPredDcTop_mmi(uint8_t *pPred, const int32_t kiStride) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ PTR_SUBU "$8, %[pPred], %[kiStride] \n\t"
+ "gslqc1 $f2, $f0, 0x0($8) \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "pasubub $f0, $f0, $f28 \n\t"
+ "pasubub $f2, $f2, $f28 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f0, $f0, $f2 \n\t"
+ "dmfc1 $8, $f0 \n\t"
+
+ PTR_ADDIU "$8, $8, 0x8 \n\t"
+ "dsra $8, $8, 0x4 \n\t"
+ MMI_Copy16Times($f4, $f6, $f28, $8)
+ "mov.d $f0, $f4 \n\t"
+ "mov.d $f2, $f6 \n\t"
+
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
+
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
+
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
+
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
+ : [pPred]"+&r"((unsigned char *)pPred)
+ : [kiStride]"r"((int)kiStride)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6"
+ );
+ RECOVER_REG;
+}
+
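+/* No neighbours available: fill the macroblock with the DC default 0x80. */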
+void WelsDecoderI16x16LumaPredDcNA_mmi(uint8_t *pPred, const int32_t kiStride) {
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[mmi_dc_0x80]) \n\t"
+ "mov.d $f4, $f0 \n\t"
+ "mov.d $f6, $f2 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
+
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
+
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
+
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
+ : [pPred] "+&r"((unsigned char *)pPred)
+ : [kiStride] "r"((int)kiStride), [mmi_dc_0x80] "r"(mmi_dc_0x80)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6"
+ );
+}
+
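+/* 8x8 chroma plane prediction; same scheme as the 16x16 luma version but
+ * with 4-element weight vectors and b = (17*H + 16) >> 5 (c likewise). */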
+void WelsDecoderIChromaPredPlane_mmi(uint8_t *pPred, const int32_t kiStride) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "move $10, %[pPred] \n\t"
+ PTR_ADDIU "%[pPred], %[pPred], -0x1 \n\t"
+ PTR_SUBU "%[pPred], %[pPred], %[kiStride] \n\t"
+
+ "gsldlc1 $f0, 0x7(%[pPred]) \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "gsldrc1 $f0, 0x0(%[pPred]) \n\t"
+ "gsldxc1 $f20, 0x0(%[mmi_plane_dec_c], $0) \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "gsldlc1 $f4, 0xc(%[pPred]) \n\t"
+ "pmullh $f0, $f0, $f20 \n\t"
+ "gsldrc1 $f4, 0x5(%[pPred]) \n\t"
+ "gsldxc1 $f24, 0x0(%[mmi_plane_inc_c], $0) \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "pmullh $f4, $f4, $f24 \n\t"
+ "psubh $f4, $f4, $f0 \n\t"
+
+ "xor $f6, $f6, $f6 \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+ SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
+ "dmfc1 $8, $f4 \n\t"
+ "seh $8, $8 \n\t"
+ "mul $8, $8, 0x11 \n\t"
+ PTR_ADDIU "$8, $8, 0x10 \n\t"
+ "sra $8, $8, 0x5 \n\t"
+ MMI_Copy8Times($f4, $f6, $f8, $8)
+
+ "lbu $9, 0x8(%[pPred]) \n\t"
+ PTR_ADDIU "%[pPred], %[pPred], -0x3 \n\t"
+ LOAD_COLUMN_C($f0, $f8, $f12, $f16, %[pPred], %[kiStride], $11)
+
+ PTR_ADDIU "%[pPred], %[pPred], 0x3 \n\t"
+ "dsll $11, %[kiStride], 0x2 \n\t"
+ PTR_ADDU "$11, $11, %[pPred] \n\t"
+ "lbu $8, 0x0($11) \n\t"
+ PTR_ADDU "$9, $9, $8 \n\t"
+ "dsll $9, $9, 0x4 \n\t"
+
+ PTR_ADDIU "%[pPred], %[pPred], -0x3 \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ LOAD_COLUMN_C($f28, $f8, $f12, $f16, %[pPred], %[kiStride], $11)
+ "xor $f16, $f16, $f16 \n\t"
+ "punpckhbh $f0, $f0, $f16 \n\t"
+ "pmullh $f0, $f0, $f20 \n\t"
+ "punpckhbh $f28, $f28, $f16 \n\t"
+ "pmullh $f28, $f28, $f24 \n\t"
+ "psubh $f28, $f28, $f0 \n\t"
+
+ "xor $f30, $f30, $f30 \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+ SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
+ "dmfc1 $8, $f28 \n\t"
+ "seh $8, $8 \n\t"
+
+ "mul $8, $8, 0x11 \n\t"
+ PTR_ADDIU "$8, $8, 0x10 \n\t"
+ "sra $8, $8, 0x5 \n\t"
+ MMI_Copy8Times($f16, $f18, $f8, $8)
+
+ "move %[pPred], $10 \n\t"
+ PTR_ADDIU "$9, $9, 0x10 \n\t"
+ "mul $8, $8, -0x3 \n\t"
+ PTR_ADDU "$9, $9, $8 \n\t"
+ MMI_Copy8Times($f0, $f2, $f8, $9)
+
+ "xor $8, $8, $8 \n\t"
+ "gslqc1 $f22, $f20, 0x0(%[mmi_plane_mul_b_c]) \n\t"
+
+ "dli $11, 0x5 \n\t"
+ "dmtc1 $11, $f30 \n\t"
+ "1: \n\t"
+ "pmullh $f8, $f4, $f20 \n\t"
+ "pmullh $f10, $f6, $f22 \n\t"
+ "paddh $f8, $f8, $f0 \n\t"
+ "paddh $f10, $f10, $f2 \n\t"
+ "psrah $f8, $f8, $f30 \n\t"
+ "psrah $f10, $f10, $f30 \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "gssdxc1 $f8, 0x0(%[pPred], $0) \n\t"
+ "paddh $f0, $f0, $f16 \n\t"
+ "paddh $f2, $f2, $f18 \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ PTR_ADDIU "$8, $8, 0x1 \n\t"
+ PTR_ADDIU "$11, $8, -0x8 \n\t"
+ "bnez $11, 1b \n\t"
+ "nop \n\t"
+ : [pPred]"+&r"((unsigned char *)pPred)
+ : [kiStride]"r"((int)kiStride), [mmi_plane_mul_b_c]"r"(mmi_plane_mul_b_c),
+ [mmi_plane_inc_c]"r"(mmi_plane_inc_c), [mmi_plane_dec_c]"r"(mmi_plane_dec_c)
+ : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+ "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
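+/* 8x8 chroma DC prediction: each 4x4 quadrant gets its own DC. The
+ * top-left and bottom-right quadrants combine top and left sums
+ * ((sumT + sumL + 4) >> 3); top-right uses its top sum and bottom-left its
+ * left sum ((sum + 2) >> 2). */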
+void WelsDecoderIChromaPredDc_mmi(uint8_t *pPred, const int32_t kiStride) {
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "move $10, %[pPred] \n\t"
+
+ PTR_SUBU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gsldxc1 $f0, 0x0(%[pPred], $0) \n\t"
+
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "lbu $8, -0x1(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pPred]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pPred]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pPred]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ "dmtc1 $8, $f2 \n\t"
+
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "lbu $8, -0x1(%[pPred]) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pPred]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pPred]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pPred]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ "dmtc1 $8, $f4 \n\t"
+
+ "xor $f8, $f8, $f8 \n\t"
+ "punpcklwd $f6, $f0, $f8 \n\t"
+ "punpckhwd $f0, $f0, $f8 \n\t"
+ "pasubub $f0, $f0, $f8 \n\t"
+ "pasubub $f6, $f6, $f8 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f6, $f6 \n\t"
+
+ "paddd $f6, $f6, $f2 \n\t"
+ "paddd $f2, $f4, $f0 \n\t"
+
+ "dli $8, 0x2 \n\t"
+ "dmtc1 $8, $f8 \n\t"
+ "gsldxc1 $f12, 0x0(%[mmi_01bytes], $0) \n\t"
+ "dli $8, 0x3 \n\t"
+ "dmtc1 $8, $f10 \n\t"
+
+ "paddd $f0, $f0, $f8 \n\t"
+ "dsrl $f0, $f0, $f8 \n\t"
+
+ "paddd $f4, $f4, $f8 \n\t"
+ "dsrl $f4, $f4, $f8 \n\t"
+
+ "paddd $f6, $f6, $f8 \n\t"
+ "paddd $f6, $f6, $f8 \n\t"
+ "dsrl $f6, $f6, $f10 \n\t"
+
+ "paddd $f2, $f2, $f8 \n\t"
+ "paddd $f2, $f2, $f8 \n\t"
+ "dsrl $f2, $f2, $f10 \n\t"
+
+ "dli $8, 0x20 \n\t"
+ "dmtc1 $8, $f8 \n\t"
+ "pmuluw $f0, $f0, $f12 \n\t"
+ "pmuluw $f6, $f6, $f12 \n\t"
+ "dsll $f0, $f0, $f8 \n\t"
+ "xor $f0, $f0, $f6 \n\t"
+
+ "pmuluw $f4, $f4, $f12 \n\t"
+ "pmuluw $f2, $f2, $f12 \n\t"
+ "dsll $f2, $f2, $f8 \n\t"
+ "xor $f2, $f2, $f4 \n\t"
+
+ "gssdxc1 $f0, 0x0($10, $0) \n\t"
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssdxc1 $f0, 0x0($10, $0) \n\t"
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssdxc1 $f0, 0x0($10, $0) \n\t"
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssdxc1 $f0, 0x0($10, $0) \n\t"
+
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssdxc1 $f2, 0x0($10, $0) \n\t"
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssdxc1 $f2, 0x0($10, $0) \n\t"
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssdxc1 $f2, 0x0($10, $0) \n\t"
+ PTR_ADDU "$10, $10, %[kiStride] \n\t"
+ "gssdxc1 $f2, 0x0($10, $0) \n\t"
+ : [pPred] "+&r"((unsigned char *)pPred)
+ : [kiStride] "r"((int)kiStride),
+ [mmi_01bytes] "r"((unsigned char *)mmi_01bytes)
+ : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+ "$f12"
+ );
+}
+
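+/* Chroma DC with only the top row available: each group of four top
+ * neighbours yields (sum + 2) >> 2, broadcast down all eight rows. */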
+void WelsDecoderIChromaPredDcTop_mmi(uint8_t *pPred, const int32_t kiStride) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "dli $8, 0x4e \n\t"
+ "dmtc1 $8, $f16 \n\t"
+ "dli $8, 0xb1 \n\t"
+ "dmtc1 $8, $f18 \n\t"
+ "dli $8, 0x2 \n\t"
+ "dmtc1 $8, $f20 \n\t"
+ PTR_SUBU "$8, %[pPred], %[kiStride] \n\t"
+ "gsldxc1 $f0, 0x0($8, $0) \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "pshufh $f4, $f0, $f16 \n\t"
+ "pshufh $f6, $f2, $f16 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+
+ "pshufh $f8, $f0, $f18 \n\t"
+ "pshufh $f14, $f2, $f18 \n\t"
+ "paddh $f2, $f2, $f14 \n\t"
+ "paddh $f0, $f0, $f8 \n\t"
+
+ "gslqc1 $f26, $f24, 0x0(%[mmi_wd_0x02]) \n\t"
+ "paddh $f0, $f0, $f24 \n\t"
+ "paddh $f2, $f2, $f26 \n\t"
+ "psrah $f0, $f0, $f20 \n\t"
+ "psrah $f2, $f2, $f20 \n\t"
+ "packushb $f0, $f0, $f2 \n\t"
+
+ "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
+
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
+ "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
+ : [pPred] "+&r"((unsigned char *)pPred)
+ : [kiStride] "r"((int)kiStride), [mmi_wd_0x02] "r"((short *)mmi_wd_0x02)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6"
+ );
+ RECOVER_REG;
+}
+
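+/* 4x4 horizontal prediction: each of the four left neighbours is
+ * replicated across its row (multiply by 0x01010101) and stored as one
+ * 32-bit word. */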
+void WelsDecoderI4x4LumaPredH_mmi(uint8_t *pPred, const int32_t kiStride) {
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "gsldxc1 $f8, 0x0(%[mmi_01bytes], $0) \n\t"
+ "lbu $8, -0x1(%[pPred]) \n\t"
+ "dmtc1 $8, $f0 \n\t"
+ "pmuluw $f0, $f0, $f8 \n\t"
+
+ PTR_ADDU "$9, %[pPred], %[kiStride] \n\t"
+ "lbu $8, -0x1($9) \n\t"
+ "dmtc1 $8, $f2 \n\t"
+ "pmuluw $f2, $f2, $f8 \n\t"
+
+ PTR_ADDU "$10, $9, %[kiStride] \n\t"
+ "lbu $8, -0x1($10) \n\t"
+ "dmtc1 $8, $f4 \n\t"
+ "pmuluw $f4, $f4, $f8 \n\t"
+
+ PTR_ADDU "$11, $10, %[kiStride] \n\t"
+ "lbu $8, -0x1($11) \n\t"
+ "dmtc1 $8, $f6 \n\t"
+ "pmuluw $f6, $f6, $f8 \n\t"
+
+ "gsswxc1 $f0, 0x0(%[pPred], $0) \n\t"
+ "gsswxc1 $f2, 0x0($9, $0) \n\t"
+ "gsswxc1 $f4, 0x0($10, $0) \n\t"
+ "gsswxc1 $f6, 0x0($11, $0) \n\t"
+ : [pPred] "+&r"((unsigned char *)pPred)
+ : [kiStride] "r"((int)kiStride),
+ [mmi_01bytes] "r"((unsigned char *)mmi_01bytes)
+ : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8"
+ );
+}
--- a/codec/decoder/core/src/deblocking.cpp
+++ b/codec/decoder/core/src/deblocking.cpp
@@ -1378,6 +1378,19 @@
pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_AArch64_neon;
}
#endif
+
+#if defined(HAVE_MMI)
+ if (iCpu & WELS_CPU_MMI) {
+ pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_mmi;
+ pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_mmi;
+ pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_mmi;
+ pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_mmi;
+ pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_mmi;
+ pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_mmi;
+ pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_mmi;
+ pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_mmi;
+ }
+#endif//HAVE_MMI
}
} // namespace WelsDec
--- a/codec/decoder/core/src/decoder.cpp
+++ b/codec/decoder/core/src/decoder.cpp
@@ -1023,6 +1023,24 @@
#endif
#endif
+
+#if defined(HAVE_MMI)
+ if (uiCpuFlag & WELS_CPU_MMI) {
+ pCtx->pIdctResAddPredFunc = IdctResAddPred_mmi;
+ pCtx->pIdctFourResAddPredFunc = IdctFourResAddPred_<IdctResAddPred_mmi>;
+
+ pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_mmi;
+ pCtx->pGetI16x16LumaPredFunc[I16_PRED_P] = WelsDecoderI16x16LumaPredPlane_mmi;
+ pCtx->pGetI16x16LumaPredFunc[I16_PRED_H] = WelsDecoderI16x16LumaPredH_mmi;
+ pCtx->pGetI16x16LumaPredFunc[I16_PRED_V] = WelsDecoderI16x16LumaPredV_mmi;
+ pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_T ] = WelsDecoderI16x16LumaPredDcTop_mmi;
+ pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_128] = WelsDecoderI16x16LumaPredDcNA_mmi;
+ pCtx->pGetIChromaPredFunc[C_PRED_P ] = WelsDecoderIChromaPredPlane_mmi;
+ pCtx->pGetIChromaPredFunc[C_PRED_DC] = WelsDecoderIChromaPredDc_mmi;
+ pCtx->pGetIChromaPredFunc[C_PRED_DC_T] = WelsDecoderIChromaPredDcTop_mmi;
+ pCtx->pGetI4x4LumaPredFunc[I4_PRED_H] = WelsDecoderI4x4LumaPredH_mmi;
+ }
+#endif//HAVE_MMI
}
//reset decoder number related statistics info
--- a/codec/decoder/targets.mk
+++ b/codec/decoder/targets.mk
@@ -56,10 +56,22 @@
endif
OBJS += $(DECODER_OBJSARM64)
+DECODER_ASM_MIPS_SRCS=\
+ $(DECODER_SRCDIR)/core/mips/dct_mmi.c\
+
+DECODER_OBJSMIPS += $(DECODER_ASM_MIPS_SRCS:.c=.$(OBJ))
+ifeq ($(ASM_ARCH), mips)
+DECODER_OBJS += $(DECODER_OBJSMIPS)
+endif
+OBJS += $(DECODER_OBJSMIPS)
+
OBJS += $(DECODER_OBJS)
$(DECODER_SRCDIR)/%.$(OBJ): $(DECODER_SRCDIR)/%.cpp
$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(DECODER_CFLAGS) $(DECODER_INCLUDES) -c $(CXX_O) $<
+
+$(DECODER_SRCDIR)/%.$(OBJ): $(DECODER_SRCDIR)/%.c
+ $(QUIET_CC)$(CC) $(CFLAGS) $(INCLUDES) $(DECODER_CFLAGS) $(DECODER_INCLUDES) -c $(CXX_O) $<
$(DECODER_SRCDIR)/%.$(OBJ): $(DECODER_SRCDIR)/%.asm
$(QUIET_ASM)$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $@ $<
--- a/codec/encoder/core/inc/decode_mb_aux.h
+++ b/codec/encoder/core/inc/decode_mb_aux.h
@@ -95,6 +95,11 @@
int16_t* pDctDc);
#endif
+#if defined(HAVE_MMI)
+void WelsIDctT4Rec_mmi (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
+void WelsIDctFourT4Rec_mmi (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
+void WelsIDctRecI16x16Dc_mmi (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDctDc);
+#endif//HAVE_MMI
#if defined(__cplusplus)
}
#endif//__cplusplus
--- a/codec/encoder/core/inc/encode_mb_aux.h
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -147,6 +147,33 @@
void WelsQuantFour4x4_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
void WelsQuantFour4x4Max_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
#endif
+
+#ifdef HAVE_MMI
+int32_t WelsGetNoneZeroCount_mmi (int16_t* pLevel);
+
+/****************************************************************************
+ * Scan and Score functions
+ ****************************************************************************/
+void WelsScan4x4Ac_mmi (int16_t* zig_value, int16_t* pDct);
+void WelsScan4x4DcAc_mmi (int16_t* pLevel, int16_t* pDct);
+int32_t WelsCalculateSingleCtr4x4_mmi (int16_t* pDct);
+
+/****************************************************************************
+ * DCT functions
+ ****************************************************************************/
+void WelsDctT4_mmi (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
+void WelsDctFourT4_mmi (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
+
+/****************************************************************************
+ * Hadamard and Quant functions
+ ****************************************************************************/
+void WelsHadamardT4Dc_mmi (int16_t* pLumaDc, int16_t* pDct);
+
+void WelsQuant4x4_mmi (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
+void WelsQuant4x4Dc_mmi (int16_t* pDct, int16_t iFF, int16_t iMF);
+void WelsQuantFour4x4_mmi (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
+void WelsQuantFour4x4Max_mmi (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
+#endif//HAVE_MMI
#if defined(__cplusplus)
}
#endif//__cplusplus
--- a/codec/encoder/core/inc/get_intra_predictor.h
+++ b/codec/encoder/core/inc/get_intra_predictor.h
@@ -153,6 +153,16 @@
void WelsIChromaPredPlane_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
void WelsIChromaPredDcTop_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
#endif//HAVE_NEON_AARCH64
+
+#if defined(HAVE_MMI)
+void WelsI16x16LumaPredDc_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredPlane_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+
+void WelsIChromaPredH_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsIChromaPredV_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsIChromaPredDc_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsIChromaPredPlane_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+#endif//HAVE_MMI
#if defined(__cplusplus)
}
#endif//__cplusplus
--- a/codec/encoder/core/inc/sample.h
+++ b/codec/encoder/core/inc/sample.h
@@ -124,6 +124,14 @@
int32_t WelsIntra4x4Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t,
int32_t);
#endif
+
+#if defined (HAVE_MMI)
+int32_t WelsSampleSatd8x8_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd16x8_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd8x16_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd16x16_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd4x4_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+#endif//HAVE_MMI
#if defined(__cplusplus)
}
#endif//__cplusplus
--- /dev/null
+++ b/codec/encoder/core/mips/dct_mmi.c
@@ -1,0 +1,529 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2018, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file dct_mmi.c
+ *
+ * \brief Loongson optimization
+ *
+ * \date 20/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
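+/* Load a 4x8 block of coefficients (two adjacent 4x4 blocks, 0x40 bytes)
+ * and swap 64-bit halves so each register pair holds matching rows of
+ * both blocks. */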
+#define MMI_Load4x8p(r0, f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
+ "gslqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ "gslqc1 "#f10", "#f8", 0x10("#r0") \n\t" \
+ "gslqc1 "#f18", "#f16", 0x20("#r0") \n\t" \
+ "gslqc1 "#f6", "#f4", 0x30("#r0") \n\t" \
+ MMI_XSawp_DQ(f8, f10, f4, f6, f12, f14) \
+ MMI_XSawp_DQ(f0, f2, f16, f18, f4, f6)
+
+#define MMI_SumSubDiv2(f0, f2, f4, f6, f8, f10, f12, f14, f16) \
+ "mov.d "#f8", "#f4" \n\t" \
+ "mov.d "#f10", "#f6" \n\t" \
+ "psrah "#f4", "#f4", "#f16" \n\t" \
+ "psrah "#f6", "#f6", "#f16" \n\t" \
+ "psrah "#f12", "#f0", "#f16" \n\t" \
+ "psrah "#f14", "#f2", "#f16" \n\t" \
+ "paddh "#f0", "#f0", "#f4" \n\t" \
+ "paddh "#f2", "#f2", "#f6" \n\t" \
+ "psubh "#f12", "#f12", "#f8" \n\t" \
+ "psubh "#f14", "#f14", "#f10" \n\t"
+
+#define MMI_IDCT(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22, f24, f26, f28) \
+ MMI_SumSub(f24, f26, f4, f6, f20, f22) \
+ MMI_SumSubDiv2(f0, f2, f8, f10, f16, f18, f12, f14, f28) \
+ MMI_SumSub(f4, f6, f0, f2, f16, f18) \
+ MMI_SumSub(f24, f26, f12, f14, f16, f18)
+
+#define MMI_StoreDiff8p_6(f0, f2, f4, f6, f8, f12, r0, r1, f14) \
+ "paddh "#f0", "#f0", "#f8" \n\t" \
+ "paddh "#f2", "#f2", "#f8" \n\t" \
+ "psrah "#f0", "#f0", "#f14" \n\t" \
+ "psrah "#f2", "#f2", "#f14" \n\t" \
+ "gsldlc1 "#f4", 0x7("#r1") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r1") \n\t" \
+ "punpckhbh "#f6", "#f4", "#f12" \n\t" \
+ "punpcklbh "#f4", "#f4", "#f12" \n\t" \
+ "paddsh "#f4", "#f4", "#f0" \n\t" \
+ "paddsh "#f6", "#f6", "#f2" \n\t" \
+ "packushb "#f4", "#f4", "#f6" \n\t" \
+ "gssdlc1 "#f4", 0x7("#r0") \n\t" \
+ "gssdrc1 "#f4", 0x0("#r0") \n\t"
+
+#define MMI_StoreDiff8p_5(f0, f2, f4, f6, f8, r0, r1, offset) \
+ "gsldlc1 "#f4", "#offset"+0x7("#r1") \n\t" \
+ "gsldrc1 "#f4", "#offset"+0x0("#r1") \n\t" \
+ "punpckhbh "#f6", "#f4", "#f8" \n\t" \
+ "punpcklbh "#f4", "#f4", "#f8" \n\t" \
+ "paddsh "#f4", "#f4", "#f0" \n\t" \
+ "paddsh "#f6", "#f6", "#f2" \n\t" \
+ "packushb "#f4", "#f4", "#f6" \n\t" \
+ "gssdlc1 "#f4", "#offset"+0x7("#r0") \n\t" \
+ "gssdrc1 "#f4", "#offset"+0x0("#r0") \n\t"
+
+#define MMI_Load8DC(f0, f2, f4, f6, f8, f10, f12, f14, f16, r0, offset, f20) \
+ "gslqc1 "#f2", "#f0", "#offset"+0x0("#r0") \n\t" \
+ "paddh "#f0", "#f0", "#f16" \n\t" \
+ "paddh "#f2", "#f2", "#f16" \n\t" \
+ "psrah "#f0", "#f0", "#f20" \n\t" \
+ "psrah "#f2", "#f2", "#f20" \n\t" \
+ "punpckhhw "#f4", "#f0", "#f0" \n\t" \
+ "punpckhwd "#f6", "#f4", "#f4" \n\t" \
+ "punpcklwd "#f4", "#f4", "#f4" \n\t" \
+ "punpcklhw "#f8", "#f2", "#f2" \n\t" \
+ "punpckhwd "#f10", "#f8", "#f8" \n\t" \
+ "punpcklwd "#f8", "#f8", "#f8" \n\t" \
+ "punpckhhw "#f12", "#f2", "#f2" \n\t" \
+ "punpckhwd "#f14", "#f12", "#f12" \n\t" \
+ "punpcklwd "#f12", "#f12", "#f12" \n\t" \
+ "punpcklhw "#f0", "#f0", "#f0" \n\t" \
+ "punpckhwd "#f2", "#f0", "#f0" \n\t" \
+ "punpcklwd "#f0", "#f0", "#f0" \n\t"
+
+#define MMI_StoreDiff4x8p(f0, f2, f4, f6, f8, f10, f12, r0, r1, r2, r3) \
+ MMI_StoreDiff8p_5(f0, f2, f8, f10, f12, r0, r1, 0x0) \
+ MMI_StoreDiff8p_5(f4, f6, f8, f10, f12, r0, r1, 0x8) \
+ PTR_ADDU ""#r0", "#r0", "#r2" \n\t" \
+ PTR_ADDU ""#r1", "#r1", "#r3" \n\t" \
+ MMI_StoreDiff8p_5(f0, f2, f8, f10, f12, r0, r1, 0x0) \
+ MMI_StoreDiff8p_5(f4, f6, f8, f10, f12, r0, r1, 0x8)
+
+#define MMI_Load4Col(f0, f2, f4, f6, f8, r0, offset) \
+ "lh $8, "#offset"("#r0") \n\t" \
+ "dmtc1 $8, "#f0" \n\t" \
+ "lh $8, "#offset"+0x20("#r0") \n\t" \
+ "dmtc1 $8, "#f4" \n\t" \
+ "punpcklwd "#f0", "#f0", "#f4" \n\t" \
+ "lh $8, "#offset"+0x80("#r0") \n\t" \
+ "dmtc1 $8, "#f6" \n\t" \
+ "lh $8, "#offset"+0xa0("#r0") \n\t" \
+ "dmtc1 $8, "#f8" \n\t" \
+ "punpcklwd "#f2", "#f6", "#f8" \n\t"
+
+#define MMI_SumSubD(f0, f2, f4, f6, f8, f10) \
+ "mov.d "#f8", "#f4" \n\t" \
+ "mov.d "#f10", "#f6" \n\t" \
+ "paddw "#f4", "#f4", "#f0" \n\t" \
+ "paddw "#f6", "#f6", "#f2" \n\t" \
+ "psubw "#f0", "#f0", "#f8" \n\t" \
+ "psubw "#f2", "#f2", "#f10" \n\t"
+
+#define WELS_DD1(f0, f2, f_val_31) \
+ "pcmpeqh "#f0", "#f0", "#f0" \n\t" \
+ "pcmpeqh "#f2", "#f2", "#f2" \n\t" \
+ "psrlw "#f0", "#f0", "#f_val_31" \n\t" \
+ "psrlw "#f2", "#f2", "#f_val_31" \n\t"
+
+#define MMI_SumSubDiv2D(f0, f2, f4, f6, f8, f10, f12, f14, f_val_1) \
+ "paddw "#f0", "#f0", "#f4" \n\t" \
+ "paddw "#f2", "#f2", "#f6" \n\t" \
+ "paddw "#f0", "#f0", "#f8" \n\t" \
+ "paddw "#f2", "#f2", "#f10" \n\t" \
+ "psraw "#f0", "#f0", "#f_val_1" \n\t" \
+ "psraw "#f2", "#f2", "#f_val_1" \n\t" \
+ "mov.d "#f12", "#f0" \n\t" \
+ "mov.d "#f14", "#f2" \n\t" \
+ "psubw "#f12", "#f12", "#f4" \n\t" \
+ "psubw "#f14", "#f14", "#f6" \n\t"
+
+#define MMI_Trans4x4W(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
+ MMI_XSawp_WD(f0, f2, f4, f6, f16, f18) \
+ MMI_XSawp_WD(f8, f10, f12, f14, f4, f6) \
+ MMI_XSawp_DQ(f0, f2, f8, f10, f12, f14) \
+ MMI_XSawp_DQ(f16, f18, f4, f6, f8, f10)
+
+#define MMI_SumSubMul2(f0, f2, f4, f6, f8, f10) \
+ "mov.d "#f8", "#f0" \n\t" \
+ "mov.d "#f10", "#f2" \n\t" \
+ "paddh "#f0", "#f0", "#f0" \n\t" \
+ "paddh "#f2", "#f2", "#f2" \n\t" \
+ "paddh "#f0", "#f0", "#f4" \n\t" \
+ "paddh "#f2", "#f2", "#f6" \n\t" \
+ "psubh "#f8", "#f8", "#f4" \n\t" \
+ "psubh "#f10", "#f10", "#f6" \n\t" \
+ "psubh "#f8", "#f8", "#f4" \n\t" \
+ "psubh "#f10", "#f10", "#f6" \n\t"
+
+#define MMI_DCT(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22) \
+ MMI_SumSub(f20, f22, f8, f10, f16, f18) \
+ MMI_SumSub(f0, f2, f4, f6, f16, f18) \
+ MMI_SumSub(f8, f10, f4, f6, f16, f18) \
+ MMI_SumSubMul2(f20, f22, f0, f2, f12, f14)
+
+#define MMI_Store4x8p(r0, f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
+ MMI_XSawp_DQ(f0, f2, f4, f6, f16, f18) \
+ MMI_XSawp_DQ(f8, f10, f12, f14, f4, f6) \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ "gssqc1 "#f10", "#f8", 0x10("#r0") \n\t" \
+ "gssqc1 "#f18", "#f16", 0x20("#r0") \n\t" \
+ "gssqc1 "#f6", "#f4", 0x30("#r0") \n\t"
+
+#define MMI_LoadDiff4P_SINGLE(f0, f2, r0, r1, f4) \
+ "gsldlc1 "#f0", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f2", 0x7("#r1") \n\t" \
+ "gsldrc1 "#f0", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f2", 0x0("#r1") \n\t" \
+ "punpcklbh "#f0", "#f0", "#f4" \n\t" \
+ "punpcklbh "#f2", "#f2", "#f4" \n\t" \
+ "psubh "#f0", "#f0", "#f2" \n\t"
+
+#define MMI_LoadDiff4x4P_SINGLE(f0, f2, f4, f6, r0, r1, r2, r3, f8, f10) \
+ MMI_LoadDiff4P_SINGLE(f0, f8, r0, r2, f10) \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ PTR_ADDU ""#r2", "#r2", "#r3" \n\t" \
+ MMI_LoadDiff4P_SINGLE(f2, f8, r0, r2, f10) \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ PTR_ADDU ""#r2", "#r2", "#r3" \n\t" \
+ MMI_LoadDiff4P_SINGLE(f4, f8, r0, r2, f10) \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ PTR_ADDU ""#r2", "#r2", "#r3" \n\t" \
+ MMI_LoadDiff4P_SINGLE(f6, f8, r0, r2, f10)
+
+#define MMI_DCT_SINGLE(f0, f2, f4, f6, f8, f10, f12) \
+ MMI_SumSub_SINGLE(f6, f0, f10) \
+ MMI_SumSub_SINGLE(f4, f2, f10) \
+ MMI_SumSub_SINGLE(f4, f6, f10) \
+ MMI_SumSubMul2_SINGLE(f0, f2, f8, f12)
+
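+/* Reconstruct one 4x4 block: inverse-transform pDct, round the result
+ * ((x + 32) >> 6) and add it to pPred, storing to pRec. */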
+void WelsIDctT4Rec_mmi(uint8_t* pRec, int32_t iStride, uint8_t* pPred,
+ int32_t iPredStride, int16_t* pDct) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "gsldlc1 $f0, 0x7(%[pDct]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pDct]) \n\t"
+ "gsldlc1 $f2, 0xF(%[pDct]) \n\t"
+ "gsldrc1 $f2, 0x8(%[pDct]) \n\t"
+ "gsldlc1 $f4, 0x17(%[pDct]) \n\t"
+ "gsldrc1 $f4, 0x10(%[pDct]) \n\t"
+ "gsldlc1 $f6, 0x1F(%[pDct]) \n\t"
+ "gsldrc1 $f6, 0x18(%[pDct]) \n\t"
+
+ "dli $8, 0x1 \n\t"
+ "dmtc1 $8, $f16 \n\t"
+ "dli $8, 0x6 \n\t"
+ "dmtc1 $8, $f18 \n\t"
+
+ MMI_Trans4x4H_SINGLE($f0, $f2, $f4, $f6, $f8)
+ MMI_IDCT_SINGLE($f2, $f4, $f6, $f8, $f0, $f12, $f16)
+ MMI_Trans4x4H_SINGLE($f2, $f6, $f0, $f8, $f4)
+ MMI_IDCT_SINGLE($f6, $f0, $f8, $f4, $f2, $f12, $f16)
+
+ "xor $f14, $f14, $f14 \n\t"
+ "dli $8, 0x0020 \n\t"
+ "dmtc1 $8, $f12 \n\t"
+ "punpcklhw $f12, $f12, $f12 \n\t"
+ "punpcklwd $f12, $f12, $f12 \n\t"
+
+ MMI_StoreDiff4P_SINGLE($f6, $f0, $f12, $f14, %[pRec], %[pPred], $f18)
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4P_SINGLE($f8, $f0, $f12, $f14, %[pRec], %[pPred], $f18)
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4P_SINGLE($f2, $f0, $f12, $f14, %[pRec], %[pPred], $f18)
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4P_SINGLE($f4, $f0, $f12, $f14, %[pRec], %[pPred], $f18)
+ : [pRec]"+&r"((uint8_t *)pRec), [pPred]"+&r"((uint8_t *)pPred)
+ : [iStride]"r"((int)iStride), [iPredStride]"r"((int)iPredStride),
+ [pDct]"r"((short *)pDct)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18"
+ );
+}
+
+void WelsIDctFourT4Rec_mmi(uint8_t* pRec, int32_t iStride, uint8_t* pPred,
+ int32_t iPredStride, int16_t* pDct) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ MMI_Load4x8p(%[pDct], $f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f20, $f22)
+
+ MMI_TransTwo4x4H($f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f12, $f14)
+ "dli $8, 0x1 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ MMI_IDCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26,
+ $f0, $f2, $f30)
+ MMI_TransTwo4x4H($f4, $f6, $f16, $f18, $f0, $f2, $f8, $f10, $f12, $f14)
+ MMI_IDCT($f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22, $f24, $f26,
+ $f4, $f6, $f30)
+
+ "xor $f28, $f28, $f28 \n\t"
+ "dli $8, 0x6 \n\t"
+ "dmtc1 $8, $f26 \n\t"
+ "dli $8, 0x0020 \n\t"
+ "dmtc1 $8, $f24 \n\t"
+ "punpcklhw $f24, $f24, $f24 \n\t"
+ "punpcklwd $f24, $f24, $f24 \n\t"
+
+ MMI_StoreDiff8p_6($f16, $f18, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff8p_6($f0, $f2, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff8p_6($f4, $f6, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff8p_6($f8, $f10, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
+
+ PTR_ADDIU "%[pDct], %[pDct], 0x40 \n\t"
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_Load4x8p(%[pDct], $f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f20, $f22)
+
+ MMI_TransTwo4x4H($f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10, $f12, $f14)
+ MMI_IDCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26,
+ $f0, $f2, $f30)
+ MMI_TransTwo4x4H($f4, $f6, $f16, $f18, $f0, $f2, $f8, $f10, $f12, $f14)
+ MMI_IDCT($f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22, $f24, $f26,
+ $f4, $f6, $f30)
+
+ "dli $8, 0x6 \n\t"
+ "dmtc1 $8, $f26 \n\t"
+ "dli $8, 0x0020 \n\t"
+ "dmtc1 $8, $f24 \n\t"
+ "punpcklhw $f24, $f24, $f24 \n\t"
+ "punpcklwd $f24, $f24, $f24 \n\t"
+
+ MMI_StoreDiff8p_6($f16, $f18, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff8p_6($f0, $f2, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff8p_6($f4, $f6, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff8p_6($f8, $f10, $f20, $f22, $f24, $f28, %[pRec], %[pPred], $f26)
+ : [pRec]"+&r"((uint8_t *)pRec), [pPred]"+&r"((uint8_t *)pPred),
+ [pDct]"+&r"((short *)pDct)
+ : [iStride]"r"((int)iStride), [iPredStride]"r"((int)iPredStride)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
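+/* DC-only 16x16 reconstruction: each 4x4 block adds its rounded DC term
+ * ((dc + 32) >> 6) to the prediction. */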
+void WelsIDctRecI16x16Dc_mmi(uint8_t* pRec, int32_t iStride, uint8_t* pPred,
+ int32_t iPredStride, int16_t* pDct) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "dli $8, 0x0020 \n\t"
+ "dmtc1 $8, $f24 \n\t"
+ "punpcklhw $f24, $f24, $f24 \n\t"
+ "punpcklwd $f24, $f24, $f24 \n\t"
+ "dli $8, 0x6 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+
+ MMI_Load8DC($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f24,
+ %[pDct], 0x0, $f30)
+
+ MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],
+ %[pPred], %[iStride], %[iPredStride])
+
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],
+ %[pPred], %[iStride], %[iPredStride])
+
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],
+ %[pPred], %[iStride], %[iPredStride])
+
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],
+ %[pPred], %[iStride], %[iPredStride])
+
+ MMI_Load8DC($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f24, %[pDct], 0x10, $f30)
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],
+ %[pPred], %[iStride], %[iPredStride])
+
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4x8p($f0, $f2, $f4, $f6, $f20, $f22, $f28, %[pRec],
+ %[pPred], %[iStride], %[iPredStride])
+
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],
+ %[pPred], %[iStride], %[iPredStride])
+
+ PTR_ADDU "%[pRec], %[pRec], %[iStride] \n\t"
+ PTR_ADDU "%[pPred], %[pPred], %[iPredStride] \n\t"
+ MMI_StoreDiff4x8p($f8, $f10, $f12, $f14, $f20, $f22, $f28, %[pRec],
+ %[pPred], %[iStride], %[iPredStride])
+ : [pRec]"+&r"((uint8_t *)pRec), [pPred]"+&r"((uint8_t *)pPred),
+ [pDct]"+&r"((short *)pDct)
+ : [iStride]"r"((int)iStride), [iPredStride]"r"((int)iPredStride)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
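+/* 4x4 Hadamard transform of the 16 luma DC coefficients gathered from
+ * pDct (one per 4x4 block), with a rounded divide-by-two on the final
+ * butterfly; results are packed back to int16 in luma_dc. */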
+void WelsHadamardT4Dc_mmi(int16_t *luma_dc, int16_t *pDct) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ MMI_Load4Col($f4, $f6, $f20, $f24, $f0, %[pDct], 0x0)
+ MMI_Load4Col($f8, $f10, $f20, $f24, $f0, %[pDct], 0x40)
+ MMI_Load4Col($f12, $f14, $f20, $f24, $f0, %[pDct], 0x100)
+ MMI_Load4Col($f16, $f18, $f20, $f24, $f0, %[pDct], 0x140)
+
+ MMI_SumSubD($f4, $f6, $f8, $f10, $f28, $f30)
+ MMI_SumSubD($f12, $f14, $f16, $f18, $f28, $f30)
+ MMI_SumSubD($f8, $f10, $f16, $f18, $f28, $f30)
+ MMI_SumSubD($f4, $f6, $f12, $f14, $f28, $f30)
+
+ MMI_Trans4x4W($f16, $f18, $f8, $f10, $f4, $f6, $f12, $f14, $f20, $f22)
+
+ MMI_SumSubD($f16, $f18, $f12, $f14, $f28, $f30)
+ MMI_SumSubD($f20, $f22, $f4, $f6, $f28, $f30)
+
+ "dli $8, 0x1F \n\t"
+ "dmtc1 $8, $f30 \n\t"
+
+ WELS_DD1($f24, $f26, $f30)
+
+ "dli $8, 0x1 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+
+ MMI_SumSubDiv2D($f12, $f14, $f4, $f6, $f24, $f26, $f0, $f2, $f30)
+ MMI_SumSubDiv2D($f16, $f18, $f20, $f22, $f24, $f26, $f4, $f6, $f30)
+ MMI_Trans4x4W($f12, $f14, $f0, $f2, $f4, $f6, $f16, $f18, $f8, $f10)
+
+ "packsswh $f12, $f12, $f14 \n\t"
+ "packsswh $f14, $f16, $f18 \n\t"
+
+ "packsswh $f8, $f8, $f10 \n\t"
+ "packsswh $f10, $f4, $f6 \n\t"
+ "gssqc1 $f14, $f12, 0x0(%[luma_dc]) \n\t"
+ "gssqc1 $f10, $f8, 0x10(%[luma_dc]) \n\t"
+ :
+ : [luma_dc]"r"((short *)luma_dc), [pDct]"r"((short *)pDct)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
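+/* Forward 4x4 integer transform of the residual pix1 - pix2: two DCT
+ * passes with a transpose in between, rows stored to pDct. */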
+void WelsDctT4_mmi(int16_t *pDct, uint8_t *pix1, int32_t i_pix1,
+ uint8_t *pix2, int32_t i_pix2) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f14, $f14, $f14 \n\t"
+ "dli $8, 0x1 \n\t"
+ "dmtc1 $8, $f16 \n\t"
+
+ MMI_LoadDiff4x4P_SINGLE($f2, $f4, $f6, $f8, %[pix1], %[i_pix1],
+ %[pix2], %[i_pix2], $f0, $f14)
+
+ MMI_DCT_SINGLE($f2, $f4, $f6, $f8, $f10, $f12, $f16)
+ MMI_Trans4x4H_SINGLE($f6, $f2, $f8, $f10, $f4)
+
+ MMI_DCT_SINGLE($f6, $f10, $f4, $f8, $f2, $f12, $f16)
+ MMI_Trans4x4H_SINGLE($f4, $f6, $f8, $f2, $f10)
+
+ "gssdlc1 $f4, 0x7(%[pDct]) \n\t"
+ "gssdlc1 $f2, 0xF(%[pDct]) \n\t"
+ "gssdlc1 $f10, 0x17(%[pDct]) \n\t"
+ "gssdlc1 $f8, 0x1F(%[pDct]) \n\t"
+ "gssdrc1 $f4, 0x0(%[pDct]) \n\t"
+ "gssdrc1 $f2, 0x8(%[pDct]) \n\t"
+ "gssdrc1 $f10, 0x10(%[pDct]) \n\t"
+ "gssdrc1 $f8, 0x18(%[pDct]) \n\t"
+ : [pDct]"+&r"((short *)pDct), [pix1]"+&r"(pix1), [pix2]"+&r"(pix2)
+ : [i_pix1]"r"(i_pix1), [i_pix2]"r"(i_pix2)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16"
+ );
+}
+
+void WelsDctFourT4_mmi(int16_t *pDct, uint8_t *pix1, int32_t i_pix1,
+ uint8_t *pix2, int32_t i_pix2) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ MMI_LoadDiff8P($f0, $f2, $f24, $f26, $f28, %[pix1], %[pix2])
+ PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t"
+ PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
+ MMI_LoadDiff8P($f4, $f6, $f24, $f26, $f28, %[pix1], %[pix2])
+ PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t"
+ PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
+ MMI_LoadDiff8P($f8, $f10, $f24, $f26, $f28, %[pix1], %[pix2])
+ PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t"
+ PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
+ MMI_LoadDiff8P($f12, $f14, $f24, $f26, $f28, %[pix1], %[pix2])
+
+ MMI_DCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f0, $f2)
+ MMI_TransTwo4x4H($f8, $f10, $f0, $f2, $f12, $f14, $f16, $f18, $f4, $f6)
+ MMI_DCT($f0, $f2, $f16, $f18, $f4, $f6, $f12, $f14, $f20, $f22, $f8, $f10)
+ MMI_TransTwo4x4H($f16, $f18, $f8, $f10, $f4, $f6, $f12, $f14, $f0, $f2)
+
+ MMI_Store4x8p(%[pDct], $f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22)
+ PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t"
+ PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
+ MMI_LoadDiff8P($f0, $f2, $f24, $f26, $f28, %[pix1], %[pix2])
+ PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t"
+ PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
+ MMI_LoadDiff8P($f4, $f6, $f24, $f26, $f28, %[pix1], %[pix2])
+ PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t"
+ PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
+ MMI_LoadDiff8P($f8, $f10, $f24, $f26, $f28, %[pix1], %[pix2])
+ PTR_ADDU "%[pix1], %[pix1], %[i_pix1] \n\t"
+ PTR_ADDU "%[pix2], %[pix2], %[i_pix2] \n\t"
+ MMI_LoadDiff8P($f12, $f14, $f24, $f26, $f28, %[pix1], %[pix2])
+
+ MMI_DCT($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f0, $f2)
+ MMI_TransTwo4x4H($f8, $f10, $f0, $f2, $f12, $f14, $f16, $f18, $f4, $f6)
+ MMI_DCT($f0, $f2, $f16, $f18, $f4, $f6, $f12, $f14, $f20, $f22, $f8, $f10)
+ MMI_TransTwo4x4H($f16, $f18, $f8, $f10, $f4, $f6, $f12, $f14, $f0, $f2)
+
+ PTR_ADDIU "%[pDct], %[pDct], 0x40 \n\t"
+ MMI_Store4x8p(%[pDct], $f16, $f18, $f8, $f10, $f12, $f14, $f0, $f2, $f20, $f22)
+ : [pDct]"+&r"((short *)pDct), [pix1]"+&r"(pix1), [pix2]"+&r"(pix2)
+ : [i_pix1]"r"(i_pix1), [i_pix2]"r"(i_pix2)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28"
+ );
+ RECOVER_REG;
+}
--- /dev/null
+++ b/codec/encoder/core/mips/quant_mmi.c
@@ -1,0 +1,553 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2018, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file quant_mmi.c
+ *
+ * \brief Loongson MMI optimization of the coefficient quantization routines
+ *
+ * \date 20/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+void WelsQuant4x4_mmi(int16_t *pDct, const int16_t *ff, const int16_t *mf) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "gslqc1 $f10, $f8, 0x0(%[ff]) \n\t"
+ "gslqc1 $f14, $f12, 0x0(%[mf]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
+ :
+ : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf)
+ : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
+ );
+}
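+
+/* Scalar sketch of the per-lane quantization above (the same eight ff/mf
+ * values are reused for both halves of the block; the unsigned-saturating
+ * paddush is ignored for clarity):
+ *
+ *   for (int i = 0; i < 16; i++) {
+ *     int32_t sign = (pDct[i] < 0) ? -1 : 0;              // pcmpgth mask
+ *     uint32_t a   = (uint32_t)((pDct[i] ^ sign) - sign); // xor + psubh: |x|
+ *     uint16_t q   = (uint16_t)(((a + ff[i & 7]) * mf[i & 7]) >> 16);
+ *     pDct[i]      = (int16_t)((q ^ sign) - sign);        // restore the sign
+ *   }
+ */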
+
+void WelsQuant4x4Dc_mmi(int16_t *pDct, const int16_t ff, int16_t mf) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f10, $f10, $f10 \n\t"
+ "dmtc1 %[mf], $f12 \n\t"
+ "pshufh $f12, $f12, $f10 \n\t"
+
+ "dmtc1 %[ff], $f8 \n\t"
+ "pshufh $f8, $f8, $f10 \n\t"
+
+ "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f8 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f12 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f8 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f12 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
+ :
+ : [pDct]"r"((short *)pDct), [ff]"r"((short)ff), [mf]"r"((short)mf)
+ : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12"
+ );
+}
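+
+/* Note: this Dc variant takes scalar ff/mf and broadcasts each into all
+ * eight 16-bit lanes via dmtc1 + pshufh with a zero selector (the MMI
+ * analogue of SSE2 _mm_set1_epi16); the quantization itself is lane-wise
+ * identical to WelsQuant4x4_mmi above.
+ */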
+
+void WelsQuantFour4x4_mmi(int16_t *pDct, const int16_t *ff, const int16_t *mf) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "gslqc1 $f10, $f8, 0x0(%[ff]) \n\t"
+ "gslqc1 $f14, $f12, 0x0(%[mf]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x20(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x20(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x30(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x30(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x40(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x40(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x50(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x50(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x60(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x60(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x70(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x70(%[pDct]) \n\t"
+ :
+ : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf)
+ : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
+ );
+}
+
+void WelsQuantFour4x4Max_mmi(int16_t *pDct, const int16_t *ff,
+ const int16_t *mf, int16_t *max) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "gslqc1 $f10, $f8, 0x0(%[ff]) \n\t"
+ "gslqc1 $f14, $f12, 0x0(%[mf]) \n\t"
+
+ "xor $f16, $f16, $f16 \n\t"
+ "xor $f18, $f18, $f18 \n\t"
+ "xor $f20, $f20, $f20 \n\t"
+ "xor $f22, $f22, $f22 \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "xor $f30, $f30, $f30 \n\t"
+
+ "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "pmaxsh $f16, $f16, $f0 \n\t"
+ "pmaxsh $f18, $f18, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "pmaxsh $f16, $f16, $f0 \n\t"
+ "pmaxsh $f18, $f18, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x20(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "pmaxsh $f20, $f20, $f0 \n\t"
+ "pmaxsh $f22, $f22, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x20(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x30(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "pmaxsh $f20, $f20, $f0 \n\t"
+ "pmaxsh $f22, $f22, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x30(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x40(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "pmaxsh $f24, $f24, $f0 \n\t"
+ "pmaxsh $f26, $f26, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x40(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x50(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "pmaxsh $f24, $f24, $f0 \n\t"
+ "pmaxsh $f26, $f26, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x50(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x60(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "pmaxsh $f28, $f28, $f0 \n\t"
+ "pmaxsh $f30, $f30, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x60(%[pDct]) \n\t"
+
+ "gslqc1 $f2, $f0, 0x70(%[pDct]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "pcmpgth $f4, $f4, $f0 \n\t"
+ "pcmpgth $f6, $f6, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "paddush $f0, $f0, $f8 \n\t"
+ "paddush $f2, $f2, $f10 \n\t"
+ "pmulhuh $f0, $f0, $f12 \n\t"
+ "pmulhuh $f2, $f2, $f14 \n\t"
+ "pmaxsh $f28, $f28, $f0 \n\t"
+ "pmaxsh $f30, $f30, $f2 \n\t"
+ "xor $f0, $f0, $f4 \n\t"
+ "xor $f2, $f2, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "psubh $f2, $f2, $f6 \n\t"
+ "gssqc1 $f2, $f0, 0x70(%[pDct]) \n\t"
+
+ "mov.d $f0, $f18 \n\t"
+ "punpckhhw $f18, $f16, $f20 \n\t"
+ "punpcklhw $f16, $f16, $f20 \n\t"
+ "punpckhhw $f2, $f0, $f22 \n\t"
+ "punpcklhw $f0, $f0, $f22 \n\t"
+
+ "mov.d $f20, $f26 \n\t"
+ "punpckhhw $f26, $f24, $f28 \n\t"
+ "punpcklhw $f24, $f24, $f28 \n\t"
+ "punpckhhw $f22, $f20, $f30 \n\t"
+ "punpcklhw $f20, $f20, $f30 \n\t"
+
+ "mov.d $f28, $f18 \n\t"
+ "punpckhwd $f18, $f16, $f24 \n\t"
+ "punpcklwd $f16, $f16, $f24 \n\t"
+ "punpckhwd $f30, $f28, $f26 \n\t"
+ "punpcklwd $f28, $f28, $f26 \n\t"
+
+ "mov.d $f24, $f2 \n\t"
+ "punpckhwd $f2, $f0, $f20 \n\t"
+ "punpcklwd $f0, $f0, $f20 \n\t"
+ "punpckhwd $f26, $f24, $f22 \n\t"
+ "punpcklwd $f24, $f24, $f22 \n\t"
+
+ "mov.d $f20, $f18 \n\t"
+ "mov.d $f18, $f0 \n\t"
+ "mov.d $f22, $f2 \n\t"
+
+ "mov.d $f0, $f30 \n\t"
+ "mov.d $f30, $f24 \n\t"
+ "mov.d $f2, $f26 \n\t"
+
+ "pmaxsh $f0, $f0, $f16 \n\t"
+ "pmaxsh $f2, $f2, $f18 \n\t"
+
+ "pmaxsh $f0, $f0, $f20 \n\t"
+ "pmaxsh $f2, $f2, $f22 \n\t"
+
+ "pmaxsh $f0, $f0, $f28 \n\t"
+ "pmaxsh $f2, $f2, $f30 \n\t"
+
+ "mov.d $f4, $f0 \n\t"
+ "mov.d $f6, $f2 \n\t"
+
+ "mov.d $f0, $f2 \n\t"
+ "mov.d $f2, $f6 \n\t"
+
+ "pmaxsh $f0, $f0, $f4 \n\t"
+ "pmaxsh $f2, $f2, $f6 \n\t"
+
+ "gssdlc1 $f0, 0x7(%[max]) \n\t"
+ "gssdrc1 $f0, 0x0(%[max]) \n\t"
+ :
+ : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf),
+ [max]"r"((short *)max)
+ : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14",
+ "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
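+
+/* A hedged reading of the pmaxsh bookkeeping above: while quantizing, the
+ * unsigned magnitudes (taken before the sign is restored) are folded with
+ * pmaxsh into one accumulator pair per 4x4 block; the punpck/transpose
+ * sequence then reduces the pairs so that max[b] holds the largest quantized
+ * magnitude of block b (b = 0..3), stored as four int16_t with the unaligned
+ * gssdlc1/gssdrc1 pair.
+ */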
--- /dev/null
+++ b/codec/encoder/core/mips/score_mmi.c
@@ -1,0 +1,324 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2018, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file score_mmi.c
+ *
+ * \brief Loongson MMI optimization of the coefficient scan and scoring routines
+ *
+ * \date 21/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
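+/* 256-entry popcount table: nozero_count_table[m] is the number of set bits
+ * in byte m; indexed below with the inverted pmovmskb mask, the lookup gives
+ * the count of non-zero 16-bit coefficients per 8-lane group. */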
+unsigned char nozero_count_table[] __attribute__((aligned(16))) = {
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4,
+ 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4,
+ 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
+ 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,
+ 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
+ 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
+
+int32_t WelsGetNoneZeroCount_mmi(int16_t *level) {
+ int ret_val = 0;
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[level]) \n\t"
+ "gslqc1 $f6, $f4, 0x10(%[level]) \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+ "pcmpeqh $f0, $f0, $f8 \n\t"
+ "pcmpeqh $f2, $f2, $f8 \n\t"
+ "pcmpeqh $f4, $f4, $f8 \n\t"
+ "pcmpeqh $f6, $f6, $f8 \n\t"
+ "packsshb $f4, $f4, $f6 \n\t"
+ "packsshb $f6, $f0, $f2 \n\t"
+ "pmovmskb $f0, $f4 \n\t"
+ "pmovmskb $f2, $f6 \n\t"
+ "dmfc1 $8, $f0 \n\t"
+ "dmfc1 $9, $f2 \n\t"
+ "xor $8, 0xFF \n\t"
+ "xor $9, 0xFF \n\t"
+ PTR_ADDU "$10, $8, %[nozero_count_table] \n\t"
+ "lbu $8, 0x0($10) \n\t"
+ PTR_ADDU "$10, $9, %[nozero_count_table] \n\t"
+ "lbu $9, 0x0($10) \n\t"
+ PTR_ADDU "%[ret_val], $8, $9 \n\t"
+ : [ret_val] "=r"((int)ret_val)
+ : [level] "r"((unsigned char *)level),
+ [nozero_count_table] "r"((unsigned char *)nozero_count_table)
+ : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8"
+ );
+ return ret_val;
+}
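+
+/* Scalar equivalent of the routine above, for reference (not built):
+ *
+ *   int32_t n = 0;
+ *   for (int i = 0; i < 16; i++)
+ *     n += (level[i] != 0);
+ *   return n;
+ */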
+
+void WelsScan4x4DcAc_mmi(int16_t level[16], int16_t *pDct) {
+ BACKUP_REG;
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
+ "gslqc1 $f6, $f4, 0x10(%[pDct]) \n\t"
+ "dli $8, 0x3 \n\t"
+ "dmtc1 $8, $f22 \n\t"
+ "dli $8, 0x2 \n\t"
+ "dmtc1 $8, $f24 \n\t"
+ "dli $8, 0x1 \n\t"
+ "dmtc1 $8, $f26 \n\t"
+ "dmtc1 $0, $f28 \n\t"
+ "pextrh $f18, $f2, $f22 \n\t"
+ "pextrh $f20, $f4, $f24 \n\t"
+ "pextrh $f16, $f2, $f26 \n\t"
+ "pinsrh_2 $f4, $f4, $f18 \n\t"
+ "pinsrh_3 $f2, $f2, $f16 \n\t"
+ "pextrh $f18, $f4, $f28 \n\t"
+ "pinsrh_1 $f2, $f2, $f18 \n\t"
+ "pinsrh_0 $f4, $f4, $f20 \n\t"
+ "dli $8, 0x93 \n\t"
+ "dmtc1 $8, $f22 \n\t"
+ "dli $8, 0x39 \n\t"
+ "dmtc1 $8, $f24 \n\t"
+ "punpckhwd $f10, $f0, $f2 \n\t"
+ "punpcklwd $f8, $f0, $f2 \n\t"
+ "punpckhwd $f14, $f4, $f6 \n\t"
+ "punpcklwd $f12, $f4, $f6 \n\t"
+ "mov.d $f0, $f8 \n\t"
+ "pshufh $f2, $f10, $f22 \n\t"
+ "pshufh $f4, $f12, $f24 \n\t"
+ "mov.d $f6, $f14 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[level]) \n\t"
+ "gssqc1 $f6, $f4, 0x10(%[level]) \n\t"
+ :
+ : [level] "r"((short *)level), [pDct] "r"((short *)pDct)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28"
+ );
+ RECOVER_REG;
+}
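+
+/* The pextrh/pinsrh/pshufh shuffle above is assumed to realize the H.264
+ * 4x4 zig-zag scan, reordering the raster-order pDct into
+ *
+ *   level[] = pDct[{0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15}]
+ *
+ * with all 16 coefficients (DC and AC) kept; WelsScan4x4Ac_mmi below is the
+ * AC-only counterpart.
+ */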
+
+void WelsScan4x4Ac_mmi(int16_t *zig_value, int16_t *pDct) {
+ BACKUP_REG;
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
+ "gslqc1 $f6, $f4, 0x10(%[pDct]) \n\t"
+ "mov.d $f8, $f2 \n\t"
+ "mov.d $f2, $f4 \n\t"
+ "mov.d $f10, $f6 \n\t"
+
+ "mov.d $f12, $f2 \n\t"
+ "punpckhwd $f2, $f0, $f8 \n\t"
+ "punpcklwd $f0, $f0, $f8 \n\t"
+ "punpckhwd $f14, $f12, $f10 \n\t"
+ "punpcklwd $f12, $f12, $f10 \n\t"
+
+ "dmtc1 $0, $f20 \n\t"
+ "dli $8, 0x10 \n\t"
+ "dmtc1 $8, $f22 \n\t"
+ "dli $8, 0x30 \n\t"
+ "dmtc1 $8, $f24 \n\t"
+ "dli $8, 0x3 \n\t"
+ "dmtc1 $8, $f26 \n\t"
+ "dli $8, 0x93 \n\t"
+ "dmtc1 $8, $f28 \n\t"
+ "dli $8, 0x39 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ "pextrh $f16, $f0, $f26 \n\t"
+ "pextrh $f18, $f2, $f26 \n\t"
+ "pinsrh_3 $f2, $f2, $f16 \n\t"
+ "pextrh $f16, $f14, $f20 \n\t"
+ "pinsrh_0 $f14, $f14, $f18 \n\t"
+ "pextrh $f18, $f12, $f20 \n\t"
+ "pinsrh_0 $f12, $f12, $f16 \n\t"
+ "pinsrh_3 $f0, $f0, $f18 \n\t"
+
+ "mov.d $f4, $f0 \n\t"
+ "pshufh $f6, $f2, $f28 \n\t"
+ "pshufh $f8, $f12, $f30 \n\t"
+ "mov.d $f10, $f14 \n\t"
+
+ "mov.d $f12, $f8 \n\t"
+ "mov.d $f14, $f10 \n\t"
+ "dsrl $f4, $f4, $f22 \n\t"
+ "pinsrh_3 $f4, $f4, $f6 \n\t"
+ "dsrl $f6, $f6, $f22 \n\t"
+ "dsll $f14, $f12, $f24 \n\t"
+ "xor $f12, $f12, $f12 \n\t"
+ "or $f4, $f4, $f12 \n\t"
+ "or $f6, $f6, $f14 \n\t"
+ "dsrl $f8, $f8, $f22 \n\t"
+ "pinsrh_3 $f8, $f8, $f10 \n\t"
+ "dsrl $f10, $f10, $f22 \n\t"
+ "gssqc1 $f6, $f4, 0x0(%[zig_value]) \n\t"
+ "gssqc1 $f10, $f8, 0x10(%[zig_value]) \n\t"
+ :
+ : [zig_value] "r"((short *)zig_value), [pDct] "r"((short *)pDct)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+unsigned char i_ds_table[] __attribute__((aligned(16))) = {
+ 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned char high_mask_table[] __attribute__((aligned(16))) = {
+ 0, 0, 0, 3, 0, 2, 3, 6, 0, 2,
+ 2, 5, 3, 5, 6, 9, 0, 1, 2, 5,
+ 2, 4, 5, 8, 3, 5, 5, 8, 6, 8,
+ 9,12, 0, 1, 1, 4, 2, 4, 5, 8,
+ 2, 4, 4, 7, 5, 7, 8,11, 3, 4,
+ 5, 8, 5, 7, 8,11, 6, 8, 8,11,
+ 9,11,12,15, 0, 1, 1, 4, 1, 3,
+ 4, 7, 2, 4, 4, 7, 5, 7, 8,11,
+ 2, 3, 4, 7, 4, 6, 7,10, 5, 7,
+ 7,10, 8,10,11,14, 3, 4, 4, 7,
+ 5, 7, 8,11, 5, 7, 7,10, 8,10,
+ 11,14, 6, 7, 8,11, 8,10,11,14,
+ 9,11,11,14,12,14,15,18, 0, 0,
+ 1, 4, 1, 3, 4, 7, 1, 3, 3, 6,
+ 4, 6, 7,10, 2, 3, 4, 7, 4, 6,
+ 7,10, 5, 7, 7,10, 8,10,11,14,
+ 2, 3, 3, 6, 4, 6, 7,10, 4, 6,
+ 6, 9, 7, 9,10,13, 5, 6, 7,10,
+ 7, 9,10,13, 8,10,10,13,11,13,
+ 14,17, 3, 4, 4, 7, 4, 6, 7,10,
+ 5, 7, 7,10, 8,10,11,14, 5, 6,
+ 7,10, 7, 9,10,13, 8,10,10,13,
+ 11,13,14,17, 6, 7, 7,10, 8,10,
+ 11,14, 8,10,10,13,11,13,14,17,
+ 9,10,11,14,11,13,14,17,12,14,
+ 14,17,15,17,18,21};
+
+unsigned char low_mask_table[] __attribute__((aligned(16))) = {
+ 0, 3, 2, 6, 2, 5, 5, 9, 1, 5,
+ 4, 8, 5, 8, 8,12, 1, 4, 4, 8,
+ 4, 7, 7,11, 4, 8, 7,11, 8,11,
+ 11,15, 1, 4, 3, 7, 4, 7, 7,11,
+ 3, 7, 6,10, 7,10,10,14, 4, 7,
+ 7,11, 7,10,10,14, 7,11,10,14,
+ 11,14,14,18, 0, 4, 3, 7, 3, 6,
+ 6,10, 3, 7, 6,10, 7,10,10,14,
+ 3, 6, 6,10, 6, 9, 9,13, 6,10,
+ 9,13,10,13,13,17, 4, 7, 6,10,
+ 7,10,10,14, 6,10, 9,13,10,13,
+ 13,17, 7,10,10,14,10,13,13,17,
+ 10,14,13,17,14,17,17,21, 0, 3,
+ 3, 7, 3, 6, 6,10, 2, 6, 5, 9,
+ 6, 9, 9,13, 3, 6, 6,10, 6, 9,
+ 9,13, 6,10, 9,13,10,13,13,17,
+ 3, 6, 5, 9, 6, 9, 9,13, 5, 9,
+ 8,12, 9,12,12,16, 6, 9, 9,13,
+ 9,12,12,16, 9,13,12,16,13,16,
+ 16,20, 3, 7, 6,10, 6, 9, 9,13,
+ 6,10, 9,13,10,13,13,17, 6, 9,
+ 9,13, 9,12,12,16, 9,13,12,16,
+ 13,16,16,20, 7,10, 9,13,10,13,
+ 13,17, 9,13,12,16,13,16,16,20,
+ 10,13,13,17,13,16,16,20,13,17,
+ 16,20,17,20,20,24};
+
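+/* A hedged reading of WelsCalculateSingleCtr4x4_mmi below: pcmpeqb +
+ * pmovmskb build a 16-bit mask of non-zero coefficients (inverted from the
+ * equal-to-zero compare); the first branch loop locates the highest set bit
+ * in the low mask byte, the second the lowest set bit in the high mask byte,
+ * and the gap between them indexes i_ds_table, while each mask byte indexes
+ * low_mask_table/high_mask_table; the three table values sum to the
+ * single-ctr cost, presumably mirroring the x86 SSE2 version of this routine.
+ */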
+int32_t WelsCalculateSingleCtr4x4_mmi(int16_t *pDct) {
+ int32_t iSingleCtr = 0;
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
+ "gslqc1 $f6, $f4, 0x10(%[pDct]) \n\t"
+ "packsshb $f0, $f0, $f2 \n\t"
+ "packsshb $f2, $f4, $f6 \n\t"
+
+ "xor $f10, $f10, $f10 \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+
+ "pcmpeqb $f0, $f0, $f8 \n\t"
+ "pcmpeqb $f2, $f2, $f8 \n\t"
+
+ "pmovmskb $f10, $f0 \n\t"
+ "pmovmskb $f12, $f2 \n\t"
+ "punpcklbh $f10, $f10, $f12 \n\t"
+
+ "dmfc1 $12, $f10 \n\t"
+ "dli $8, 0xffff \n\t"
+ "xor $12, $12, $8 \n\t"
+
+ "xor %[pDct], %[pDct], %[pDct] \n\t"
+ "dli $8, 0x80 \n\t"
+ "dli $9, 0x7 \n\t"
+ "dli $10, 0x100 \n\t"
+ "dli $11, 0x8 \n\t"
+
+ "1: \n\t"
+ "and $13, $12, $8 \n\t"
+ "bnez $13, 2f \n\t"
+ "nop \n\t"
+ "daddiu $9, -0x1 \n\t"
+ "dsrl $8, 1 \n\t"
+ "bnez $9, 1b \n\t"
+ "nop \n\t"
+ "2: \n\t"
+ "and $13, $12, $10 \n\t"
+ "bnez $13, 3f \n\t"
+ "nop \n\t"
+ "daddiu $11, 0x1 \n\t"
+ "dsll $10, 1 \n\t"
+ "daddiu $13, $11, -0x10 \n\t"
+ "bltz $13, 2b \n\t"
+ "nop \n\t"
+ "3: \n\t"
+ "dsubu $11, $11, $9 \n\t"
+ "daddiu $11, -0x1 \n\t"
+ PTR_ADDU "$8, %[i_ds_table], $11 \n\t"
+ "lb $10, 0x0($8) \n\t"
+ PTR_ADDU "%[pDct], %[pDct], $10 \n\t"
+ "move $11, $12 \n\t"
+ "dli $10, 0xff \n\t"
+ "and $12, $10 \n\t"
+ "dsrl $11, 0x8 \n\t"
+ "and $11, $10 \n\t"
+ PTR_ADDU "$8, %[low_mask_table], $12 \n\t"
+ "lb $10, 0x0($8) \n\t"
+ PTR_ADDU "%[pDct], %[pDct], $10 \n\t"
+ PTR_ADDU "$8, %[high_mask_table], $11 \n\t"
+ "lb $10, 0x0($8) \n\t"
+ PTR_ADDU "%[iSingleCtr], %[pDct], $10 \n\t"
+    : [iSingleCtr] "=r"(iSingleCtr), [pDct] "+&r"((short *)pDct)
+    : [i_ds_table] "r"((unsigned char *)i_ds_table),
+ [high_mask_table] "r"((unsigned char *)high_mask_table),
+ [low_mask_table] "r"((unsigned char *)low_mask_table)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
+ "$f6", "$f8", "$f10", "$f12"
+ );
+ return iSingleCtr;
+}
--- a/codec/encoder/core/src/deblocking.cpp
+++ b/codec/encoder/core/src/deblocking.cpp
@@ -778,6 +778,11 @@
*pfSetNZCZero = WelsNonZeroCount_sse2;
}
#endif
+#if defined(HAVE_MMI)
+ if (iCpu & WELS_CPU_MMI) {
+ *pfSetNZCZero = WelsNonZeroCount_mmi;
+ }
+#endif
}
void DeblockingInit (DeblockingFunc* pFunc, int32_t iCpu) {
@@ -842,6 +847,19 @@
#endif
}
#endif
+
+#if defined(HAVE_MMI)
+ if (iCpu & WELS_CPU_MMI) {
+ pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_mmi;
+ pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_mmi;
+ pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_mmi;
+ pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_mmi;
+ pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_mmi;
+ pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_mmi;
+ pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_mmi;
+ pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_mmi;
+ }
+#endif//HAVE_MMI
}
--- a/codec/encoder/core/src/decode_mb_aux.cpp
+++ b/codec/encoder/core/src/decode_mb_aux.cpp
@@ -302,5 +302,13 @@
pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_AArch64_neon;
}
#endif
+
+#if defined(HAVE_MMI)
+ if (uiCpuFlag & WELS_CPU_MMI) {
+ pFuncList->pfIDctT4 = WelsIDctT4Rec_mmi;
+ pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_mmi;
+ pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_mmi;
+ }
+#endif//HAVE_MMI
}
}
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -586,5 +586,31 @@
pFuncList->pfDctFourT4 = WelsDctFourT4_AArch64_neon;
}
#endif
+
+#if defined(HAVE_MMI)
+ if (uiCpuFlag & WELS_CPU_MMI) {
+ pFuncList->pfCopy8x8Aligned = WelsCopy8x8_mmi;
+ pFuncList->pfCopy8x16Aligned = WelsCopy8x16_mmi;
+
+ pFuncList->pfGetNoneZeroCount = WelsGetNoneZeroCount_mmi;
+ pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_mmi;
+
+ pFuncList->pfQuantization4x4 = WelsQuant4x4_mmi;
+ pFuncList->pfQuantizationDc4x4 = WelsQuant4x4Dc_mmi;
+ pFuncList->pfQuantizationFour4x4 = WelsQuantFour4x4_mmi;
+ pFuncList->pfQuantizationFour4x4Max = WelsQuantFour4x4Max_mmi;
+
+ pFuncList->pfCopy16x16Aligned = WelsCopy16x16_mmi;
+ pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_mmi;
+ pFuncList->pfCopy16x8NotAligned = WelsCopy16x8NotAligned_mmi;
+
+ pFuncList->pfScan4x4 = WelsScan4x4DcAc_mmi;
+ pFuncList->pfScan4x4Ac = WelsScan4x4Ac_mmi;
+ pFuncList->pfCalculateSingleCtr4x4 = WelsCalculateSingleCtr4x4_mmi;
+
+ pFuncList->pfDctT4 = WelsDctT4_mmi;
+ pFuncList->pfDctFourT4 = WelsDctFourT4_mmi;
+ }
+#endif//HAVE_MMI
}
}
--- a/codec/encoder/core/src/get_intra_predictor.cpp
+++ b/codec/encoder/core/src/get_intra_predictor.cpp
@@ -720,5 +720,19 @@
pFuncList->pfGetChromaPred[C_PRED_P] = WelsIChromaPredPlane_sse2;
}
#endif
+
+#if defined(HAVE_MMI)
+ if (kuiCpuFlag & WELS_CPU_MMI) {
+ pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] = WelsI16x16LumaPredV_mmi;
+ pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] = WelsI16x16LumaPredH_mmi;
+ pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] = WelsI16x16LumaPredDc_mmi;
+ pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] = WelsI16x16LumaPredPlane_mmi;
+
+ pFuncList->pfGetChromaPred[C_PRED_H] = WelsIChromaPredH_mmi;
+ pFuncList->pfGetChromaPred[C_PRED_DC] = WelsIChromaPredDc_mmi;
+ pFuncList->pfGetChromaPred[C_PRED_V] = WelsIChromaPredV_mmi;
+ pFuncList->pfGetChromaPred[C_PRED_P] = WelsIChromaPredPlane_mmi;
+ }
+#endif//HAVE_MMI
}
}
--- a/codec/encoder/core/src/sample.cpp
+++ b/codec/encoder/core/src/sample.cpp
@@ -469,6 +469,27 @@
pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_AArch64_neon;
}
#endif
+
+#if defined (HAVE_MMI)
+ if (uiCpuFlag & WELS_CPU_MMI) {
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_mmi;
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8]  = WelsSampleSad16x8_mmi;
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16]  = WelsSampleSad8x16_mmi;
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8]   = WelsSampleSad8x8_mmi;
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4]   = WelsSampleSad4x4_mmi;
+
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4]   = WelsSampleSatd4x4_mmi;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8]   = WelsSampleSatd8x8_mmi;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16]  = WelsSampleSatd8x16_mmi;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8]  = WelsSampleSatd16x8_mmi;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_mmi;
+
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_mmi;
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_mmi;
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_mmi;
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_mmi;
+ }
+#endif//HAVE_MMI
}
} // namespace WelsEnc
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -82,10 +82,24 @@
endif
OBJS += $(ENCODER_OBJSARM64)
+ENCODER_ASM_MIPS_SRCS=\
+ $(ENCODER_SRCDIR)/core/mips/dct_mmi.c\
+ $(ENCODER_SRCDIR)/core/mips/quant_mmi.c\
+ $(ENCODER_SRCDIR)/core/mips/score_mmi.c\
+
+ENCODER_OBJSMIPS += $(ENCODER_ASM_MIPS_SRCS:.c=.$(OBJ))
+ifeq ($(ASM_ARCH), mips)
+ENCODER_OBJS += $(ENCODER_OBJSMIPS)
+endif
+OBJS += $(ENCODER_OBJSMIPS)
+
OBJS += $(ENCODER_OBJS)
$(ENCODER_SRCDIR)/%.$(OBJ): $(ENCODER_SRCDIR)/%.cpp
$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(ENCODER_CFLAGS) $(ENCODER_INCLUDES) -c $(CXX_O) $<
+
+$(ENCODER_SRCDIR)/%.$(OBJ): $(ENCODER_SRCDIR)/%.c
+ $(QUIET_CC)$(CC) $(CFLAGS) $(INCLUDES) $(ENCODER_CFLAGS) $(ENCODER_INCLUDES) -c $(CXX_O) $<
$(ENCODER_SRCDIR)/%.$(OBJ): $(ENCODER_SRCDIR)/%.asm
$(QUIET_ASM)$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $@ $<
--- /dev/null
+++ b/codec/processing/src/mips/vaa_mmi.c
@@ -1,0 +1,892 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2018, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file vaa_mmi.c
+ *
+ * \brief Loongson MMI optimization of the VAA (video analysis) routines
+ *
+ * \date 23/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+// Horizontal byte-max of f0 and f2. The f4/f6 arguments must hold the
+// constants 0x1 (pshufh selector) and 0x8 (shift count); hardware registers
+// $f4/$f6 are clobbered as scratch.
+#define WELS_MAX_REG_MMI(f0, f2, f4, f6) \
+ "punpckhwd $f4, "#f0", "#f0" \n\t" \
+ "punpckhwd $f6, "#f2", "#f2" \n\t" \
+ "pmaxub "#f0", "#f0", $f4 \n\t" \
+ "pmaxub "#f2", "#f2", $f6 \n\t" \
+ "pshufh $f4, "#f0", "#f4" \n\t" \
+ "pshufh $f6, "#f2", "#f4" \n\t" \
+ "pmaxub "#f0", "#f0", $f4 \n\t" \
+ "pmaxub "#f2", "#f2", $f6 \n\t" \
+ "dsrl $f4, "#f0", "#f6" \n\t" \
+ "dsrl $f6, "#f2", "#f6" \n\t" \
+ "pmaxub "#f0", "#f0", $f4 \n\t" \
+ "pmaxub "#f2", "#f2", $f6 \n\t"
+
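+/* A sketch of the reduction WELS_MAX_REG_MMI performs on each 64-bit half
+ * (f0 and f2): fold the upper 32 bits onto the lower (punpckhwd + pmaxub),
+ * fold halfword 1 onto halfword 0 (pshufh with order 0x1 + pmaxub), then
+ * fold the upper byte of the remaining halfword down (dsrl by 8 + pmaxub),
+ * leaving the maximum of the eight input bytes in the low byte; in scalar
+ * terms:
+ *
+ *   uint8_t m = b[0];
+ *   for (int i = 1; i < 8; i++) m = (b[i] > m) ? b[i] : m;
+ */
+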
+#define WELS_SAD_SD_MAD_16x1_MMI(f0, f2, f4, f6, f8, f10, f12, f14, r0, r1, r2) \
+ "gslqc1 $f6, $f4, 0x0("#r0") \n\t" \
+ "gslqc1 $f10, $f8, 0x0("#r1") \n\t" \
+ "pasubub $f12, $f4, $f0 \n\t" \
+ "pasubub $f14, $f6, $f2 \n\t" \
+ "biadd $f12, $f12 \n\t" \
+ "biadd $f14, $f14 \n\t" \
+ "paddw "#f4", "#f4", $f12 \n\t" \
+ "paddw "#f6", "#f6", $f14 \n\t" \
+ "pasubub $f12, $f8, $f0 \n\t" \
+ "pasubub $f14, $f10, $f2 \n\t" \
+ "biadd $f12, $f12 \n\t" \
+ "biadd $f14, $f14 \n\t" \
+ "paddw "#f8", "#f8", $f12 \n\t" \
+ "paddw "#f10", "#f10", $f14 \n\t" \
+ "pasubub $f12, $f4, $f8 \n\t" \
+ "pasubub $f14, $f6, $f10 \n\t" \
+ "pmaxub "#f12", "#f12", $f12 \n\t" \
+ "pmaxub "#f14", "#f14", $f14 \n\t" \
+ "pasubub $f12, $f12, $f0 \n\t" \
+ "pasubub $f14, $f14, $f2 \n\t" \
+ "biadd $f12, $f12 \n\t" \
+ "biadd $f14, $f14 \n\t" \
+ "paddw "#f0", "#f0", $f12 \n\t" \
+ "paddw "#f2", "#f2", $f14 \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r2" \n\t" \
+ PTR_ADDU ""#r1", "#r1", "#r2" \n\t"
+
+#define WELS_SAD_16x2_MMI(f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, r1, r2, r3) \
+ "gslqc1 "#f1", "#f2", 0x00("#r1") \n\t" \
+ "gslqc1 "#f3", "#f4", 0x00("#r2") \n\t" \
+ PTR_ADDU ""#r1", "#r1", "#r3" \n\t" \
+ "gslqc1 "#f5", "#f6", 0x00("#r1") \n\t" \
+ PTR_ADDU ""#r2", "#r2", "#r3" \n\t" \
+ "gslqc1 "#f7", "#f8", 0x00("#r2") \n\t" \
+ "pasubub "#f1", "#f1", "#f3" \n\t" \
+ "pasubub "#f2", "#f2", "#f4" \n\t" \
+ "biadd "#f1", "#f1" \n\t" \
+ "biadd "#f2", "#f2" \n\t" \
+ "pasubub "#f5", "#f5", "#f7" \n\t" \
+ "pasubub "#f6", "#f6", "#f8" \n\t" \
+ "biadd "#f5", "#f5" \n\t" \
+ "biadd "#f6", "#f6" \n\t" \
+ "paddw "#f9", "#f9", "#f1" \n\t" \
+ "paddw "#f9", "#f9", "#f5" \n\t" \
+ "paddw "#f10", "#f10", "#f2" \n\t" \
+ "paddw "#f10", "#f10", "#f6" \n\t" \
+ PTR_ADDU ""#r1", "#r1", "#r3" \n\t" \
+ PTR_ADDU ""#r2", "#r2", "#r3" \n\t"
+
+#define WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI(r0, r1, r2) \
+ "gslqc1 $f6, $f4, 0x0("#r0") \n\t" \
+ "gslqc1 $f10, $f8, 0x0("#r1") \n\t" \
+ "pasubub $f12, $f4, $f8 \n\t" \
+ "pasubub $f14, $f6, $f10 \n\t" \
+ "biadd $f12, $f12 \n\t" \
+ "biadd $f14, $f14 \n\t" \
+ "paddw $f28, $f28, $f12 \n\t" \
+ "paddw $f30, $f30, $f14 \n\t" \
+ "pasubub $f12, $f4, $f8 \n\t" \
+ "pasubub $f14, $f6, $f10 \n\t" \
+ "pasubub $f8, $f4, $f0 \n\t" \
+ "pasubub $f10, $f6, $f2 \n\t" \
+ "biadd $f8, $f8 \n\t" \
+ "biadd $f10, $f10 \n\t" \
+ "paddw $f24, $f24, $f8 \n\t" \
+ "paddw $f26, $f26, $f10 \n\t" \
+ "punpcklbh $f8, $f6, $f2 \n\t" \
+ "punpckhbh $f10, $f6, $f2 \n\t" \
+ "punpckhbh $f6, $f4, $f0 \n\t" \
+ "punpcklbh $f4, $f4, $f0 \n\t" \
+ "pmaddhw $f4, $f4, $f4 \n\t" \
+ "pmaddhw $f6, $f6, $f6 \n\t" \
+ "pmaddhw $f8, $f8, $f8 \n\t" \
+ "pmaddhw $f10, $f10, $f10 \n\t" \
+ "paddw $f20, $f20, $f4 \n\t" \
+ "paddw $f22, $f22, $f6 \n\t" \
+ "paddw $f20, $f20, $f8 \n\t" \
+ "paddw $f22, $f22, $f10 \n\t" \
+ "punpcklbh $f4, $f12, $f0 \n\t" \
+ "punpckhbh $f6, $f12, $f0 \n\t" \
+ "punpcklbh $f12, $f14, $f2 \n\t" \
+ "punpckhbh $f14, $f14, $f2 \n\t" \
+ "pmaddhw $f4, $f4, $f4 \n\t" \
+ "pmaddhw $f6, $f6, $f6 \n\t" \
+ "pmaddhw $f12, $f12, $f12 \n\t" \
+ "pmaddhw $f14, $f14, $f14 \n\t" \
+ "paddw $f16, $f16, $f4 \n\t" \
+ "paddw $f18, $f18, $f6 \n\t" \
+ "paddw $f16, $f16, $f12 \n\t" \
+ "paddw $f18, $f18, $f14 \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r2" \n\t" \
+ PTR_ADDU ""#r1", "#r1", "#r2" \n\t"
+
+#define WELS_SAD_BGD_SQDIFF_16x1_MMI(f0, f2, f4, f6, f8, f10, f12, f14, r0, r1, r2) \
+ "gslqc1 $f6, $f4, 0x0("#r0") \n\t" \
+ "punpcklbh $f8, $f4, $f0 \n\t" \
+ "punpckhbh $f10, $f4, $f0 \n\t" \
+ "punpcklbh $f12, $f6, $f2 \n\t" \
+ "punpckhbh $f14, $f6, $f2 \n\t" \
+ "pmaddhw $f8, $f8, $f8 \n\t" \
+ "pmaddhw $f10, $f10, $f10 \n\t" \
+ "pmaddhw $f12, $f12, $f12 \n\t" \
+ "pmaddhw $f14, $f14, $f14 \n\t" \
+ "paddw $f8, $f8, $f12 \n\t" \
+ "paddw $f10, $f10, $f14 \n\t" \
+ "punpckhwd $f12, $f0, $f8 \n\t" \
+ "punpckhwd $f14, $f0, $f10 \n\t" \
+ "punpcklwd $f8, $f0, $f8 \n\t" \
+ "punpcklwd $f10, $f0, $f10 \n\t" \
+ "paddw $f8, $f8, $f12 \n\t" \
+ "paddw $f10, $f10, $f14 \n\t" \
+ "paddw "#f0", "#f0", $f8 \n\t" \
+ "paddw "#f2", "#f2", $f10 \n\t" \
+ "gslqc1 $f10, $f8, 0x0("#r1") \n\t" \
+ "pasubub $f12, $f4, $f0 \n\t" \
+ "pasubub $f14, $f6, $f2 \n\t" \
+ "biadd $f12, $f12 \n\t" \
+ "biadd $f14, $f14 \n\t" \
+ "paddw "#f4", "#f4", $f12 \n\t" \
+ "paddw "#f6", "#f6", $f14 \n\t" \
+ "pasubub $f12, $f8, $f0 \n\t" \
+ "pasubub $f14, $f10, $f2 \n\t" \
+ "biadd $f12, $f12 \n\t" \
+ "biadd $f14, $f14 \n\t" \
+ "punpcklwd $f14, $f14, $f14 \n\t" \
+ "punpckhwd $f14, $f12, $f14 \n\t" \
+ "punpcklwd $f12, $f0, $f12 \n\t" \
+ "paddw "#f4", "#f4", $f12 \n\t" \
+ "paddw "#f6", "#f6", $f14 \n\t" \
+ "pasubub $f12, $f4, $f8 \n\t" \
+ "pasubub $f14, $f6, $f10 \n\t" \
+ "pmaxub "#f8", "#f8", $f12 \n\t" \
+ "pmaxub "#f10", "#f10", $f14 \n\t" \
+ "paddw $f4, $f0, $f12 \n\t" \
+ "paddw $f6, $f0, $f14 \n\t" \
+ "pasubub $f12, $f12, $f0 \n\t" \
+ "pasubub $f14, $f14, $f2 \n\t" \
+ "biadd $f12, $f12 \n\t" \
+ "biadd $f14, $f14 \n\t" \
+ "paddw "#f0", "#f0", $f12 \n\t" \
+ "paddw "#f2", "#f2", $f14 \n\t" \
+ "paddw $f12, $f0, $f4 \n\t" \
+ "paddw $f14, $f0, $f6 \n\t" \
+ "punpcklbh $f4, $f12, $f0 \n\t" \
+ "punpckhbh $f6, $f12, $f0 \n\t" \
+ "punpcklbh $f12, $f14, $f2 \n\t" \
+ "punpckhbh $f14, $f14, $f2 \n\t" \
+ "pmaddhw $f4, $f4, $f4 \n\t" \
+ "pmaddhw $f6, $f6, $f6 \n\t" \
+ "pmaddhw $f12, $f12, $f12 \n\t" \
+ "pmaddhw $f14, $f14, $f14 \n\t" \
+ "paddw "#f12", "#f12", $f4 \n\t" \
+ "paddw "#f14", "#f14", $f6 \n\t" \
+ "paddw "#f12", "#f12", $f12 \n\t" \
+ "paddw "#f14", "#f14", $f14 \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r2" \n\t" \
+ PTR_ADDU ""#r1", "#r1", "#r2" \n\t"
+
+#define WELS_SAD_SUM_SQSUM_16x1_MMI(r0, r1, r2) \
+ "gslqc1 $f6, $f4, 0x0("#r0") \n\t" \
+ "gslqc1 $f10, $f8, 0x0("#r1") \n\t" \
+ "pasubub $f12, $f4, $f8 \n\t" \
+ "pasubub $f14, $f6, $f10 \n\t" \
+ "biadd $f12, $f12 \n\t" \
+ "biadd $f14, $f14 \n\t" \
+ "paddw $f24, $f24, $f12 \n\t" \
+ "paddw $f26, $f26, $f14 \n\t" \
+ "pasubub $f12, $f4, $f0 \n\t" \
+ "pasubub $f14, $f6, $f2 \n\t" \
+ "biadd $f12, $f12 \n\t" \
+ "biadd $f14, $f14 \n\t" \
+ "paddw $f20, $f20, $f12 \n\t" \
+ "paddw $f22, $f22, $f14 \n\t" \
+ "punpcklbh $f8, $f6, $f2 \n\t" \
+ "punpckhbh $f10, $f6, $f2 \n\t" \
+ "punpckhbh $f6, $f4, $f0 \n\t" \
+ "punpcklbh $f4, $f4, $f0 \n\t" \
+ "pmaddhw $f4, $f4, $f4 \n\t" \
+ "pmaddhw $f6, $f6, $f6 \n\t" \
+ "pmaddhw $f8, $f8, $f8 \n\t" \
+ "pmaddhw $f10, $f10, $f10 \n\t" \
+ "paddw $f16, $f16, $f4 \n\t" \
+ "paddw $f18, $f18, $f6 \n\t" \
+ "paddw $f16, $f16, $f8 \n\t" \
+ "paddw $f18, $f18, $f10 \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r2" \n\t" \
+ PTR_ADDU ""#r1", "#r1", "#r2" \n\t"
+
+void VAACalcSad_mmi(const uint8_t* pCurData, const uint8_t* pRefData,
+ int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+ int32_t* pFrameSad, int32_t* pSad8x8) {
+ double ftmp[13];
+ uint64_t tmp[2];
+ mips_reg addr[3];
+
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ PTR_SRL "%[iPicWidth], %[iPicWidth], 0x04 \n\t"
+ PTR_SRL "%[iPicHeight], %[iPicHeight], 0x04 \n\t"
+ "move %[addr2], %[iPicStride] \n\t"
+ PTR_SLL "%[iPicStride], %[iPicStride], 0x04 \n\t"
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "xor %[ftmp11], %[ftmp11], %[ftmp11] \n\t"
+ "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
+ "1: \n\t"
+ "move %[addr0], %[pCurData] \n\t"
+ "move %[addr1], %[pRefData] \n\t"
+ "move %[tmp0], %[iPicWidth] \n\t"
+ "2: \n\t"
+ "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+ "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
+ %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
+ %[addr0], %[addr1], %[addr2])
+ WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
+ %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
+ %[addr0], %[addr1], %[addr2])
+ WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
+ %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
+ %[addr0], %[addr1], %[addr2])
+ WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
+ %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
+ %[addr0], %[addr1], %[addr2])
+ "paddw %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
+ "paddw %[ftmp12], %[ftmp12], %[ftmp10] \n\t"
+ "swc1 %[ftmp10], 0x00(%[pSad8x8]) \n\t"
+ "swc1 %[ftmp9], 0x04(%[pSad8x8]) \n\t"
+
+ "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
+ "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
+ WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
+ %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
+ %[addr0], %[addr1], %[addr2])
+ WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
+ %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
+ %[addr0], %[addr1], %[addr2])
+ WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
+ %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
+ %[addr0], %[addr1], %[addr2])
+ WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
+ %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
+ %[addr0], %[addr1], %[addr2])
+ "paddw %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
+ "paddw %[ftmp12], %[ftmp12], %[ftmp10] \n\t"
+ "swc1 %[ftmp10], 0x08(%[pSad8x8]) \n\t"
+ "swc1 %[ftmp9], 0x0c(%[pSad8x8]) \n\t"
+
+ PTR_ADDU "%[pSad8x8], %[pSad8x8], 0x10 \n\t"
+ PTR_SUBU "%[addr0], %[addr0], %[iPicStride] \n\t"
+ PTR_SUBU "%[addr1], %[addr1], %[iPicStride] \n\t"
+ PTR_ADDI "%[tmp0], %[tmp0], -0x01 \n\t"
+ PTR_ADDU "%[addr0], %[addr0], 0x10 \n\t"
+ PTR_ADDU "%[addr1], %[addr1], 0x10 \n\t"
+ "bnez %[tmp0], 2b \n\t"
+
+ PTR_ADDI "%[iPicHeight], %[iPicHeight], -0x01 \n\t"
+ PTR_ADDU "%[pCurData], %[pCurData], %[iPicStride] \n\t"
+ PTR_ADDU "%[pRefData], %[pRefData], %[iPicStride] \n\t"
+ "bnez %[iPicHeight], 1b \n\t"
+
+ "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
+ "swc1 %[ftmp11], 0x00(%[pFrameSad]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
+ [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
+ [pCurData]"+&r"(pCurData), [pRefData]"+&r"(pRefData),
+ [iPicHeight]"+&r"(iPicHeight), [iPicWidth]"+&r"(iPicWidth),
+ [pSad8x8]"+&r"(pSad8x8), [iPicStride]"+&r"(iPicStride),
+ [addr2]"=&r"(addr[2])
+ : [pFrameSad]"r"(pFrameSad)
+ : "memory"
+ );
+}
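+
+/* In scalar terms (a sketch; 8x8 block order follows the four stores at
+ * offsets 0x00/0x04/0x08/0x0c of pSad8x8):
+ *
+ *   for each 16x16 macroblock of the picture:
+ *     pSad8x8[k] = SAD between pCurData and pRefData over the k-th 8x8 quarter;
+ *   *pFrameSad  = sum of all pSad8x8 entries.
+ */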
+
+void VAACalcSadBgd_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
+ int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+ int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8,
+ uint8_t *p_mad8x8) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "move $15, %[cur_data] \n\t"
+ "dsrl %[iPicWidth], %[iPicWidth], 0x4 \n\t"
+ "dsrl %[iPicHeight], %[iPicHeight], 0x4 \n\t"
+ "dsll $13, %[iPicStride], 0x4 \n\t"
+ "xor $f0, $f0, $f0 \n\t"
+ "xor $f2, $f2, $f2 \n\t"
+ "xor $14, $14, $14 \n\t"
+ "1: \n\t"
+ "move $9, %[iPicWidth] \n\t"
+ "move $10, $15 \n\t"
+ "move $11, %[ref_data] \n\t"
+ "2: \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "xor $f30, $f30, $f30 \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "xor $f20, $f20, $f20 \n\t"
+ "xor $f22, $f22, $f22 \n\t"
+ "xor $f16, $f16, $f16 \n\t"
+ "xor $f18, $f18, $f18 \n\t"
+ WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+ $15, %[ref_data], %[iPicStride])
+ WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+ $15, %[ref_data], %[iPicStride])
+ WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+ $15, %[ref_data], %[iPicStride])
+ WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+ $15, %[ref_data], %[iPicStride])
+ WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+ $15, %[ref_data], %[iPicStride])
+ WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+ $15, %[ref_data], %[iPicStride])
+ WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+ $15, %[ref_data], %[iPicStride])
+ WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+ $15, %[ref_data], %[iPicStride])
+
+ "dli $8, 0x1 \n\t"
+ "dmtc1 $8, $f8 \n\t"
+ "dli $8, 0x8 \n\t"
+ "dmtc1 $8, $f10 \n\t"
+ WELS_MAX_REG_MMI($f16, $f18, $f8, $f10)
+
+ "dmfc1 $8, $f16 \n\t"
+ "sb $8, 0x0(%[p_mad8x8]) \n\t"
+ "dmfc1 $8, $f18 \n\t"
+ "sb $8, 0x1(%[p_mad8x8]) \n\t"
+ PTR_ADDIU "%[p_mad8x8], %[p_mad8x8], 0x2 \n\t"
+
+ "xor $f16, $f16, $f16 \n\t"
+ "xor $f18, $f18, $f18 \n\t"
+ "punpcklwd $f30, $f30, $f30 \n\t"
+ "punpcklwd $f26, $f26, $f26 \n\t"
+ "punpcklwd $f22, $f22, $f22 \n\t"
+
+ "punpckhwd $f30, $f28, $f30 \n\t"
+ "punpckhwd $f26, $f24, $f26 \n\t"
+ "punpckhwd $f22, $f20, $f22 \n\t"
+
+ "punpcklwd $f28, $f16, $f28 \n\t"
+ "punpcklwd $f24, $f16, $f24 \n\t"
+ "punpcklwd $f20, $f16, $f20 \n\t"
+
+ WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+ $15, %[ref_data], %[iPicStride])
+ WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+ $15, %[ref_data], %[iPicStride])
+ WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+ $15, %[ref_data], %[iPicStride])
+ WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+ $15, %[ref_data], %[iPicStride])
+ WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+ $15, %[ref_data], %[iPicStride])
+ WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+ $15, %[ref_data], %[iPicStride])
+ WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+ $15, %[ref_data], %[iPicStride])
+ WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
+ $15, %[ref_data], %[iPicStride])
+
+ "dli $8, 0x1 \n\t"
+ "dmtc1 $8, $f8 \n\t"
+ "dli $8, 0x8 \n\t"
+ "dmtc1 $8, $f10 \n\t"
+ WELS_MAX_REG_MMI($f16, $f18, $f8, $f10)
+
+ "dmfc1 $8, $f16 \n\t"
+ "sb $8, 0x0(%[p_mad8x8]) \n\t"
+ "dmfc1 $8, $f18 \n\t"
+ "sb $8, 0x1(%[p_mad8x8]) \n\t"
+ "punpckhwd $f4, $f28, $f30 \n\t"
+ PTR_ADDIU "%[p_mad8x8], %[p_mad8x8], 0x2 \n\t"
+
+ "punpcklwd $f6, $f28, $f30 \n\t"
+ "gssqc1 $f6, $f4, 0x0(%[psad8x8]) \n\t"
+ PTR_ADDIU "%[psad8x8], %[psad8x8], 0x10 \n\t"
+
+ "paddw $f6, $f6, $f30 \n\t"
+ "paddw $f4, $f4, $f28 \n\t"
+ "punpckhwd $f8, $f6, $f6 \n\t"
+ "paddw $f4, $f4, $f8 \n\t"
+ "dmtc1 $14, $f6 \n\t"
+ "paddw $f6, $f6, $f4 \n\t"
+ "dmfc1 $14, $f6 \n\t"
+
+ "psubw $f24, $f24, $f20 \n\t"
+ "psubw $f26, $f26, $f22 \n\t"
+ "punpckhwd $f4, $f24, $f26 \n\t"
+ "punpcklwd $f6, $f24, $f26 \n\t"
+ "gssqc1 $f6, $f4, 0x0(%[p_sd8x8]) \n\t"
+ PTR_ADDIU "%[p_sd8x8], %[p_sd8x8], 0x10 \n\t"
+
+ PTR_SUBU "$15, $15, $13 \n\t"
+ PTR_SUBU "%[ref_data], %[ref_data], $13 \n\t"
+ PTR_ADDIU "$15, $15, 0x10 \n\t"
+ PTR_ADDIU "%[ref_data], %[ref_data], 0x10 \n\t"
+
+ PTR_ADDIU "%[iPicWidth], %[iPicWidth], -0x1 \n\t"
+ "bnez %[iPicWidth], 2b \n\t"
+ "move %[iPicWidth], $9 \n\t"
+ "move $15, $10 \n\t"
+ "move %[ref_data], $11 \n\t"
+ PTR_ADDU "$15, $15, $13 \n\t"
+ PTR_ADDU "%[ref_data], %[ref_data], $13 \n\t"
+
+ PTR_ADDIU "%[iPicHeight], %[iPicHeight], -0x1 \n\t"
+ "bnez %[iPicHeight], 1b \n\t"
+
+ "swl $14, 0x3(%[psadframe]) \n\t"
+ "swr $14, 0x0(%[psadframe]) \n\t"
+ : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
+ [iPicHeight]"+&r"((int)iPicHeight), [psad8x8]"+&r"((int *)psad8x8),
+ [p_sd8x8]"+&r"((int *)p_sd8x8), [p_mad8x8]"+&r"((unsigned char *)p_mad8x8)
+ : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
+ [psadframe]"r"((int *)psadframe)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
+ "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
+ "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
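+
+/* Relative to VAACalcSad_mmi, this Bgd variant additionally emits per 8x8
+ * block (as read from the macro register flow): p_sd8x8 = sum(cur) - sum(ref),
+ * the signed pixel-sum difference, and p_mad8x8 = the maximum per-pixel
+ * absolute difference, reduced with WELS_MAX_REG_MMI; both feed the encoder's
+ * background detection.
+ */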
+
+void VAACalcSadSsd_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
+ int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+ int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
+ int32_t *psqsum16x16, int32_t *psqdiff16x16) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "move $15, %[cur_data] \n\t"
+ "dsrl %[iPicWidth], %[iPicWidth], 0x4 \n\t"
+ "dsrl %[iPicHeight], %[iPicHeight], 0x4 \n\t"
+ "dsll $13, %[iPicStride], 0x4 \n\t"
+ "xor $f0, $f0, $f0 \n\t"
+ "xor $f2, $f2, $f2 \n\t"
+ "xor $12, $12, $12 \n\t"
+ "xor $14, $14, $14 \n\t"
+ "1: \n\t"
+ "move $9, %[iPicWidth] \n\t"
+ "move $10, $15 \n\t"
+ "move $11, %[ref_data] \n\t"
+ "2: \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "xor $f30, $f30, $f30 \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "xor $f20, $f20, $f20 \n\t"
+ "xor $f22, $f22, $f22 \n\t"
+ "xor $f16, $f16, $f16 \n\t"
+ "xor $f18, $f18, $f18 \n\t"
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+ "dmfc1 $8, $f28 \n\t"
+ "sw $8, 0x0(%[psad8x8]) \n\t"
+ "dmfc1 $8, $f30 \n\t"
+ "sw $8, 0x4(%[psad8x8]) \n\t"
+ "paddw $f4, $f28, $f30 \n\t"
+ "dmfc1 $12, $f4 \n\t"
+ PTR_ADDU "$14, $14, $12 \n\t"
+
+ "xor $f28, $f28, $f28 \n\t"
+ "xor $f30, $f30, $f30 \n\t"
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
+ "dmfc1 $8, $f28 \n\t"
+ "sw $8, 0x8(%[psad8x8]) \n\t"
+ "dmfc1 $8, $f30 \n\t"
+ "paddw $f4, $f28, $f30 \n\t"
+ "sw $8, 0xc(%[psad8x8]) \n\t"
+ "dmfc1 $12, $f4 \n\t"
+ PTR_ADDU "$14, $14, $12 \n\t"
+ PTR_ADDIU "%[psad8x8], %[psad8x8], 0x10 \n\t"
+
+ "paddw $f24, $f24, $f26 \n\t"
+ "dmfc1 $8, $f24 \n\t"
+ "sw $8, 0x0(%[psum16x16]) \n\t"
+ PTR_ADDIU "%[psum16x16], %[psum16x16], 0x4 \n\t"
+ "paddw $f24, $f20, $f22 \n\t"
+ "punpcklwd $f20, $f24, $f24 \n\t"
+ "punpckhwd $f22, $f24, $f24 \n\t"
+ "paddw $f20, $f20, $f22 \n\t"
+ "dmfc1 $8, $f20 \n\t"
+ "sw $8, 0x0(%[psqsum16x16]) \n\t"
+ PTR_ADDIU "%[psqsum16x16], %[psqsum16x16], 0x4 \n\t"
+
+ "paddw $f20, $f16, $f18 \n\t"
+ "punpcklwd $f16, $f20, $f20 \n\t"
+ "punpckhwd $f18, $f20, $f20 \n\t"
+ "paddw $f16, $f16, $f18 \n\t"
+ "dmfc1 $8, $f16 \n\t"
+ "sw $8, 0x0(%[psqdiff16x16]) \n\t"
+ PTR_ADDIU "%[psqdiff16x16], %[psqdiff16x16], 0x4 \n\t"
+
+ PTR_SUBU "$15, $15, $13 \n\t"
+ PTR_SUBU "%[ref_data], %[ref_data], $13 \n\t"
+ PTR_ADDIU "$15, $15, 0x10 \n\t"
+ PTR_ADDIU "%[ref_data], %[ref_data], 0x10 \n\t"
+
+ PTR_ADDIU "%[iPicWidth], %[iPicWidth], -0x1 \n\t"
+ "bnez %[iPicWidth], 2b \n\t"
+ "nop \n\t"
+ "move %[iPicWidth], $9 \n\t"
+ "move $15, $10 \n\t"
+ "move %[ref_data], $11 \n\t"
+ PTR_ADDU "$15, $15, $13 \n\t"
+ PTR_ADDU "%[ref_data], %[ref_data], $13 \n\t"
+
+ PTR_ADDIU "%[iPicHeight], %[iPicHeight], -0x1 \n\t"
+ "bnez %[iPicHeight], 1b \n\t"
+ "nop \n\t"
+
+ "sw $14, 0x0(%[psadframe]) \n\t"
+    : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
+      [iPicHeight]"+&r"((int)iPicHeight), [psum16x16]"+&r"((int *)psum16x16),
+      [psqsum16x16]"+&r"((int *)psqsum16x16), [psqdiff16x16]"+&r"((int *)psqdiff16x16),
+      [psad8x8]"+&r"((int *)psad8x8)
+    : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
+      [psadframe]"r"((int *)psadframe)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
+ "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
+ "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+void VAACalcSadSsdBgd_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
+ int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+ int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
+ int32_t *psqsum16x16, int32_t *psqdiff16x16, int32_t *p_sd8x8,
+ uint8_t *p_mad8x8) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "move $15, %[cur_data] \n\t"
+ "dsrl %[iPicWidth], %[iPicWidth], 0x4 \n\t"
+ "dsrl %[iPicHeight], %[iPicHeight], 0x4 \n\t"
+ "dsll $13, %[iPicStride], 0x4 \n\t"
+ "xor $f0, $f0, $f0 \n\t"
+ "xor $f2, $f2, $f2 \n\t"
+ "xor $12, $12, $12 \n\t"
+ "xor $14, $14, $14 \n\t"
+ "1: \n\t"
+ "move $9, %[iPicWidth] \n\t"
+ "move $10, $15 \n\t"
+ "move $11, %[ref_data] \n\t"
+ "2: \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "xor $f30, $f30, $f30 \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "xor $f20, $f20, $f20 \n\t"
+ "xor $f22, $f22, $f22 \n\t"
+ "xor $f16, $f16, $f16 \n\t"
+ "xor $f18, $f18, $f18 \n\t"
+ WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+ $f18, $15, %[ref_data], %[iPicStride])
+ WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+ $f18, $15, %[ref_data], %[iPicStride])
+ WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+ $f18, $15, %[ref_data], %[iPicStride])
+ WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+ $f18, $15, %[ref_data], %[iPicStride])
+ WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+ $f18, $15, %[ref_data], %[iPicStride])
+ WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+ $f18, $15, %[ref_data], %[iPicStride])
+ WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+ $f18, $15, %[ref_data], %[iPicStride])
+ WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+ $f18, $15, %[ref_data], %[iPicStride])
+
+ "dmfc1 $8, $f28 \n\t"
+ "sw $8, 0x0(%[psad8x8]) \n\t"
+ "dmfc1 $8, $f30 \n\t"
+ "sw $8, 0x4(%[psad8x8]) \n\t"
+ PTR_ADDIU "%[psad8x8], %[psad8x8], 0x8 \n\t"
+
+ "paddw $f4, $f28, $f30 \n\t"
+ "dmfc1 $12, $f4 \n\t"
+ PTR_ADDU "$14, $14, $12 \n\t"
+
+ "paddw $f4, $f24, $f26 \n\t"
+ "dmfc1 $8, $f4 \n\t"
+ "sw $8, 0x0(%[psum16x16]) \n\t"
+
+ "punpckhwd $f4, $f24, $f26 \n\t"
+ "punpcklwd $f6, $f24, $f26 \n\t"
+ "psubw $f6, $f6, $f4 \n\t"
+ "dmfc1 $8, $f6 \n\t"
+ PTR_S "$8, 0x0(%[p_sd8x8]) \n\t"
+ PTR_ADDIU "%[p_sd8x8], %[p_sd8x8], 0x8 \n\t"
+
+ "dli $8, 0x1 \n\t"
+ "dmtc1 $8, $f8 \n\t"
+ "dli $8, 0x8 \n\t"
+ "dmtc1 $8, $f10 \n\t"
+ WELS_MAX_REG_MMI($f20, $f22, $f8, $f10)
+
+ "dmfc1 $8, $f20 \n\t"
+ "sb $8, 0x0(%[p_mad8x8]) \n\t"
+ "dmfc1 $8, $f22 \n\t"
+ "sb $8, 0x1(%[p_mad8x8]) \n\t"
+ PTR_ADDIU "%[p_mad8x8], %[p_mad8x8], 0x2 \n\t"
+
+ "xor $f20, $f20, $f20 \n\t"
+ "xor $f22, $f22, $f22 \n\t"
+ "punpckhwd $f28, $f20, $f28 \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "punpckhwd $f30, $f20, $f30 \n\t"
+ WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+ $f18, $15, %[ref_data], %[iPicStride])
+ WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+ $f18, $15, %[ref_data], %[iPicStride])
+ WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+ $f18, $15, %[ref_data], %[iPicStride])
+ WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+ $f18, $15, %[ref_data], %[iPicStride])
+ WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+ $f18, $15, %[ref_data], %[iPicStride])
+ WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+ $f18, $15, %[ref_data], %[iPicStride])
+ WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+ $f18, $15, %[ref_data], %[iPicStride])
+ WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
+ $f18, $15, %[ref_data], %[iPicStride])
+
+ "dmfc1 $8, $f28 \n\t"
+ "sw $8, 0x0(%[psad8x8]) \n\t"
+ "dmfc1 $8, $f30 \n\t"
+ "sw $8, 0x4(%[psad8x8]) \n\t"
+ PTR_ADDIU "%[psad8x8], %[psad8x8], 0x8 \n\t"
+
+ "paddw $f4, $f28, $f30 \n\t"
+ "dmfc1 $12, $f4 \n\t"
+ PTR_ADDU "$14, $14, $12 \n\t"
+
+ "paddw $f4, $f24, $f26 \n\t"
+ "dmfc1 $8, $f4 \n\t"
+ "lw $12, 0x0(%[psum16x16]) \n\t"
+ PTR_ADDU "$8, $8, $12 \n\t"
+ "sw $8, 0x0(%[psum16x16]) \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+ PTR_ADDIU "%[psum16x16], %[psum16x16], 0x4 \n\t"
+
+ "punpckhwd $f30, $f30, $f8 \n\t"
+ "punpckhwd $f28, $f28, $f8 \n\t"
+ "paddw $f8, $f28, $f30 \n\t"
+ "dmfc1 $8, $f8 \n\t"
+ "sw $8, 0x0(%[psqsum16x16]) \n\t"
+ PTR_ADDIU "%[psqsum16x16], %[psqsum16x16], 0x4 \n\t"
+
+ "punpckhwd $f4, $f24, $f26 \n\t"
+ "punpcklwd $f6, $f24, $f26 \n\t"
+ "psubw $f6, $f6, $f4 \n\t"
+ "dmfc1 $8, $f6 \n\t"
+ PTR_S "$8, 0x0(%[p_sd8x8]) \n\t"
+ PTR_ADDIU "%[p_sd8x8], %[p_sd8x8], 0x8 \n\t"
+
+ "dli $8, 0x1 \n\t"
+ "dmtc1 $8, $f8 \n\t"
+ "dli $8, 0x8 \n\t"
+ "dmtc1 $8, $f10 \n\t"
+ WELS_MAX_REG_MMI($f20, $f22, $f8, $f10)
+
+ "dmfc1 $8, $f20 \n\t"
+ "sb $8, 0x0(%[p_mad8x8]) \n\t"
+ "dmfc1 $8, $f22 \n\t"
+ "sb $8, 0x1(%[p_mad8x8]) \n\t"
+ PTR_ADDIU "%[p_mad8x8], %[p_mad8x8], 0x2 \n\t"
+
+ "paddw $f20, $f16, $f18 \n\t"
+ "punpcklwd $f16, $f20, $f20 \n\t"
+ "punpckhwd $f18, $f20, $f20 \n\t"
+ "paddw $f16, $f16, $f18 \n\t"
+ "dmfc1 $8, $f16 \n\t"
+ "sw $8, 0x0(%[psqdiff16x16]) \n\t"
+ PTR_ADDIU "%[psqdiff16x16], %[psqdiff16x16], 0x4 \n\t"
+
+ PTR_SUBU "$15, $15, $13 \n\t"
+ PTR_SUBU "%[ref_data], %[ref_data], $13 \n\t"
+ PTR_ADDIU "$15, $15, 0x10 \n\t"
+ PTR_ADDIU "%[ref_data], %[ref_data], 0x10 \n\t"
+
+ PTR_ADDIU "%[iPicWidth], %[iPicWidth], -0x1 \n\t"
+ "bnez %[iPicWidth], 2b \n\t"
+ "nop \n\t"
+ "move %[iPicWidth], $9 \n\t"
+ "move $15, $10 \n\t"
+ "move %[ref_data], $11 \n\t"
+ PTR_ADDU "$15, $15, $13 \n\t"
+ PTR_ADDU "%[ref_data], %[ref_data], $13 \n\t"
+
+ PTR_ADDIU "%[iPicHeight], %[iPicHeight], -0x1 \n\t"
+ "bnez %[iPicHeight], 1b \n\t"
+ "nop \n\t"
+
+ "sw $14, 0x0(%[psadframe]) \n\t"
+ : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
+ [iPicHeight]"+&r"((int)iPicHeight), [psad8x8]"+&r"((int *)psad8x8),
+ [psum16x16]"+&r"((int *)psum16x16), [psqsum16x16]"+&r"((int *)psqsum16x16),
+ [psqdiff16x16]"+&r"((int *)psqdiff16x16), [p_sd8x8]"+&r"((int *)p_sd8x8),
+ [p_mad8x8]"+&r"((unsigned char *)p_mad8x8)
+ : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
+ [psadframe]"r"((int *)psadframe)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
+ "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
+ "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
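VAACalcSadSsdBgd_mmi emits the densest output set of the family: per 16x16 block it stores four 8x8 SADs, the block's pixel sum, squared-pixel sum, and squared difference against the reference, plus per 8x8 sub-block the signed difference sum (p_sd8x8) and the max absolute difference clamped to a byte (p_mad8x8), while $14 accumulates the frame SAD. A rough scalar model of one block's outputs (a sketch under those assumptions; the project's actual reference is VAACalcSadSsdBgd_c):

    #include <stdint.h>
    #include <stdlib.h>

    static void SadSsdBgdBlock (const uint8_t *cur, const uint8_t *ref, int stride,
                                int32_t sad8x8[4], int32_t *sum16x16,
                                int32_t *sqsum16x16, int32_t *sqdiff16x16,
                                int32_t sd8x8[4], uint8_t mad8x8[4]) {
      *sum16x16 = *sqsum16x16 = *sqdiff16x16 = 0;
      for (int b = 0; b < 4; b++) {                 /* four 8x8 sub-blocks */
        const uint8_t *c = cur + (b >> 1) * 8 * stride + (b & 1) * 8;
        const uint8_t *r = ref + (b >> 1) * 8 * stride + (b & 1) * 8;
        int32_t sad = 0, sd = 0, mad = 0;
        for (int y = 0; y < 8; y++) {
          for (int x = 0; x < 8; x++) {
            int d = c[y * stride + x] - r[y * stride + x];
            sad += abs (d);                          /* -> psad8x8, psadframe */
            sd  += d;                                /* -> p_sd8x8 */
            if (abs (d) > mad) mad = abs (d);        /* -> p_mad8x8 */
            *sum16x16    += c[y * stride + x];       /* -> psum16x16 */
            *sqsum16x16  += c[y * stride + x] * c[y * stride + x];
            *sqdiff16x16 += d * d;                   /* -> psqdiff16x16 */
          }
        }
        sad8x8[b] = sad;  sd8x8[b] = sd;  mad8x8[b] = (uint8_t)mad;
      }
    }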
+void VAACalcSadVar_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
+ int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+ int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
+ int32_t *psqsum16x16) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "move $15, %[cur_data] \n\t"
+ "dsrl %[iPicWidth], %[iPicWidth], 0x4 \n\t"
+ "dsrl %[iPicHeight], %[iPicHeight], 0x4 \n\t"
+ "dsll $13, %[iPicStride], 0x4 \n\t"
+ "xor $f0, $f0, $f0 \n\t"
+ "xor $f2, $f2, $f2 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "xor $f30, $f30, $f30 \n\t"
+ "xor $14, $14, $14 \n\t"
+ "1: \n\t"
+ "move $9, %[iPicWidth] \n\t"
+ "move $10, $15 \n\t"
+ "move $11, %[ref_data] \n\t"
+ "2: \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "xor $f20, $f20, $f20 \n\t"
+ "xor $f22, $f22, $f22 \n\t"
+ "xor $f16, $f16, $f16 \n\t"
+ "xor $f18, $f18, $f18 \n\t"
+ WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+ "paddw $f28, $f24, $f28 \n\t"
+ "paddw $f30, $f26, $f30 \n\t"
+ "dmfc1 $8, $f24 \n\t"
+ "sw $8, 0x0(%[psad8x8]) \n\t"
+ "dmfc1 $8, $f26 \n\t"
+ "sw $8, 0x4(%[psad8x8]) \n\t"
+
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+ WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
+ "paddw $f28, $f24, $f28 \n\t"
+ "paddw $f30, $f26, $f30 \n\t"
+ "dmfc1 $8, $f24 \n\t"
+ "sw $8, 0x8(%[psad8x8]) \n\t"
+ "dmfc1 $8, $f26 \n\t"
+ "sw $8, 0xc(%[psad8x8]) \n\t"
+ PTR_ADDIU "%[psad8x8], %[psad8x8], 0x10 \n\t"
+
+ "paddw $f20, $f20, $f22 \n\t"
+ "dmfc1 $8, $f20 \n\t"
+ "sw $8, 0x0(%[psum16x16]) \n\t"
+ PTR_ADDIU "%[psum16x16], %[psum16x16], 0x4 \n\t"
+
+ "paddw $f20, $f16, $f18 \n\t"
+ "punpcklwd $f16, $f20, $f20 \n\t"
+ "punpckhwd $f18, $f20, $f20 \n\t"
+ "paddw $f16, $f16, $f18 \n\t"
+ "dmfc1 $8, $f16 \n\t"
+ "sw $8, 0x0(%[psqsum16x16]) \n\t"
+ PTR_ADDIU "%[psqsum16x16], %[psqsum16x16], 0x4 \n\t"
+
+ PTR_SUBU "$15, $15, $13 \n\t"
+ PTR_SUBU "%[ref_data], %[ref_data], $13 \n\t"
+ PTR_ADDIU "$15, $15, 0x10 \n\t"
+ PTR_ADDIU "%[ref_data], %[ref_data], 0x10 \n\t"
+
+ PTR_ADDIU "%[iPicWidth], %[iPicWidth], -0x1 \n\t"
+ "bnez %[iPicWidth], 2b \n\t"
+ "nop \n\t"
+ "move %[iPicWidth], $9 \n\t"
+ "move $15, $10 \n\t"
+ "move %[ref_data], $11 \n\t"
+ PTR_ADDU "$15, $15, $13 \n\t"
+ PTR_ADDU "%[ref_data], %[ref_data], $13 \n\t"
+
+ PTR_ADDIU "%[iPicHeight], %[iPicHeight], -0x1 \n\t"
+ "bnez %[iPicHeight], 1b \n\t"
+ "nop \n\t"
+
+ "paddw $f28, $f28, $f30 \n\t"
+ "dmfc1 $8, $f28 \n\t"
+ "sw $8, 0x0(%[psadframe]) \n\t"
+ : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
+ [iPicHeight]"+&r"((int)iPicHeight), [psum16x16]"+&r"((int *)psum16x16),
+ [psqsum16x16]"+&r"((int *)psqsum16x16)
+ : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
+ [psadframe]"r"((int *)psadframe), [psad8x8]"r"((int *)psad8x8)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
+ "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
+ "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
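All five kernels drive the same two-level loop: iPicWidth and iPicHeight are pre-shifted right by 4 so they count 16x16 blocks, $13 caches 16*iPicStride so the inner loop can rewind the 16 rows it just walked before stepping 16 columns right, and $9/$10/$11 snapshot the per-stripe state restored after labels 2:/1:. The equivalent C skeleton (a sketch; block bodies elided):

    #include <stdint.h>

    static void ForEachBlock16x16 (const uint8_t *cur_data, const uint8_t *ref_data,
                                   int iPicWidth, int iPicHeight, int iPicStride) {
      for (int my = 0; my < (iPicHeight >> 4); my++) {   /* asm label 1: */
        const uint8_t *c = cur_data, *r = ref_data;      /* $10/$11 snapshots */
        for (int mx = 0; mx < (iPicWidth >> 4); mx++) {  /* asm label 2: */
          /* block body: walks 16 rows, advancing c/r by iPicStride per row;
             the epilogue subtracts $13 (16*iPicStride) and adds 16 columns */
          c += 16;
          r += 16;
        }
        cur_data += 16 * iPicStride;                     /* next block row */
        ref_data += 16 * iPicStride;
      }
    }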
--- a/codec/processing/src/vaacalc/vaacalculation.cpp
+++ b/codec/processing/src/vaacalc/vaacalculation.cpp
@@ -93,6 +93,16 @@
sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_AArch64_neon;
}
#endif//HAVE_NEON_AARCH64
+
+#ifdef HAVE_MMI
+ if ((iCpuFlag & WELS_CPU_MMI) == WELS_CPU_MMI) {
+ sVaaFuncs.pfVAACalcSad = VAACalcSad_mmi;
+ sVaaFuncs.pfVAACalcSadBgd = VAACalcSadBgd_mmi;
+ sVaaFuncs.pfVAACalcSadSsd = VAACalcSadSsd_mmi;
+ sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_mmi;
+ sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_mmi;
+ }
+#endif//HAVE_MMI
}
EResult CVAACalculation::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
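The `(iCpuFlag & WELS_CPU_MMI) == WELS_CPU_MMI` guard is the usual single-bit feature test, and because it runs after the C/SSE/NEON assignments, the MMI pointers win whenever the runtime flag is present. Callers never name the ISA; they go through the table. A call-site fragment for orientation (names invented for illustration; the signature is assumed from the VAACalcSadFunc typedef):

    /* Hypothetical call site: the dispatch above already picked the kernel. */
    int32_t iFrameSad = 0;
    sVaaFuncs.pfVAACalcSad (pCurData, pRefData, iPicWidth, iPicHeight,
                            iPicStride, &iFrameSad, pSad8x8);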
--- a/codec/processing/src/vaacalc/vaacalculation.h
+++ b/codec/processing/src/vaacalc/vaacalculation.h
@@ -132,6 +132,16 @@
WELSVP_EXTERN_C_END
#endif
+#ifdef HAVE_MMI
+WELSVP_EXTERN_C_BEGIN
+VAACalcSadBgdFunc VAACalcSadBgd_mmi;
+VAACalcSadSsdBgdFunc VAACalcSadSsdBgd_mmi;
+VAACalcSadFunc VAACalcSad_mmi;
+VAACalcSadVarFunc VAACalcSadVar_mmi;
+VAACalcSadSsdFunc VAACalcSadSsd_mmi;
+WELSVP_EXTERN_C_END
+#endif
+
class CVAACalculation : public IStrategy {
public:
CVAACalculation (int32_t iCpuFlag);
--- a/codec/processing/targets.mk
+++ b/codec/processing/targets.mk
@@ -58,10 +58,22 @@
endif
OBJS += $(PROCESSING_OBJSARM64)
+PROCESSING_ASM_MIPS_SRCS=\
+ $(PROCESSING_SRCDIR)/src/mips/vaa_mmi.c\
+
+PROCESSING_OBJSMIPS += $(PROCESSING_ASM_MIPS_SRCS:.c=.$(OBJ))
+ifeq ($(ASM_ARCH), mips)
+PROCESSING_OBJS += $(PROCESSING_OBJSMIPS)
+endif
+OBJS += $(PROCESSING_OBJSMIPS)
+
OBJS += $(PROCESSING_OBJS)
$(PROCESSING_SRCDIR)/%.$(OBJ): $(PROCESSING_SRCDIR)/%.cpp
$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c $(CXX_O) $<
+
+$(PROCESSING_SRCDIR)/%.$(OBJ): $(PROCESSING_SRCDIR)/%.c
+ $(QUIET_CC)$(CC) $(CFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c $(CXX_O) $<
$(PROCESSING_SRCDIR)/%.$(OBJ): $(PROCESSING_SRCDIR)/%.asm
$(QUIET_ASM)$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $@ $<
--- a/test/decoder/DecUT_Deblock.cpp
+++ b/test/decoder/DecUT_Deblock.cpp
@@ -146,3 +146,20 @@
GENERATE_CHROMA_UT (ChromaEq4H_AArch64_neon, DeblockChromaEq4H_AArch64_neon_wrap, DeblockChromaEq4H_c_wrap,
WELS_CPU_NEON, 1)
#endif
+
+#if defined(HAVE_MMI)
+WRAP_LUMA_FUNC (DeblockLumaEq4V_mmi)
+WRAP_LUMA_FUNC (DeblockLumaEq4H_mmi)
+WRAP_CHROMA_FUNC (DeblockChromaEq4V_mmi)
+WRAP_CHROMA_FUNC (DeblockChromaEq4H_mmi)
+
+GENERATE_LUMA_UT (LumaLt4V_mmi, DeblockLumaLt4V_mmi, DeblockLumaLt4V_c, WELS_CPU_MMI, 0)
+GENERATE_LUMA_UT (LumaLt4H_mmi, DeblockLumaLt4H_mmi, DeblockLumaLt4H_c, WELS_CPU_MMI, 1)
+GENERATE_LUMA_UT (LumaEq4V_mmi, DeblockLumaEq4V_mmi_wrap, DeblockLumaEq4V_c_wrap, WELS_CPU_MMI, 0)
+GENERATE_LUMA_UT (LumaEq4H_mmi, DeblockLumaEq4H_mmi_wrap, DeblockLumaEq4H_c_wrap, WELS_CPU_MMI, 1)
+
+GENERATE_CHROMA_UT (ChromaLt4V_mmi, DeblockChromaLt4V_mmi, DeblockChromaLt4V_c, WELS_CPU_MMI, 0)
+GENERATE_CHROMA_UT (ChromaLt4H_mmi, DeblockChromaLt4H_mmi, DeblockChromaLt4H_c, WELS_CPU_MMI, 1)
+GENERATE_CHROMA_UT (ChromaEq4V_mmi, DeblockChromaEq4V_mmi_wrap, DeblockChromaEq4V_c_wrap, WELS_CPU_MMI, 0)
+GENERATE_CHROMA_UT (ChromaEq4H_mmi, DeblockChromaEq4H_mmi_wrap, DeblockChromaEq4H_c_wrap, WELS_CPU_MMI, 1)
+#endif//HAVE_MMI
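Every generator above follows the same recipe: fill two identical random buffers, run the C kernel and the MMI kernel, and require bit-exact output, skipping when the host does not report WELS_CPU_MMI. Schematically, one generated luma test behaves like this (a hand-written sketch with assumed buffer sizes and filter parameters, not the macro's literal expansion):

    /* Sketch of one generated test: C vs. MMI must match bit-exactly. */
    TEST (DeblockSketch, LumaLt4V_mmi) {
      if (! (WelsCPUFeatureDetect (0) & WELS_CPU_MMI))
        return;                              /* no MMI on this core: skip */
      ENFORCE_STACK_ALIGN_1D (uint8_t, pixC, 16 * 16, 16);
      ENFORCE_STACK_ALIGN_1D (uint8_t, pixM, 16 * 16, 16);
      for (int i = 0; i < 16 * 16; i++)
        pixC[i] = pixM[i] = rand () & 255;
      int8_t tc[4] = {1, 1, 1, 1};
      DeblockLumaLt4V_c   (pixC + 8 * 16, 16, 32, 4, tc);  /* reference */
      DeblockLumaLt4V_mmi (pixM + 8 * 16, 16, 32, 4, tc);  /* under test */
      for (int i = 0; i < 16 * 16; i++)
        EXPECT_EQ (pixC[i], pixM[i]);
    }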
--- a/test/decoder/DecUT_DeblockCommon.cpp
+++ b/test/decoder/DecUT_DeblockCommon.cpp
@@ -540,6 +540,17 @@
DeblockingInit (&sDBFunc, 0x000004);
DB_FUNC_CPUFLAG (AArch64_neon)
#endif
+
+#ifdef HAVE_MMI
+ // pure C: no CPU feature bits set
+ DeblockingInit (&sDBFunc, 0x00000000);
+ DB_FUNC_CPUFLAG (c)
+
+ // MMI path: bit 0 is WELS_CPU_MMI
+ DeblockingInit (&sDBFunc, 0x00000001);
+ DB_FUNC_CPUFLAG (mmi)
+#endif
+
}
TEST (DecoderDeblocking, WelsDeblockingFilterSlice) {
--- a/test/decoder/DecUT_IdctResAddPred.cpp
+++ b/test/decoder/DecUT_IdctResAddPred.cpp
@@ -154,6 +154,10 @@
GENERATE_IDCTRESADDPRED (IdctResAddPred_AArch64_neon, WELS_CPU_NEON)
#endif
+#if defined(HAVE_MMI)
+GENERATE_IDCTRESADDPRED (IdctResAddPred_mmi, WELS_CPU_MMI)
+#endif
+
#define GENERATE_SETNONZEROCOUNT(method, flag) \
TEST(DecoderDecodeMbAux, method) \
{\
--- a/test/decoder/DecUT_IntraPrediction.cpp
+++ b/test/decoder/DecUT_IntraPrediction.cpp
@@ -649,3 +649,16 @@
GENERATE_8x8_UT (WelsDecoderIChromaPredPlane_AArch64_neon, WelsIChromaPredPlane_ref, 1, WELS_CPU_NEON)
GENERATE_8x8_UT (WelsDecoderIChromaPredDcTop_AArch64_neon, WelsIChromaPredDcTop_ref, 1, WELS_CPU_NEON)
#endif
+
+#if defined(HAVE_MMI)
+GENERATE_4x4_UT (WelsDecoderI4x4LumaPredH_mmi, LumaI4x4PredH, 1, WELS_CPU_MMI)
+GENERATE_8x8_UT (WelsDecoderIChromaPredDcTop_mmi, WelsIChromaPredDcTop_ref, 1, WELS_CPU_MMI)
+GENERATE_8x8_UT (WelsDecoderIChromaPredDc_mmi, WelsIChromaPredDc_ref, 1, WELS_CPU_MMI)
+GENERATE_8x8_UT (WelsDecoderIChromaPredPlane_mmi, WelsIChromaPredPlane_ref, 1, WELS_CPU_MMI)
+GENERATE_16x16_UT (WelsDecoderI16x16LumaPredPlane_mmi, WelsI16x16LumaPredPlane_ref, 1, WELS_CPU_MMI)
+GENERATE_16x16_UT (WelsDecoderI16x16LumaPredH_mmi, LumaI16x16PredH, 1, WELS_CPU_MMI)
+GENERATE_16x16_UT (WelsDecoderI16x16LumaPredV_mmi, LumaI16x16PredV, 1, WELS_CPU_MMI)
+GENERATE_16x16_UT (WelsDecoderI16x16LumaPredDc_mmi, LumaI16x16PredDC, 1, WELS_CPU_MMI)
+GENERATE_16x16_UT (WelsDecoderI16x16LumaPredDcTop_mmi, LumaI16x16PredDCTop, 1, WELS_CPU_MMI)
+GENERATE_16x16_UT (WelsDecoderI16x16LumaPredDcNA_mmi, LumaI16x16PredDCNone, 1, WELS_CPU_MMI)
+#endif
--- a/test/encoder/EncUT_DecodeMbAux.cpp
+++ b/test/encoder/EncUT_DecodeMbAux.cpp
@@ -246,6 +246,11 @@
}
#endif
#endif
+#if defined(HAVE_MMI)
+TEST (DecodeMbAuxTest, WelsIDctT4Rec_mmi) {
+ TestIDctT4Rec<int16_t> (WelsIDctT4Rec_mmi);
+}
+#endif
template<typename clip_t>
void WelsIDctT8Anchor (uint8_t* p_dst, int16_t dct[4][16]) {
WelsIDctT4Anchor<clip_t> (&p_dst[0], dct[0]);
@@ -367,6 +372,42 @@
14); //2^14 limit, (2^15+32) will cause overflow for SSE2.
WelsIDctRecI16x16DcAnchor (iRefDst, iRefDct);
WelsIDctRecI16x16Dc_sse2 (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);
+ int ok = -1;
+ for (int i = 0; i < 16; i++) {
+ for (int j = 0; j < 16; j++) {
+ if (iRec[i * FDEC_STRIDE + j] != iRefDst[i * FDEC_STRIDE + j]) {
+ ok = i * 16 + j;
+ break;
+ }
+ }
+ }
+ EXPECT_EQ (ok, -1);
+ }
+}
+#endif
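One nit in the comparison loops above and below: the `break` only leaves the inner loop, so with several mismatches `ok` ends up holding the last differing index rather than the first; EXPECT_EQ still fails either way. A stricter helper would return at the first mismatch (a sketch, not part of the patch):

    #include <stdint.h>

    /* Returns the first mismatching index in a 16x16 region, or -1. */
    static int FirstMismatch16x16 (const uint8_t *a, const uint8_t *b, int stride) {
      for (int i = 0; i < 16; i++)
        for (int j = 0; j < 16; j++)
          if (a[i * stride + j] != b[i * stride + j])
            return i * 16 + j;
      return -1;
    }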
+#if defined(HAVE_MMI)
+TEST (DecodeMbAuxTest, WelsIDctFourT4Rec_mmi) {
+ TestIDctFourT4Rec<int16_t> (WelsIDctFourT4Rec_mmi);
+}
+TEST (DecodeMbAuxTest, WelsIDctRecI16x16Dc_mmi) {
+ int32_t iCpuCores = 0;
+ uint32_t uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);
+
+ if (uiCpuFeatureFlag & WELS_CPU_MMI) {
+ uint8_t iRefDst[16 * FDEC_STRIDE];
+ int16_t iRefDct[4][4];
+ ENFORCE_STACK_ALIGN_1D (int16_t, iDct, 16, 16);
+ ENFORCE_STACK_ALIGN_1D (uint8_t, iPred, 16 * FDEC_STRIDE, 16);
+ ENFORCE_STACK_ALIGN_1D (uint8_t, iRec, 16 * FDEC_STRIDE, 16);
+ for (int i = 0; i < 16; i++)
+ for (int j = 0; j < 16; j++)
+ iRefDst[i * FDEC_STRIDE + j] = iPred[i * FDEC_STRIDE + j] = rand() & 255;
+ for (int i = 0; i < 4; i++)
+ for (int j = 0; j < 4; j++)
+ iRefDct[i][j] = iDct[i * 4 + j] = (rand() & ((1 << 15) - 1)) - (1 <<
+ 14); // keep inputs within +/-2^14: values near 2^15+32 overflow the 16-bit SIMD math (note carried over from the SSE2 test).
+ WelsIDctRecI16x16DcAnchor (iRefDst, iRefDct);
+ WelsIDctRecI16x16Dc_mmi (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);
int ok = -1;
for (int i = 0; i < 16; i++) {
for (int j = 0; j < 16; j++) {
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -74,6 +74,39 @@
FREE_MEMORY (iDct);
}
#endif
+#ifdef HAVE_MMI
+TEST (EncodeMbAuxTest, WelsScan4x4Ac_mmi) {
+ CMemoryAlign cMemoryAlign (0);
+ ALLOC_MEMORY (int16_t, iLevelA, 16);
+ ALLOC_MEMORY (int16_t, iLevelB, 16);
+ ALLOC_MEMORY (int16_t, iDct, 16);
+ for (int i = 0; i < 16; i++) {
+ iDct[i] = rand() % 256 + 1;
+ }
+ WelsScan4x4Ac_c (iLevelA, iDct);
+ WelsScan4x4Ac_mmi (iLevelB, iDct);
+ for (int j = 0; j < 16; j++)
+ EXPECT_EQ (iLevelA[j], iLevelB[j]);
+ FREE_MEMORY (iLevelA);
+ FREE_MEMORY (iLevelB);
+ FREE_MEMORY (iDct);
+}
+TEST (EncodeMbAuxTest, WelsScan4x4DcAc_mmi) {
+ CMemoryAlign cMemoryAlign (0);
+ ALLOC_MEMORY (int16_t, iLevelA, 32);
+ ALLOC_MEMORY (int16_t, iLevelB, 32);
+ ALLOC_MEMORY (int16_t, iDct, 32);
+ for (int i = 0; i < 32; i++)
+ iDct[i] = (rand() & 32767) - 16384;
+ WelsScan4x4DcAc_mmi (iLevelA, iDct);
+ WelsScan4x4DcAc_c (iLevelB, iDct);
+ for (int i = 0; i < 16; i++)
+ EXPECT_EQ (iLevelA[i], iLevelB[i]);
+ FREE_MEMORY (iLevelA);
+ FREE_MEMORY (iLevelB);
+ FREE_MEMORY (iDct);
+}
+#endif
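For reference, WelsScan4x4Ac reorders a 4x4 coefficient block into zig-zag order with the DC position dropped and the tail zero-filled, which is what these random-input comparisons pin down. A table-driven sketch of that behavior (the zig-zag order is the standard H.264 one; treat the exact tail handling as an assumption, WelsScan4x4Ac_c is authoritative):

    #include <stdint.h>

    /* Standard H.264 4x4 zig-zag scan order over a row-major block. */
    static const int kZigZag4x4[16] =
        { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };

    /* Sketch of the AC scan: skip the DC slot, zero the last level. */
    static void Scan4x4Ac_sketch (int16_t *level, const int16_t *dct) {
      for (int i = 1; i < 16; i++)
        level[i - 1] = dct[kZigZag4x4[i]];
      level[15] = 0;
    }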
TEST (EncodeMbAuxTest, TestScan_4x4_dcc) {
CMemoryAlign cMemoryAlign (0);
ALLOC_MEMORY (int16_t, iLevel, 16);
@@ -236,7 +269,30 @@
FREE_MEMORY (iDctS);
}
#endif
+#ifdef HAVE_MMI
+TEST (EncodeMbAuxTest, WelsDctT4_mmi) {
+ TestDctT4 (WelsDctT4_mmi);
+}
+TEST (EncodeMbAuxTest, WelsDctFourT4_mmi) {
+ TestDctFourT4 (WelsDctFourT4_mmi);
+}
+
+TEST (EncodeMbAuxTest, WelsCalculateSingleCtr4x4_mmi) {
+ CMemoryAlign cMemoryAlign (0);
+ ALLOC_MEMORY (int16_t, iDctC, 16);
+ ALLOC_MEMORY (int16_t, iDctS, 16);
+ for (int i = 0; i < 16; i++)
+ iDctC[i] = iDctS[i] = (rand() & 65535) - 32768;
+ WelsCalculateSingleCtr4x4_c (iDctC);
+ WelsCalculateSingleCtr4x4_mmi (iDctS);
+ for (int i = 0; i < 16; i++)
+ EXPECT_EQ (iDctC[i], iDctS[i]);
+ FREE_MEMORY (iDctC);
+ FREE_MEMORY (iDctS);
+}
+#endif
+
void copy (uint8_t* pDst, int32_t iDStride, uint8_t* pSrc, int32_t iSStride, int32_t iWidth, int32_t iHeight) {
for (int i = 0; i < iHeight; i++)
memcpy (pDst + i * iDStride, pSrc + i * iSStride, iWidth);
@@ -271,6 +327,11 @@
GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16NotAligned_sse2);
GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16_sse2);
#endif
+#ifdef HAVE_MMI
+GENERATE_UT_FOR_COPY (16, 8, WelsCopy16x8NotAligned_mmi);
+GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16NotAligned_mmi);
+GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16_mmi);
+#endif
namespace {
@@ -310,6 +371,11 @@
TestGetNoneZeroCount (WelsGetNoneZeroCount_sse42);
}
#endif
+#ifdef HAVE_MMI
+TEST (EncodeMbAuxTest, WelsGetNoneZeroCount_mmi) {
+ TestGetNoneZeroCount (WelsGetNoneZeroCount_mmi);
+}
+#endif
#define WELS_ABS_LC(a) ((sign ^ (int32_t)(a)) - sign)
#define NEW_QUANT(pDct, ff, mf) (((ff)+ WELS_ABS_LC(pDct))*(mf)) >>16
#define WELS_NEW_QUANT(pDct,ff,mf) WELS_ABS_LC(NEW_QUANT(pDct, ff, mf))
@@ -473,6 +539,24 @@
}
#endif //HAVE_AVX2
#endif
+#ifdef HAVE_MMI
+TEST (EncodeMbAuxTest, WelsQuant4x4_mmi) {
+ if (WelsCPUFeatureDetect (0) & WELS_CPU_MMI)
+ TestWelsQuant4x4 (WelsQuant4x4_mmi);
+}
+TEST (EncodeMbAuxTest, WelsQuant4x4Dc_mmi) {
+ if (WelsCPUFeatureDetect (0) & WELS_CPU_MMI)
+ TestWelsQuant4x4Dc (WelsQuant4x4Dc_mmi);
+}
+TEST (EncodeMbAuxTest, WelsQuantFour4x4_mmi) {
+ if (WelsCPUFeatureDetect (0) & WELS_CPU_MMI)
+ TestWelsQuantFour4x4 (WelsQuantFour4x4_mmi);
+}
+TEST (EncodeMbAuxTest, WelsQuantFour4x4Max_mmi) {
+ if (WelsCPUFeatureDetect (0) & WELS_CPU_MMI)
+ TestWelsQuantFour4x4Max (WelsQuantFour4x4Max_mmi);
+}
+#endif //HAVE_MMI
int32_t WelsHadamardQuant2x2SkipAnchor (int16_t* rs, int16_t ff, int16_t mf) {
int16_t pDct[4], s[4];
int16_t threshold = ((1 << 16) - 1) / mf - ff;
@@ -599,6 +683,23 @@
iDct[i] = (rand() & 32767) - 16384;
WelsHadamardT4Dc_c (iLumaDcC, iDct);
WelsHadamardT4Dc_sse2 (iLumaDcS, iDct);
+ for (int i = 0; i < 16; i++)
+ EXPECT_EQ (iLumaDcC[i], iLumaDcS[i]);
+ FREE_MEMORY (iDct);
+ FREE_MEMORY (iLumaDcC);
+ FREE_MEMORY (iLumaDcS);
+}
+#endif
+#ifdef HAVE_MMI
+TEST (EncodeMbAuxTest, WelsHadamardT4Dc_mmi) {
+ CMemoryAlign cMemoryAlign (0);
+ ALLOC_MEMORY (int16_t, iDct, 128 * 16);
+ ALLOC_MEMORY (int16_t, iLumaDcC, 16);
+ ALLOC_MEMORY (int16_t, iLumaDcS, 16);
+ for (int i = 0; i < 128 * 16; i++)
+ iDct[i] = (rand() & 32767) - 16384;
+ WelsHadamardT4Dc_c (iLumaDcC, iDct);
+ WelsHadamardT4Dc_mmi (iLumaDcS, iDct);
for (int i = 0; i < 16; i++)
EXPECT_EQ (iLumaDcC[i], iLumaDcS[i]);
FREE_MEMORY (iDct);
--- a/test/encoder/EncUT_Sample.cpp
+++ b/test/encoder/EncUT_Sample.cpp
@@ -672,6 +672,20 @@
GENERATE_Sad16x16_UT (WelsSampleSatd16x16_AArch64_neon, WelsSampleSatd16x16_c, WELS_CPU_NEON)
#endif
+#ifdef HAVE_MMI
+GENERATE_Sad4x4_UT (WelsSampleSad4x4_mmi, WelsSampleSad4x4_c, WELS_CPU_MMI)
+GENERATE_Sad8x8_UT (WelsSampleSad8x8_mmi, WelsSampleSad8x8_c, WELS_CPU_MMI)
+GENERATE_Sad8x16_UT (WelsSampleSad8x16_mmi, WelsSampleSad8x16_c, WELS_CPU_MMI)
+GENERATE_Sad16x8_UT (WelsSampleSad16x8_mmi, WelsSampleSad16x8_c, WELS_CPU_MMI)
+GENERATE_Sad16x16_UT (WelsSampleSad16x16_mmi, WelsSampleSad16x16_c, WELS_CPU_MMI)
+
+GENERATE_Sad4x4_UT (WelsSampleSatd4x4_mmi, WelsSampleSatd4x4_c, WELS_CPU_MMI)
+GENERATE_Sad8x8_UT (WelsSampleSatd8x8_mmi, WelsSampleSatd8x8_c, WELS_CPU_MMI)
+GENERATE_Sad8x16_UT (WelsSampleSatd8x16_mmi, WelsSampleSatd8x16_c, WELS_CPU_MMI)
+GENERATE_Sad16x8_UT (WelsSampleSatd16x8_mmi, WelsSampleSatd16x8_c, WELS_CPU_MMI)
+GENERATE_Sad16x16_UT (WelsSampleSatd16x16_mmi, WelsSampleSatd16x16_c, WELS_CPU_MMI)
+#endif
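The SATD generators reuse the SAD fixtures because both metrics share a signature; SATD differs from SAD by Hadamard-transforming the residual block before summing absolute values. A textbook 4x4 version for orientation (a sketch; the exact rounding of the total is left to the project's reference, WelsSampleSatd4x4_c):

    #include <stdint.h>
    #include <stdlib.h>

    /* Textbook 4x4 SATD: 2-D Hadamard of the residual, then sum of
       absolute coefficients (any output ordering gives the same sum). */
    static int Satd4x4_sketch (const uint8_t *a, int sa, const uint8_t *b, int sb) {
      int d[16], m[16], sum = 0;
      for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
          d[i * 4 + j] = a[i * sa + j] - b[i * sb + j];
      for (int i = 0; i < 4; i++) {          /* horizontal butterflies */
        int s0 = d[i * 4 + 0] + d[i * 4 + 3], s1 = d[i * 4 + 1] + d[i * 4 + 2];
        int t0 = d[i * 4 + 0] - d[i * 4 + 3], t1 = d[i * 4 + 1] - d[i * 4 + 2];
        m[i * 4 + 0] = s0 + s1;  m[i * 4 + 1] = s0 - s1;
        m[i * 4 + 2] = t0 + t1;  m[i * 4 + 3] = t0 - t1;
      }
      for (int j = 0; j < 4; j++) {          /* vertical butterflies */
        int s0 = m[0 + j] + m[12 + j], s1 = m[4 + j] + m[8 + j];
        int t0 = m[0 + j] - m[12 + j], t1 = m[4 + j] - m[8 + j];
        sum += abs (s0 + s1) + abs (s0 - s1) + abs (t0 + t1) + abs (t0 - t1);
      }
      return sum;
    }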
+
#define GENERATE_SadFour_UT(func, CPUFLAGS, width, height) \
TEST_F (SadSatdAssemblyFuncTest, func) { \
if (0 == (m_uiCpuFeatureFlag & CPUFLAGS)) \
@@ -719,4 +733,11 @@
GENERATE_SadFour_UT (WelsSampleSadFour8x16_AArch64_neon, WELS_CPU_NEON, 8, 16)
GENERATE_SadFour_UT (WelsSampleSadFour16x8_AArch64_neon, WELS_CPU_NEON, 16, 8)
GENERATE_SadFour_UT (WelsSampleSadFour16x16_AArch64_neon, WELS_CPU_NEON, 16, 16)
+#endif
+
+#ifdef HAVE_MMI
+GENERATE_SadFour_UT (WelsSampleSadFour8x8_mmi, WELS_CPU_MMI, 8, 8)
+GENERATE_SadFour_UT (WelsSampleSadFour8x16_mmi, WELS_CPU_MMI, 8, 16)
+GENERATE_SadFour_UT (WelsSampleSadFour16x8_mmi, WELS_CPU_MMI, 16, 8)
+GENERATE_SadFour_UT (WelsSampleSadFour16x16_mmi, WELS_CPU_MMI, 16, 16)
#endif
--- a/test/processing/ProcessUT_VaaCalc.cpp
+++ b/test/processing/ProcessUT_VaaCalc.cpp
@@ -863,3 +863,11 @@
GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_AArch64_neon, 1, WELS_CPU_NEON)
GENERATE_VAACalcSadVar_UT (VAACalcSadVar_AArch64_neon, 1, WELS_CPU_NEON)
#endif
+
+#if defined(HAVE_MMI)
+GENERATE_VAACalcSad_UT (VAACalcSad_mmi, 1, WELS_CPU_MMI)
+GENERATE_VAACalcSadBgd_UT (VAACalcSadBgd_mmi, 1, WELS_CPU_MMI)
+GENERATE_VAACalcSadSsdBgd_UT (VAACalcSadSsdBgd_mmi, 1, WELS_CPU_MMI)
+GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_mmi, 1, WELS_CPU_MMI)
+GENERATE_VAACalcSadVar_UT (VAACalcSadVar_mmi, 1, WELS_CPU_MMI)
+#endif