ref: 3bf69ad5bbb66cf8fed3adf9ef3d5eca8c487c6c
dir: /codec/common/mips/deblock_mmi.c/
/*! * \copy * Copyright (c) 2009-2018, Cisco Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* * * \file deblock_mmi.c * * \brief Loongson optimize * * \date 20/07/2018 Created * ************************************************************************************* */ #include <stdint.h> #include "asmdefs_mmi.h" void DeblockLumaLt4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTC) { unsigned char tmp[512] __attribute__((aligned(32))); BACKUP_REG; __asm__ volatile ( ".set arch=loongson3a \n\t" "dsll $8, %[iStride], 0x1 \n\t" "daddu $8, $8, %[iStride] \n\t" "dsubu $14, %[pPix], $8 \n\t" "dsll $8, %[iStride], 0x1 \n\t" "dsubu $9, %[pPix], $8 \n\t" "dmtc1 %[iAlpha], $f0 \n\t" "dsubu $13, %[pPix], %[iStride] \n\t" "daddu %[iStride], %[iStride], %[pPix] \n\t" "daddu $12, $8, %[pPix] \n\t" "punpcklhw $f0, $f0, $f0 \n\t" "lb $8, 0x0(%[pTC]) \n\t" "punpcklwd $f0, $f0, $f0 \n\t" "mov.d $f2, $f0 \n\t" "gssqc1 $f2, $f0, 432-112(%[tmp]) \n\t" "dmtc1 %[iBeta], $f0 \n\t" "lb %[iAlpha], 0x1(%[pTC]) \n\t" "dli %[iBeta], 0xFFFF \n\t" "punpcklhw $f0, $f0, $f0 \n\t" "and $10, %[iAlpha], %[iBeta] \n\t" "punpcklwd $f0, $f0, $f0 \n\t" "mov.d $f2, $f0 \n\t" "and %[iAlpha], %[iAlpha], %[iBeta] \n\t" "dmtc1 $10, $f4 \n\t" "mov.d $f8, $f4 \n\t" "dmtc1 %[iAlpha], $f16 \n\t" "and %[iAlpha], $8, %[iBeta] \n\t" "dmtc1 %[iAlpha], $f20 \n\t" "mov.d $f24, $f20 \n\t" "mov.d $f28, $f20 \n\t" "gssqc1 $f2, $f0, 432-336(%[tmp]) \n\t" "dmtc1 %[iAlpha], $f0 \n\t" "lb %[iAlpha], 0x3(%[pTC]) \n\t" "lb %[pTC], 0x2(%[pTC]) \n\t" "dmtc1 $10, $f12 \n\t" "punpcklhw $f0, $f0, $f16 \n\t" "and $8, %[iAlpha], %[iBeta] \n\t" "punpcklhw $f24, $f24, $f8 \n\t" "punpcklhw $f20, $f20, $f4 \n\t" "punpcklhw $f0, $f0, $f24 \n\t" "punpcklhw $f28, $f28, $f12 \n\t" "punpcklhw $f28, $f28, $f20 \n\t" "punpckhhw $f2, $f0, $f28 \n\t" "punpcklhw $f0, $f0, $f28 \n\t" "gssqc1 $f2, $f0, 432-400(%[tmp]) \n\t" "dmtc1 $8, $f0 \n\t" "and %[iAlpha], %[iAlpha], %[iBeta] \n\t" "mov.d $f8, $f0 \n\t" "dmtc1 %[iAlpha], $f16 \n\t" "and %[iAlpha], %[pTC], %[iBeta] \n\t" "dmtc1 $8, $f12 \n\t" "dmtc1 %[iAlpha], 
$f20 \n\t" "punpcklhw $f20, $f20, $f0 \n\t" "xor $f0, $f0, $f0 \n\t" "dmtc1 %[iAlpha], $f24 \n\t" "and %[pTC], %[pTC], %[iBeta] \n\t" "punpcklhw $f24, $f24, $f8 \n\t" "dmtc1 %[iAlpha], $f28 \n\t" "dmtc1 %[pTC], $f4 \n\t" "gslqc1 $f10, $f8, 0x0($9) \n\t" "punpckhbh $f10, $f8, $f0 \n\t" "punpcklbh $f8, $f8, $f0 \n\t" "dli %[iAlpha], 0x4 \n\t" "seh %[pTC], %[iAlpha] \n\t" "punpcklhw $f28, $f28, $f12 \n\t" "punpcklhw $f28, $f28, $f20 \n\t" "gslqc1 $f22, $f20, 0x0(%[iStride]) \n\t" "gslqc1 $f14, $f12, 0x0($13) \n\t" "gsldxc1 $f2, 0x0($12, $0) \n\t" "punpckhbh $f22, $f20, $f0 \n\t" "punpcklbh $f20, $f20, $f0 \n\t" "gssqc1 $f22, $f20, 432-240(%[tmp]) \n\t" "punpckhbh $f22, $f2, $f0 \n\t" "punpcklbh $f20, $f2, $f0 \n\t" "gssqc1 $f22, $f20, 432-352(%[tmp]) \n\t" "punpcklhw $f4, $f4, $f16 \n\t" "gslqc1 $f18, $f16, 0x0($14) \n\t" "punpcklhw $f4, $f4, $f24 \n\t" "gslqc1 $f26, $f24, 0x0(%[pPix]) \n\t" "punpckhhw $f6, $f4, $f28 \n\t" "punpcklhw $f4, $f4, $f28 \n\t" "punpckhbh $f26, $f24, $f0 \n\t" "punpcklbh $f24, $f24, $f0 \n\t" "punpckhbh $f14, $f12, $f0 \n\t" "punpcklbh $f12, $f12, $f0 \n\t" "punpckhbh $f18, $f16, $f0 \n\t" "punpcklbh $f16, $f16, $f0 \n\t" "psubh $f28, $f12, $f16 \n\t" "psubh $f30, $f14, $f18 \n\t" "gssqc1 $f18, $f16, 432-272(%[tmp]) \n\t" WELS_AbsH($f28, $f30, $f28, $f30, $f16, $f18) "gslqc1 $f18, $f16, 432-336(%[tmp]) \n\t" "gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t" "pcmpgth $f20, $f16, $f28 \n\t" "pcmpgth $f22, $f18, $f30 \n\t" "gssqc1 $f22, $f20, 432-288(%[tmp]) \n\t" "psubh $f28, $f24, $f0 \n\t" "psubh $f30, $f26, $f2 \n\t" WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22) "pcmpgth $f20, $f16, $f28 \n\t" "pcmpgth $f22, $f18, $f30 \n\t" "gssqc1 $f22, $f20, 432-256(%[tmp]) \n\t" "pavgh $f20, $f12, $f24 \n\t" "pavgh $f22, $f14, $f26 \n\t" "gssqc1 $f22, $f20, 432-304(%[tmp]) \n\t" "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t" "gslqc1 $f30, $f28, 432-288(%[tmp]) \n\t" "gslqc1 $f2, $f0, 432-256(%[tmp]) \n\t" "psubh $f20, $f20, $f28 \n\t" "psubh $f22, $f22, $f30 \n\t" 
"psubh $f20, $f20, $f0 \n\t" "psubh $f22, $f22, $f2 \n\t" "gssqc1 $f22, $f20, 432-224(%[tmp]) \n\t" "gslqc1 $f2, $f0, 432-240(%[tmp]) \n\t" "psubh $f20, $f24, $f12 \n\t" "psubh $f22, $f26, $f14 \n\t" "gssqc1 $f26, $f24, 432-32(%[tmp]) \n\t" "psubh $f24, $f24, $f0 \n\t" "psubh $f26, $f26, $f2 \n\t" "gssqc1 $f22, $f20, 432-384(%[tmp]) \n\t" WELS_AbsH($f28, $f30, $f20, $f22, $f28, $f30) "gslqc1 $f22, $f20, 432-112(%[tmp]) \n\t" "pcmpgth $f20, $f20, $f28 \n\t" "pcmpgth $f22, $f22, $f30 \n\t" WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30) "pcmpgth $f28, $f16, $f24 \n\t" "pcmpgth $f30, $f18, $f26 \n\t" "xor $f0, $f0, $f0 \n\t" "and $f20, $f20, $f28 \n\t" "and $f22, $f22, $f30 \n\t" "psubh $f24, $f12, $f8 \n\t" "psubh $f26, $f14, $f10 \n\t" WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30) "pcmpgth $f28, $f16, $f24 \n\t" "pcmpgth $f30, $f18, $f26 \n\t" "gslqc1 $f26, $f24, 432-400(%[tmp]) \n\t" "and $f20, $f20, $f28 \n\t" "and $f22, $f22, $f30 \n\t" "pcmpgth $f28, $f24, $f0 \n\t" "pcmpgth $f30, $f26, $f0 \n\t" "pcmpeqh $f24, $f24, $f0 \n\t" "pcmpeqh $f26, $f26, $f0 \n\t" "or $f28, $f28, $f24 \n\t" "or $f30, $f30, $f26 \n\t" "and $f20, $f20, $f28 \n\t" "and $f22, $f22, $f30 \n\t" "gssqc1 $f22, $f20, 432-320(%[tmp]) \n\t" "dmtc1 %[pTC], $f20 \n\t" "punpckhhw $f26, $f20, $f20 \n\t" "punpcklhw $f24, $f20, $f20 \n\t" "punpcklwd $f20, $f24, $f24 \n\t" "mov.d $f22, $f20 \n\t" "gssqc1 $f22, $f20, 432-336(%[tmp]) \n\t" "gslqc1 $f22, $f20, 432-224(%[tmp]) \n\t" "psubh $f24, $f0, $f20 \n\t" "dli $11, 0x2 \n\t" "psubh $f26, $f0, $f22 \n\t" "dmtc1 $11, $f28 \n\t" "gslqc1 $f22, $f20, 432-384(%[tmp]) \n\t" "gslqc1 $f2, $f0, 432-240(%[tmp]) \n\t" "psllh $f20, $f20, $f28 \n\t" "psllh $f22, $f22, $f28 \n\t" "psubh $f28, $f8, $f0 \n\t" "psubh $f30, $f10, $f2 \n\t" "paddh $f28, $f28, $f20 \n\t" "paddh $f30, $f30, $f22 \n\t" "gslqc1 $f22, $f20, 432-336(%[tmp]) \n\t" "paddh $f28, $f28, $f20 \n\t" "paddh $f30, $f30, $f22 \n\t" "dli $11, 0x3 \n\t" "dmtc1 $11, $f20 \n\t" "psrah $f28, $f28, $f20 \n\t" 
"psrah $f30, $f30, $f20 \n\t" "gslqc1 $f22, $f20, 432-224(%[tmp]) \n\t" "pmaxsh $f24, $f24, $f28 \n\t" "pmaxsh $f26, $f26, $f30 \n\t" "gslqc1 $f2, $f0, 432-320(%[tmp]) \n\t" "pminsh $f20, $f20, $f24 \n\t" "pminsh $f22, $f22, $f26 \n\t" "and $f20, $f20, $f0 \n\t" "and $f22, $f22, $f2 \n\t" "gslqc1 $f26, $f24, 432-400(%[tmp]) \n\t" "gssqc1 $f22, $f20, 432-64(%[tmp]) \n\t" "xor $f0, $f0, $f0 \n\t" "gssqc1 $f26, $f24, 432-384(%[tmp]) \n\t" "psubh $f20, $f0, $f24 \n\t" "psubh $f22, $f0, $f26 \n\t" "gssqc1 $f22, $f20, 432-368(%[tmp]) \n\t" "mov.d $f24, $f20 \n\t" "mov.d $f26, $f22 \n\t" "gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t" "gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t" "paddh $f20, $f20, $f28 \n\t" "paddh $f22, $f22, $f30 \n\t" "paddh $f28, $f8, $f8 \n\t" "paddh $f30, $f10, $f10 \n\t" "psubh $f20, $f20, $f28 \n\t" "psubh $f22, $f22, $f30 \n\t" "dli $11, 0x1 \n\t" "dmtc1 $11, $f28 \n\t" "psrah $f20, $f20, $f28 \n\t" "psrah $f22, $f22, $f28 \n\t" "pmaxsh $f24, $f24, $f20 \n\t" "pmaxsh $f26, $f26, $f22 \n\t" "gslqc1 $f22, $f20, 432-384(%[tmp]) \n\t" "pminsh $f20, $f20, $f24 \n\t" "pminsh $f22, $f22, $f26 \n\t" "gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t" "gslqc1 $f30, $f28, 432-288(%[tmp]) \n\t" "and $f20, $f20, $f24 \n\t" "and $f22, $f22, $f26 \n\t" "and $f20, $f20, $f28 \n\t" "and $f22, $f22, $f30 \n\t" "gslqc1 $f26, $f24, 432-240(%[tmp]) \n\t" "gssqc1 $f22, $f20, 432-96(%[tmp]) \n\t" "gslqc1 $f22, $f20, 432-352(%[tmp]) \n\t" "gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t" "paddh $f20, $f20, $f28 \n\t" "paddh $f22, $f22, $f30 \n\t" "paddh $f28, $f24, $f24 \n\t" "paddh $f30, $f26, $f26 \n\t" "gslqc1 $f26, $f24, 432-368(%[tmp]) \n\t" "dli $11, 0x1 \n\t" "psubh $f20, $f20, $f28 \n\t" "dmtc1 $11, $f28 \n\t" "psubh $f22, $f22, $f30 \n\t" "psrah $f20, $f20, $f28 \n\t" "psrah $f22, $f22, $f28 \n\t" "gslqc1 $f30, $f28, 0x0(%[iStride]) \n\t" "pmaxsh $f24, $f24, $f20 \n\t" "pmaxsh $f26, $f26, $f22 \n\t" "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t" "pminsh $f20, $f20, $f24 \n\t" "pminsh $f22, 
$f22, $f26 \n\t" "gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t" "and $f20, $f20, $f24 \n\t" "and $f22, $f22, $f26 \n\t" "gslqc1 $f26, $f24, 432-256(%[tmp]) \n\t" "and $f20, $f20, $f24 \n\t" "and $f22, $f22, $f26 \n\t" "gslqc1 $f26, $f24, 0x0($9) \n\t" "punpcklbh $f28, $f30, $f0 \n\t" "punpckhbh $f30, $f30, $f0 \n\t" "gssqc1 $f30, $f28, 432-352(%[tmp]) \n\t" "gslqc1 $f30, $f28, 0x0($12) \n\t" "punpcklbh $f24, $f26, $f0 \n\t" "punpckhbh $f26, $f26, $f0 \n\t" "gssqc1 $f22, $f20, 432-48(%[tmp]) \n\t" "gslqc1 $f22, $f20, 0x0($14) \n\t" "gssqc1 $f26, $f24, 432-368(%[tmp]) \n\t" "gslqc1 $f26, $f24, 0x0($13) \n\t" "punpcklbh $f28, $f30, $f0 \n\t" "punpckhbh $f30, $f30, $f0 \n\t" "punpcklbh $f20, $f22, $f0 \n\t" "punpckhbh $f22, $f22, $f0 \n\t" "gssqc1 $f30, $f28, 432-384(%[tmp]) \n\t" "punpcklbh $f24, $f26, $f0 \n\t" "punpckhbh $f26, $f26, $f0 \n\t" "gssqc1 $f26, $f24, 432-400(%[tmp]) \n\t" "gslqc1 $f30, $f28, 432-400(%[tmp]) \n\t" "gslqc1 $f26, $f24, 0x0(%[pPix]) \n\t" "psubh $f28, $f28, $f20 \n\t" "psubh $f30, $f30, $f22 \n\t" "gssqc1 $f22, $f20, 432-16(%[tmp]) \n\t" WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22) "punpcklbh $f24, $f26, $f0 \n\t" "punpckhbh $f26, $f26, $f0 \n\t" "pcmpgth $f20, $f16, $f28 \n\t" "pcmpgth $f22, $f18, $f30 \n\t" "gslqc1 $f30, $f28, 432-384(%[tmp]) \n\t" "gssqc1 $f22, $f20, 432-288(%[tmp]) \n\t" "psubh $f28, $f24, $f28 \n\t" "psubh $f30, $f26, $f30 \n\t" WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22) "pcmpgth $f20, $f16, $f28 \n\t" "pcmpgth $f22, $f18, $f30 \n\t" "gssqc1 $f22, $f20, 432-256(%[tmp]) \n\t" "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t" "gssqc1 $f26, $f24, 432-80(%[tmp]) \n\t" "pavgh $f20, $f20, $f24 \n\t" "pavgh $f22, $f22, $f26 \n\t" "gssqc1 $f22, $f20, 432-304(%[tmp]) \n\t" "gslqc1 $f22, $f20, 432-288(%[tmp]) \n\t" "gslqc1 $f30, $f28, 432-256(%[tmp]) \n\t" "psubh $f20, $f4, $f20 \n\t" "psubh $f22, $f6, $f22 \n\t" "psubh $f20, $f20, $f28 \n\t" "psubh $f22, $f22, $f30 \n\t" "gssqc1 $f22, $f20, 432-224(%[tmp]) \n\t" "gslqc1 $f22, $f20, 
432-400(%[tmp]) \n\t" "gslqc1 $f30, $f28, 432-352(%[tmp]) \n\t" "psubh $f20, $f24, $f20 \n\t" "psubh $f22, $f26, $f22 \n\t" "psubh $f24, $f24, $f28 \n\t" "psubh $f26, $f26, $f30 \n\t" "gssqc1 $f22, $f20, 432-272(%[tmp]) \n\t" "mov.d $f28, $f20 \n\t" "mov.d $f30, $f22 \n\t" WELS_AbsH($f28, $f30, $f20, $f22, $f0, $f2) "gslqc1 $f22, $f20, 432-112(%[tmp]) \n\t" "pcmpgth $f20, $f20, $f28 \n\t" "pcmpgth $f22, $f22, $f30 \n\t" WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30) "pcmpgth $f28, $f16, $f24 \n\t" "pcmpgth $f30, $f18, $f26 \n\t" "gslqc1 $f26, $f24, 432-368(%[tmp]) \n\t" "and $f20, $f20, $f28 \n\t" "and $f22, $f22, $f30 \n\t" "gslqc1 $f30, $f28, 432-400(%[tmp]) \n\t" "psubh $f28, $f28, $f24 \n\t" "psubh $f30, $f30, $f26 \n\t" "gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t" "psubh $f24, $f24, $f0 \n\t" "psubh $f26, $f26, $f2 \n\t" WELS_AbsH($f28, $f30, $f28, $f30, $f0, $f2) "pcmpgth $f16, $f16, $f28 \n\t" "pcmpgth $f18, $f18, $f30 \n\t" "gslqc1 $f30, $f28, 432-96(%[tmp]) \n\t" "and $f20, $f20, $f16 \n\t" "and $f22, $f22, $f18 \n\t" "xor $f0, $f0, $f0 \n\t" "paddh $f8, $f8, $f28 \n\t" "paddh $f10, $f10, $f30 \n\t" "pcmpgth $f16, $f4, $f0 \n\t" "pcmpgth $f18, $f6, $f0 \n\t" "pcmpeqh $f28, $f4, $f0 \n\t" "pcmpeqh $f30, $f6, $f0 \n\t" "or $f16, $f16, $f28 \n\t" "or $f18, $f18, $f30 \n\t" "and $f20, $f20, $f16 \n\t" "and $f22, $f22, $f18 \n\t" "gslqc1 $f18, $f16, 432-224(%[tmp]) \n\t" "gssqc1 $f22, $f20, 432-320(%[tmp]) \n\t" "gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t" "dli $11, 0x2 \n\t" "psubh $f28, $f0, $f16 \n\t" "psubh $f30, $f0, $f18 \n\t" "psubh $f2, $f0, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "dmfc1 %[iAlpha], $f28 \n\t" "dmtc1 $11, $f28 \n\t" "psllh $f20, $f20, $f28 \n\t" "psllh $f22, $f22, $f28 \n\t" "dmtc1 %[iAlpha], $f28 \n\t" "paddh $f24, $f24, $f20 \n\t" "paddh $f26, $f26, $f22 \n\t" "gslqc1 $f22, $f20, 432-336(%[tmp]) \n\t" "paddh $f24, $f24, $f20 \n\t" "paddh $f26, $f26, $f22 \n\t" "gslqc1 $f22, $f20, 432-368(%[tmp]) \n\t" "dli $11, 0x3 \n\t" "gssqc1 $f2, $f0, 
432-336(%[tmp]) \n\t" "dmfc1 %[iAlpha], $f0 \n\t" "dmtc1 $11, $f0 \n\t" "psrah $f24, $f24, $f0 \n\t" "psrah $f26, $f26, $f0 \n\t" "dmtc1 %[iAlpha], $f0 \n\t" "pmaxsh $f28, $f28, $f24 \n\t" "pmaxsh $f30, $f30, $f26 \n\t" "pminsh $f16, $f16, $f28 \n\t" "pminsh $f18, $f18, $f30 \n\t" "gslqc1 $f30, $f28, 432-320(%[tmp]) \n\t" "and $f16, $f16, $f28 \n\t" "and $f18, $f18, $f30 \n\t" "mov.d $f24, $f0 \n\t" "mov.d $f26, $f2 \n\t" "gslqc1 $f2, $f0, 432-16(%[tmp]) \n\t" "gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t" "paddh $f0, $f0, $f28 \n\t" "paddh $f2, $f2, $f30 \n\t" "gssqc1 $f18, $f16, 432-272(%[tmp]) \n\t" "gslqc1 $f18, $f16, 432-368(%[tmp]) \n\t" "dli $11, 0x1 \n\t" "paddh $f16, $f16, $f16 \n\t" "paddh $f18, $f18, $f18 \n\t" "psubh $f0, $f0, $f16 \n\t" "psubh $f2, $f2, $f18 \n\t" "dmtc1 $11, $f28 \n\t" "gslqc1 $f18, $f16, 432-64(%[tmp]) \n\t" "psrah $f0, $f0, $f28 \n\t" "psrah $f2, $f2, $f28 \n\t" "pmaxsh $f24, $f24, $f0 \n\t" "pmaxsh $f26, $f26, $f2 \n\t" "gslqc1 $f2, $f0, 432-400(%[tmp]) \n\t" "pminsh $f28, $f4, $f24 \n\t" "pminsh $f30, $f6, $f26 \n\t" "gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t" "and $f28, $f28, $f24 \n\t" "and $f30, $f30, $f26 \n\t" "dmfc1 %[iAlpha], $f24 \n\t" "dmfc1 %[iBeta], $f26 \n\t" "gslqc1 $f26, $f24, 432-288(%[tmp]) \n\t" "and $f28, $f28, $f24 \n\t" "and $f30, $f30, $f26 \n\t" "paddh $f20, $f20, $f28 \n\t" "paddh $f22, $f22, $f30 \n\t" "packushb $f8, $f8, $f10 \n\t" "packushb $f10, $f20, $f22 \n\t" "gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t" "paddh $f0, $f0, $f20 \n\t" "paddh $f2, $f2, $f22 \n\t" "paddh $f12, $f12, $f16 \n\t" "paddh $f14, $f14, $f18 \n\t" "packushb $f12, $f12, $f14 \n\t" "packushb $f14, $f0, $f2 \n\t" "gslqc1 $f2, $f0, 432-32(%[tmp]) \n\t" "psubh $f0, $f0, $f16 \n\t" "psubh $f2, $f2, $f18 \n\t" "gslqc1 $f18, $f16, 432-80(%[tmp]) \n\t" "psubh $f16, $f16, $f20 \n\t" "gslqc1 $f26, $f24, 432-48(%[tmp]) \n\t" "psubh $f18, $f18, $f22 \n\t" "gslqc1 $f22, $f20, 432-240(%[tmp]) \n\t" "paddh $f20, $f20, $f24 \n\t" "paddh $f22, $f22, $f26 
\n\t" "gslqc1 $f26, $f24, 432-304(%[tmp]) \n\t" "packushb $f0, $f0, $f2 \n\t" "packushb $f2, $f16, $f18 \n\t" "gslqc1 $f18, $f16, 432-384(%[tmp]) \n\t" "paddh $f16, $f16, $f24 \n\t" "paddh $f18, $f18, $f26 \n\t" "gssqc1 $f2, $f0, 480-208(%[tmp]) \n\t" "gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t" "mov.d $f28, $f0 \n\t" "mov.d $f30, $f2 \n\t" "paddh $f0, $f0, $f0 \n\t" "paddh $f2, $f2, $f2 \n\t" "dmtc1 %[iAlpha], $f24 \n\t" "dmtc1 %[iBeta], $f26 \n\t" "psubh $f16, $f16, $f0 \n\t" "psubh $f18, $f18, $f2 \n\t" "dli $11, 0x1 \n\t" "gslqc1 $f2, $f0, 432-336(%[tmp]) \n\t" "gssqc1 $f10, $f8, 0x0($9) \n\t" "dmtc1 $11, $f8 \n\t" "psrah $f16, $f16, $f8 \n\t" "psrah $f18, $f18, $f8 \n\t" "pmaxsh $f0, $f0, $f16 \n\t" "pmaxsh $f2, $f2, $f18 \n\t" "pminsh $f4, $f4, $f0 \n\t" "pminsh $f6, $f6, $f2 \n\t" "gslqc1 $f2, $f0, 480-208(%[tmp]) \n\t" "gslqc1 $f10, $f8, 428-256+4(%[tmp]) \n\t" "and $f4, $f4, $f24 \n\t" "and $f6, $f6, $f26 \n\t" "and $f4, $f4, $f8 \n\t" "and $f6, $f6, $f10 \n\t" "gssqc1 $f14, $f12, 0x0($13) \n\t" "paddh $f28, $f28, $f4 \n\t" "paddh $f30, $f30, $f6 \n\t" "packushb $f20, $f20, $f22 \n\t" "packushb $f22, $f28, $f30 \n\t" "gssqc1 $f2, $f0, 0x0(%[pPix]) \n\t" "gssqc1 $f22, $f20, 0x0(%[iStride]) \n\t" : [pPix]"+&r"((unsigned char *)pPix) : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta), [pTC]"r"((unsigned char *)pTC), [tmp]"r"((unsigned char *)tmp) : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30" ); RECOVER_REG; } void DeblockLumaTransposeH2V_mmi(uint8_t *pPixY, int32_t iStride, uint8_t *pDst) { BACKUP_REG; __asm__ volatile( ".set arch=loongson3a \n\t" "dsll $8, %[iStride], 0x3 \n\t" "daddu $8, $8, %[pPixY] \n\t" "daddu $9, %[pPixY], %[iStride] \n\t" "daddu $10, $8, %[iStride] \n\t" "gsldlc1 $f0, 0x7(%[pPixY]) \n\t" "gsldlc1 $f2, 0x7($8) \n\t" "gsldlc1 $f4, 0x7($9) \n\t" "gsldlc1 $f6, 0x7($10) \n\t" "gsldrc1 $f0, 
0x0(%[pPixY]) \n\t" "gsldrc1 $f2, 0x0($8) \n\t" "gsldrc1 $f4, 0x0($9) \n\t" "gsldrc1 $f6, 0x0($10) \n\t" "daddu %[pPixY], $9, %[iStride] \n\t" "daddu $8, $10, %[iStride] \n\t" "daddu $9, %[pPixY], %[iStride] \n\t" "daddu $10, $8, %[iStride] \n\t" "gsldlc1 $f8, 0x7(%[pPixY]) \n\t" "gsldlc1 $f10, 0x7($8) \n\t" "gsldlc1 $f12, 0x7($9) \n\t" "gsldlc1 $f14, 0x7($10) \n\t" "gsldrc1 $f8, 0x0(%[pPixY]) \n\t" "gsldrc1 $f10, 0x0($8) \n\t" "gsldrc1 $f12, 0x0($9) \n\t" "gsldrc1 $f14, 0x0($10) \n\t" "daddu %[pPixY], $9, %[iStride] \n\t" "daddu $8, $10, %[iStride] \n\t" "daddu $9, %[pPixY], %[iStride] \n\t" "daddu $10, $8, %[iStride] \n\t" "gsldlc1 $f16, 0x7(%[pPixY]) \n\t" "gsldlc1 $f18, 0x7($8) \n\t" "gsldlc1 $f20, 0x7($9) \n\t" "gsldlc1 $f22, 0x7($10) \n\t" "gsldrc1 $f16, 0x0(%[pPixY]) \n\t" "gsldrc1 $f18, 0x0($8) \n\t" "gsldrc1 $f20, 0x0($9) \n\t" "gsldrc1 $f22, 0x0($10) \n\t" "daddu %[pPixY], $9, %[iStride] \n\t" "daddu $8, $10, %[iStride] \n\t" "daddu $9, %[pPixY], %[iStride] \n\t" "daddu $10, $8, %[iStride] \n\t" "gsldlc1 $f24, 0x7(%[pPixY]) \n\t" "gsldlc1 $f26, 0x7($8) \n\t" "gsldlc1 $f28, 0x7($9) \n\t" "gsldlc1 $f30, 0x7($10) \n\t" "gsldrc1 $f24, 0x0(%[pPixY]) \n\t" "gsldrc1 $f26, 0x0($8) \n\t" "gsldrc1 $f28, 0x0($9) \n\t" "gsldrc1 $f30, 0x0($10) \n\t" MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $9, $10) "gssqc1 $f18, $f16, 0x0(%[pDst]) \n\t" "gssqc1 $f10, $f8, 0x10(%[pDst]) \n\t" "gssqc1 $f14, $f12, 0x20(%[pDst]) \n\t" "gssqc1 $f30, $f28, 0x30(%[pDst]) \n\t" "gssqc1 $f22, $f20, 0x40(%[pDst]) \n\t" "gssqc1 $f6, $f4, 0x50(%[pDst]) \n\t" "gssqc1 $f26, $f24, 0x60(%[pDst]) \n\t" "gssqc1 $f2, $f0, 0x70(%[pDst]) \n\t" : [pPixY] "+&r"((unsigned char *)pPixY) : [iStride] "r"((int)iStride), [pDst] "r"((unsigned char *)pDst) : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30" ); RECOVER_REG; } void 
DeblockLumaTransposeV2H_mmi(uint8_t *pPixY, int32_t iStride, uint8_t *pSrc) {
  /*
   * Scatter a 128-byte buffer back into picture rows: loads 8 x 16 bytes
   * from pSrc (offsets 0x00..0x70), runs MMI_TransTwo8x8B on them and writes
   * 8 bytes to each of the 16 rows pPixY + k*iStride (k = 0..15).
   * Presumably the byte transpose that undoes DeblockLumaTransposeH2V_mmi;
   * the macro is defined in asmdefs_mmi.h and not visible here -- confirm.
   *
   * pPixY    first destination row; advanced row by row inside the asm
   *          (declared "+&r"), stores use gssdlc1/gssdrc1 left/right pairs
   *          (presumably unaligned-safe 64-bit stores -- confirm)
   * iStride  distance in bytes between consecutive destination rows
   * pSrc     source buffer read with 16-byte loads. NOTE(review): presumably
   *          must be 16-byte aligned -- confirm.
   */
  BACKUP_REG;
  __asm__ volatile(
    ".set arch=loongson3a \n\t"
    "gslqc1 $f2, $f0, 0x0(%[pSrc]) \n\t"
    "gslqc1 $f6, $f4, 0x10(%[pSrc]) \n\t"
    "gslqc1 $f10, $f8, 0x20(%[pSrc]) \n\t"
    "gslqc1 $f14, $f12, 0x30(%[pSrc]) \n\t"
    "gslqc1 $f18, $f16, 0x40(%[pSrc]) \n\t"
    "gslqc1 $f22, $f20, 0x50(%[pSrc]) \n\t"
    "gslqc1 $f26, $f24, 0x60(%[pSrc]) \n\t"
    "gslqc1 $f30, $f28, 0x70(%[pSrc]) \n\t"
    /* $9 and $10 are handed to the macro as its scratch registers. */
    MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $9, $10)
    /* Each group below stores two rows: %[pPixY] is the even row, $8 the
     * following odd row; both then advance by two rows. The register order
     * mirrors the permutation left behind by the macro. */
    /* Rows 0 and 1. */
    "daddu $8, %[pPixY], %[iStride] \n\t"
    "gssdlc1 $f16, 0x7(%[pPixY]) \n\t"
    "gssdlc1 $f8, 0x7($8) \n\t"
    "gssdrc1 $f16, 0x0(%[pPixY]) \n\t"
    "gssdrc1 $f8, 0x0($8) \n\t"
    /* Rows 2 and 3. */
    "daddu %[pPixY], $8, %[iStride] \n\t"
    "daddu $8, %[pPixY], %[iStride] \n\t"
    "gssdlc1 $f12, 0x7(%[pPixY]) \n\t"
    "gssdlc1 $f28, 0x7($8) \n\t"
    "gssdrc1 $f12, 0x0(%[pPixY]) \n\t"
    "gssdrc1 $f28, 0x0($8) \n\t"
    /* Rows 4 and 5. */
    "daddu %[pPixY], $8, %[iStride] \n\t"
    "daddu $8, %[pPixY], %[iStride] \n\t"
    "gssdlc1 $f20, 0x7(%[pPixY]) \n\t"
    "gssdlc1 $f4, 0x7($8) \n\t"
    "gssdrc1 $f20, 0x0(%[pPixY]) \n\t"
    "gssdrc1 $f4, 0x0($8) \n\t"
    /* Rows 6 and 7. */
    "daddu %[pPixY], $8, %[iStride] \n\t"
    "daddu $8, %[pPixY], %[iStride] \n\t"
    "gssdlc1 $f24, 0x7(%[pPixY]) \n\t"
    "gssdlc1 $f0, 0x7($8) \n\t"
    "gssdrc1 $f24, 0x0(%[pPixY]) \n\t"
    "gssdrc1 $f0, 0x0($8) \n\t"
    /* Rows 8 and 9. */
    "daddu %[pPixY], $8, %[iStride] \n\t"
    "daddu $8, %[pPixY], %[iStride] \n\t"
    "gssdlc1 $f18, 0x7(%[pPixY]) \n\t"
    "gssdlc1 $f10, 0x7($8) \n\t"
    "gssdrc1 $f18, 0x0(%[pPixY]) \n\t"
    "gssdrc1 $f10, 0x0($8) \n\t"
    /* Rows 10 and 11. */
    "daddu %[pPixY], $8, %[iStride] \n\t"
    "daddu $8, %[pPixY], %[iStride] \n\t"
    "gssdlc1 $f14, 0x7(%[pPixY]) \n\t"
    "gssdlc1 $f30, 0x7($8) \n\t"
    "gssdrc1 $f14, 0x0(%[pPixY]) \n\t"
    "gssdrc1 $f30, 0x0($8) \n\t"
    /* Rows 12 and 13. */
    "daddu %[pPixY], $8, %[iStride] \n\t"
    "daddu $8, %[pPixY], %[iStride] \n\t"
    "gssdlc1 $f22, 0x7(%[pPixY]) \n\t"
    "gssdlc1 $f6, 0x7($8) \n\t"
    "gssdrc1 $f22, 0x0(%[pPixY]) \n\t"
    "gssdrc1 $f6, 0x0($8) \n\t"
    /* Rows 14 and 15. */
    "daddu %[pPixY], $8, %[iStride] \n\t"
    "daddu $8, %[pPixY], %[iStride] \n\t"
    "gssdlc1 $f26, 0x7(%[pPixY]) \n\t"
    "gssdlc1 $f2, 0x7($8) \n\t"
    "gssdrc1 $f26, 0x0(%[pPixY]) \n\t"
    "gssdrc1 $f2, 0x0($8) \n\t"
    : [pPixY] "+&r"((unsigned char *)pPixY)
    : [iStride] "r"((int)iStride), [pSrc] "r"((unsigned char *)pSrc)
    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}

void DeblockLumaEq4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
  unsigned char tmp[720] __attribute__((aligned(32)));
  BACKUP_REG;
  __asm__ volatile (
    ".set arch=loongson3a \n\t"
    "dsll $11, %[iStride], 0x2 \n\t"
    "xor $f8, $f8, $f8 \n\t"
    "daddu $14, %[iStride], %[pPix] \n\t"
    "dsubu $8, %[pPix], $11 \n\t"
    "gslqc1 $f14, $f12, 0x0($8) \n\t"
    "gslqc1 $f22, $f20, 0x0(%[pPix]) \n\t"
    "daddu $9, %[iStride], %[iStride] \n\t"
    "daddu $10, $9, %[iStride] \n\t"
    "move $12, $9 \n\t"
    "dsubu $8, %[pPix], $9 \n\t"
    "gslqc1 $f6, $f4, 0x0($8) \n\t"
    "dsubu $9, %[pPix], %[iStride] \n\t"
    "gslqc1 $f18, $f16, 0x0($9) \n\t"
    "daddu $13, %[iStride], %[pPix] \n\t"
    "move %[iStride], $12 \n\t"
    "daddu $15, $12, %[pPix] \n\t"
    "daddu $12, %[pPix], $10 \n\t"
    "dsubu $11, %[pPix], $10 \n\t"
    "gslqc1 $f26, $f24, 0x0($11) \n\t"
    "daddu %[iStride], %[iStride], %[pPix] \n\t"
    "dmtc1 %[iAlpha], $f0 \n\t"
    "punpcklhw $f28, $f0, $f0 \n\t"
    "punpcklwd $f0, $f28, $f28 \n\t"
    "mov.d $f2, $f0 \n\t"
    "gssqc1 $f2, $f0, 640-320(%[tmp]) \n\t"
    "dmtc1 %[iBeta], $f0 \n\t"
    "gsldxc1 $f10, 0x0($15, $0) \n\t"
    "punpcklhw $f28, $f0, $f0 \n\t"
    "punpcklwd $f0, $f28, $f28 \n\t"
    "punpckhbh $f30, $f10, $f8 \n\t"
    "mov.d $f2, $f0 \n\t"
    "punpcklbh $f28, $f10, $f8 \n\t"
    "gssqc1 $f2, $f0, 640-512(%[tmp]) \n\t"
    "gssqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
    "mov.d $f0, $f4 \n\t"
    "gssqc1 $f22, $f20, 704-272(%[tmp]) \n\t"
    "gssqc1 $f6, $f4, 672-272(%[tmp]) \n\t"
    "mov.d $f4, $f16 \n\t"
    "punpckhbh $f22, $f20, $f8 \n\t"
    "punpcklbh $f20, $f20, $f8 \n\t"
    "punpckhbh $f6, $f4, $f8 \n\t"
    "punpcklbh $f4, $f4, $f8 \n\t"
"psubh $f28, $f20, $f4 \n\t" "psubh $f30, $f22, $f6 \n\t" WELS_AbsH($f28, $f30, $f28, $f30, $f2, $f10) "gssqc1 $f30, $f28, 640-560(%[tmp]) \n\t" "punpckhbh $f2, $f0, $f8 \n\t" "punpcklbh $f0, $f0, $f8 \n\t" "gssqc1 $f18, $f16, 688-272(%[tmp]) \n\t" "gslqc1 $f18, $f16, 0x0($14) \n\t" "gssqc1 $f2, $f0, 640-480(%[tmp]) \n\t" "psubh $f28, $f4, $f0 \n\t" "psubh $f30, $f6, $f2 \n\t" "gslqc1 $f2, $f0, 640-512(%[tmp]) \n\t" WELS_AbsH($f28, $f30, $f28, $f30, $f18, $f10) "punpckhbh $f18, $f16, $f8 \n\t" "punpcklbh $f16, $f16, $f8 \n\t" "pcmpgth $f0, $f0, $f28 \n\t" "pcmpgth $f2, $f2, $f30 \n\t" "gssqc1 $f18, $f16, 640-384(%[tmp]) \n\t" "psubh $f28, $f20, $f16 \n\t" "psubh $f30, $f22, $f18 \n\t" "gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t" "gssqc1 $f26, $f24, 656-272(%[tmp]) \n\t" "punpckhbh $f26, $f24, $f8 \n\t" "punpcklbh $f24, $f24, $f8 \n\t" WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10) "gssqc1 $f26, $f24, 640-368(%[tmp]) \n\t" "gssqc1 $f6, $f4, 640-144(%[tmp]) \n\t" "gssqc1 $f22, $f20, 640-400(%[tmp]) \n\t" "pcmpgth $f16, $f16, $f28 \n\t" "pcmpgth $f18, $f18, $f30 \n\t" "and $f0, $f0, $f16 \n\t" "and $f2, $f2, $f18 \n\t" "gslqc1 $f18, $f16, 640-320(%[tmp]) \n\t" "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t" "dli %[iAlpha], 0x2 \n\t" "dli %[iBeta], 0x2 \n\t" "pcmpgth $f16, $f16, $f28 \n\t" "pcmpgth $f18, $f18, $f30 \n\t" "and $f0, $f0, $f16 \n\t" "and $f2, $f2, $f18 \n\t" "dmtc1 %[iAlpha], $f16 \n\t" "dmtc1 %[iBeta], $f10 \n\t" "gssqc1 $f2, $f0, 736-272(%[tmp]) \n\t" "gslqc1 $f2, $f0, 640-320(%[tmp]) \n\t" "punpcklhw $f28, $f16, $f16 \n\t" "psrah $f16, $f0, $f10 \n\t" "psrah $f18, $f2, $f10 \n\t" "punpcklwd $f28, $f28, $f28 \n\t" "mov.d $f30, $f28 \n\t" "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t" "paddh $f16, $f16, $f28 \n\t" "paddh $f18, $f18, $f30 \n\t" "gssqc1 $f18, $f16, 640-576(%[tmp]) \n\t" "pcmpgth $f16, $f16, $f8 \n\t" "pcmpgth $f18, $f18, $f10 \n\t" "gssqc1 $f18, $f16, 640-560(%[tmp]) \n\t" "gssqc1 $f30, $f28, 640-624(%[tmp]) \n\t" "gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t" 
"psubh $f28, $f4, $f24 \n\t" "psubh $f30, $f6, $f26 \n\t" WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10) "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t" "pcmpgth $f16, $f16, $f28 \n\t" "pcmpgth $f18, $f18, $f30 \n\t" "gslqc1 $f2, $f0, 640-416(%[tmp]) \n\t" "and $f16, $f16, $f8 \n\t" "and $f18, $f18, $f10 \n\t" "gssqc1 $f18, $f16, 640-544(%[tmp]) \n\t" "gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t" "psubh $f28, $f20, $f0 \n\t" "psubh $f30, $f22, $f2 \n\t" WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10) "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t" "pcmpgth $f16, $f16, $f28 \n\t" "pcmpgth $f18, $f18, $f30 \n\t" "and $f16, $f16, $f8 \n\t" "and $f18, $f18, $f10 \n\t" "gssqc1 $f18, $f16, 640-560(%[tmp]) \n\t" "gslqc1 $f18, $f16, 640-544(%[tmp]) \n\t" "xor $f8, $f8, $f8 \n\t" "pandn $f16, $f16, $f24 \n\t" "dli %[iAlpha], 0x4 \n\t" "pandn $f18, $f18, $f26 \n\t" "gssqc1 $f18, $f16, 640-16(%[tmp]) \n\t" "dmtc1 %[iAlpha], $f16 \n\t" "punpcklhw $f28, $f16, $f16 \n\t" "dli %[iAlpha], 0x1 \n\t" "punpckhbh $f18, $f12, $f8 \n\t" "dmtc1 %[iAlpha], $f30 \n\t" "punpcklbh $f16, $f12, $f8 \n\t" "psllh $f16, $f16, $f30 \n\t" "psllh $f18, $f18, $f30 \n\t" "paddh $f16, $f16, $f24 \n\t" "paddh $f18, $f18, $f26 \n\t" "gslqc1 $f2, $f0, 640-480(%[tmp]) \n\t" "paddh $f16, $f16, $f24 \n\t" "paddh $f18, $f18, $f26 \n\t" "paddh $f16, $f16, $f24 \n\t" "paddh $f18, $f18, $f26 \n\t" "paddh $f16, $f16, $f0 \n\t" "paddh $f18, $f18, $f2 \n\t" "gslqc1 $f26, $f24, 640-560(%[tmp]) \n\t" "punpcklwd $f28, $f28, $f28 \n\t" "mov.d $f30, $f28 \n\t" "paddh $f16, $f16, $f4 \n\t" "paddh $f18, $f18, $f6 \n\t" "gssqc1 $f30, $f28, 640-592(%[tmp]) \n\t" "paddh $f16, $f16, $f20 \n\t" "paddh $f18, $f18, $f22 \n\t" "paddh $f16, $f16, $f28 \n\t" "paddh $f18, $f18, $f30 \n\t" "gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t" "gslqc1 $f2, $f0, 640-384(%[tmp]) \n\t" "pandn $f24, $f24, $f28 \n\t" "pandn $f26, $f26, $f30 \n\t" "gssqc1 $f26, $f24, 640-80(%[tmp]) \n\t" "gslqc1 $f26, $f24, 0x0($12) \n\t" "dmtc1 %[iAlpha], $f10 \n\t" "punpckhbh $f26, $f24, 
$f8 \n\t" "punpcklbh $f24, $f24, $f8 \n\t" "psllh $f24, $f24, $f10 \n\t" "psllh $f26, $f26, $f10 \n\t" "paddh $f24, $f24, $f28 \n\t" "paddh $f26, $f26, $f30 \n\t" "paddh $f24, $f24, $f28 \n\t" "paddh $f26, $f26, $f30 \n\t" "paddh $f24, $f24, $f28 \n\t" "paddh $f26, $f26, $f30 \n\t" "paddh $f24, $f24, $f0 \n\t" "paddh $f26, $f26, $f2 \n\t" "dli %[iAlpha], 0x3 \n\t" "gslqc1 $f30, $f28, 640-480(%[tmp]) \n\t" "gslqc1 $f2, $f0, 640-592(%[tmp]) \n\t" "paddh $f24, $f24, $f20 \n\t" "paddh $f26, $f26, $f22 \n\t" "paddh $f24, $f24, $f4 \n\t" "paddh $f26, $f26, $f6 \n\t" "paddh $f24, $f24, $f0 \n\t" "paddh $f26, $f26, $f2 \n\t" "gslqc1 $f2, $f0, 640-560(%[tmp]) \n\t" "dmtc1 %[iAlpha], $f10 \n\t" "psrah $f24, $f24, $f10 \n\t" "psrah $f26, $f26, $f10 \n\t" "and $f24, $f24, $f0 \n\t" "and $f26, $f26, $f2 \n\t" "gssqc1 $f26, $f24, 640-112(%[tmp]) \n\t" "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t" "pandn $f24, $f24, $f28 \n\t" "pandn $f26, $f26, $f30 \n\t" "gssqc1 $f26, $f24, 640-336(%[tmp]) \n\t" "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t" "gssqc1 $f26, $f24, 640-528(%[tmp]) \n\t" "gslqc1 $f26, $f24, 640-368(%[tmp]) \n\t" "gslqc1 $f2, $f0, 640-544(%[tmp]) \n\t" "dmtc1 %[iAlpha], $f10 \n\t" "paddh $f24, $f24, $f28 \n\t" "paddh $f26, $f26, $f30 \n\t" "psrah $f16, $f16, $f10 \n\t" "psrah $f18, $f18, $f10 \n\t" "and $f16, $f16, $f0 \n\t" "and $f18, $f18, $f2 \n\t" "gslqc1 $f2, $f0, 640-624(%[tmp]) \n\t" "paddh $f28, $f4, $f20 \n\t" "paddh $f30, $f6, $f22 \n\t" "paddh $f24, $f24, $f28 \n\t" "paddh $f26, $f26, $f30 \n\t" "paddh $f24, $f24, $f0 \n\t" "paddh $f26, $f26, $f2 \n\t" "gslqc1 $f30, $f28, 640-528(%[tmp]) \n\t" "dli %[iAlpha], 0x2 \n\t" "dmtc1 %[iAlpha], $f10 \n\t" "paddh $f20, $f20, $f4 \n\t" "paddh $f22, $f22, $f6 \n\t" "psrah $f24, $f24, $f10 \n\t" "psrah $f26, $f26, $f10 \n\t" "and $f28, $f28, $f24 \n\t" "and $f30, $f30, $f26 \n\t" "gslqc1 $f26, $f24, 640-384(%[tmp]) \n\t" "gssqc1 $f30, $f28, 640-64(%[tmp]) \n\t" "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t" "pandn $f28, $f28, $f24 
\n\t" "pandn $f30, $f30, $f26 \n\t" "gssqc1 $f30, $f28, 640-304(%[tmp]) \n\t" "gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t" "gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t" "paddh $f28, $f28, $f24 \n\t" "paddh $f30, $f30, $f26 \n\t" "paddh $f28, $f28, $f20 \n\t" "paddh $f30, $f30, $f22 \n\t" "paddh $f28, $f28, $f8 \n\t" "paddh $f30, $f30, $f10 \n\t" "dmtc1 %[iAlpha], $f10 \n\t" "gslqc1 $f22, $f20, 640-560(%[tmp]) \n\t" "psrah $f28, $f28, $f10 \n\t" "psrah $f30, $f30, $f10 \n\t" "and $f20, $f20, $f28 \n\t" "and $f22, $f22, $f30 \n\t" "gssqc1 $f22, $f20, 640-32(%[tmp]) \n\t" "gslqc1 $f22, $f20, 640-480(%[tmp]) \n\t" "gslqc1 $f2, $f0, 640-592(%[tmp]) \n\t" "gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t" "paddh $f28, $f20, $f20 \n\t" "paddh $f30, $f22, $f22 \n\t" "paddh $f20, $f4, $f24 \n\t" "paddh $f22, $f6, $f26 \n\t" "paddh $f24, $f24, $f0 \n\t" "paddh $f26, $f26, $f2 \n\t" "paddh $f28, $f28, $f20 \n\t" "paddh $f30, $f30, $f22 \n\t" "paddh $f28, $f28, $f8 \n\t" "paddh $f30, $f30, $f10 \n\t" "dmtc1 %[iAlpha], $f10 \n\t" "gslqc1 $f22, $f20, 640-544(%[tmp]) \n\t" "psrah $f28, $f28, $f10 \n\t" "psrah $f30, $f30, $f10 \n\t" "dli %[iAlpha], 0x1 \n\t" "pandn $f20, $f20, $f28 \n\t" "pandn $f22, $f22, $f30 \n\t" "gslqc1 $f30, $f28, 640-480(%[tmp]) \n\t" "paddh $f28, $f28, $f4 \n\t" "paddh $f30, $f30, $f6 \n\t" "gslqc1 $f6, $f4, 640-400(%[tmp]) \n\t" "paddh $f28, $f28, $f4 \n\t" "paddh $f30, $f30, $f6 \n\t" "gslqc1 $f6, $f4, 640-544(%[tmp]) \n\t" "dmtc1 %[iAlpha], $f10 \n\t" "gssqc1 $f22, $f20, 640-352(%[tmp]) \n\t" "gslqc1 $f22, $f20, 640-368(%[tmp]) \n\t" "psllh $f28, $f28, $f10 \n\t" "psllh $f30, $f30, $f10 \n\t" "dli %[iAlpha], 0x3 \n\t" "paddh $f28, $f28, $f24 \n\t" "paddh $f30, $f30, $f26 \n\t" "paddh $f20, $f20, $f28 \n\t" "paddh $f22, $f22, $f30 \n\t" "dmtc1 %[iAlpha], $f10 \n\t" "dli %[iAlpha], 0x2 \n\t" "gslqc1 $f30, $f28, 640-400(%[tmp]) \n\t" "psrah $f20, $f20, $f10 \n\t" "psrah $f22, $f22, $f10 \n\t" "and $f4, $f4, $f20 \n\t" "and $f6, $f6, $f22 \n\t" "gslqc1 $f22, $f20, 
640-480(%[tmp]) \n\t" "gssqc1 $f6, $f4, 640-96(%[tmp]) \n\t" "gslqc1 $f6, $f4, 640-384(%[tmp]) \n\t" "gslqc1 $f10, $f8, 640-400(%[tmp]) \n\t" "paddh $f24, $f4, $f4 \n\t" "paddh $f26, $f6, $f6 \n\t" "paddh $f4, $f4, $f8 \n\t" "paddh $f6, $f6, $f10 \n\t" "gslqc1 $f10, $f8, 640-144(%[tmp]) \n\t" "paddh $f28, $f28, $f20 \n\t" "paddh $f30, $f30, $f22 \n\t" "paddh $f4, $f4, $f8 \n\t" "paddh $f6, $f6, $f10 \n\t" "gslqc1 $f10, $f8, 640-592(%[tmp]) \n\t" "paddh $f24, $f24, $f28 \n\t" "paddh $f26, $f26, $f30 \n\t" "paddh $f20, $f20, $f8 \n\t" "paddh $f22, $f22, $f10 \n\t" "gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t" "paddh $f24, $f24, $f8 \n\t" "dmtc1 %[iAlpha], $f8 \n\t" "paddh $f26, $f26, $f10 \n\t" "dli %[iAlpha], 0x1 \n\t" "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t" "dmtc1 %[iAlpha], $f10 \n\t" "psrah $f24, $f24, $f8 \n\t" "psrah $f26, $f26, $f8 \n\t" "psllh $f4, $f4, $f10 \n\t" "psllh $f6, $f6, $f10 \n\t" "paddh $f4, $f4, $f20 \n\t" "paddh $f6, $f6, $f22 \n\t" "dli %[iAlpha], 0x3 \n\t" "gslqc1 $f22, $f20, 656-272(%[tmp]) \n\t" "pandn $f28, $f28, $f24 \n\t" "pandn $f30, $f30, $f26 \n\t" "gslqc1 $f26, $f24, 640-416(%[tmp]) \n\t" "dmtc1 %[iAlpha], $f10 \n\t" "paddh $f24, $f24, $f4 \n\t" "paddh $f26, $f26, $f6 \n\t" "gslqc1 $f6, $f4, 640-560(%[tmp]) \n\t" "psrah $f24, $f24, $f10 \n\t" "psrah $f26, $f26, $f10 \n\t" "and $f4, $f4, $f24 \n\t" "and $f6, $f6, $f26 \n\t" "xor $f8, $f8, $f8 \n\t" "gslqc1 $f26, $f24, 704-272(%[tmp]) \n\t" "gssqc1 $f6, $f4, 640-128(%[tmp]) \n\t" "gslqc1 $f6, $f4, 672-272(%[tmp]) \n\t" "punpcklbh $f4, $f6, $f8 \n\t" "punpckhbh $f6, $f6, $f8 \n\t" "gssqc1 $f6, $f4, 640-448(%[tmp]) \n\t" "gslqc1 $f6, $f4, 688-272(%[tmp]) \n\t" "punpcklbh $f4, $f6, $f8 \n\t" "punpckhbh $f6, $f6, $f8 \n\t" "punpcklbh $f24, $f26, $f8 \n\t" "punpckhbh $f26, $f26, $f8 \n\t" "gssqc1 $f30, $f28, 640-288(%[tmp]) \n\t" "punpcklbh $f20, $f22, $f8 \n\t" "punpckhbh $f22, $f22, $f8 \n\t" "gslqc1 $f30, $f28, 0x0($14) \n\t" "gssqc1 $f6, $f4, 640-496(%[tmp]) \n\t" "gssqc1 $f26, $f24, 
640-432(%[tmp]) \n\t" "gsldxc1 $f0, 0x8($15, $0) \n\t" "punpcklbh $f28, $f30, $f8 \n\t" "punpckhbh $f30, $f30, $f8 \n\t" "gssqc1 $f30, $f28, 640-464(%[tmp]) \n\t" "punpcklbh $f28, $f0, $f8 \n\t" "punpckhbh $f30, $f0, $f8 \n\t" "gslqc1 $f10, $f8, 640-464(%[tmp]) \n\t" "gssqc1 $f30, $f28, 640-528(%[tmp]) \n\t" "psubh $f28, $f24, $f4 \n\t" "psubh $f30, $f26, $f6 \n\t" "psubh $f24, $f24, $f8 \n\t" "psubh $f26, $f26, $f10 \n\t" WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10) "gslqc1 $f10, $f8, 640-16(%[tmp]) \n\t" "gssqc1 $f30, $f28, 640-560(%[tmp]) \n\t" "or $f16, $f16, $f8 \n\t" "or $f18, $f18, $f10 \n\t" WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30) "gslqc1 $f30, $f28, 640-448(%[tmp]) \n\t" "psubh $f28, $f4, $f28 \n\t" "psubh $f30, $f6, $f30 \n\t" "gslqc1 $f2, $f0, 640-512(%[tmp]) \n\t" WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10) "pcmpgth $f4, $f0, $f28 \n\t" "pcmpgth $f6, $f2, $f30 \n\t" "pcmpgth $f28, $f0, $f24 \n\t" "pcmpgth $f30, $f2, $f26 \n\t" "gslqc1 $f26, $f24, 640-320(%[tmp]) \n\t" "and $f4, $f4, $f28 \n\t" "and $f6, $f6, $f30 \n\t" "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t" "pcmpgth $f24, $f24, $f28 \n\t" "pcmpgth $f26, $f26, $f30 \n\t" "and $f4, $f4, $f24 \n\t" "and $f6, $f6, $f26 \n\t" "gslqc1 $f26, $f24, 640-576(%[tmp]) \n\t" "pcmpgth $f24, $f24, $f28 \n\t" "pcmpgth $f26, $f26, $f30 \n\t" "xor $f8, $f8, $f8 \n\t" "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t" "punpcklbh $f12, $f14, $f8 \n\t" "punpckhbh $f14, $f14, $f8 \n\t" "gssqc1 $f26, $f24, 640-560(%[tmp]) \n\t" "gslqc1 $f26, $f24, 640-512(%[tmp]) \n\t" "psubh $f28, $f28, $f20 \n\t" "psubh $f30, $f30, $f22 \n\t" WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10) "pcmpgth $f24, $f24, $f28 \n\t" "pcmpgth $f26, $f26, $f30 \n\t" "dli %[iAlpha], 0x1 \n\t" "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t" "and $f24, $f24, $f8 \n\t" "and $f26, $f26, $f10 \n\t" "gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t" "gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t" "psubh $f28, $f28, $f8 \n\t" "psubh $f30, $f30, $f10 \n\t" "dmtc1 %[iAlpha], $f10 \n\t" 
"psllh $f12, $f12, $f10 \n\t" "psllh $f14, $f14, $f10 \n\t" "gssqc1 $f26, $f24, 640-544(%[tmp]) \n\t" "gslqc1 $f26, $f24, 640-512(%[tmp]) \n\t" "gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t" "paddh $f12, $f12, $f20 \n\t" "paddh $f14, $f14, $f22 \n\t" "paddh $f12, $f12, $f20 \n\t" "paddh $f14, $f14, $f22 \n\t" "paddh $f12, $f12, $f20 \n\t" "paddh $f14, $f14, $f22 \n\t" "paddh $f12, $f12, $f8 \n\t" "paddh $f14, $f14, $f10 \n\t" "gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t" "gslqc1 $f2, $f0, 640-560(%[tmp]) \n\t" "paddh $f12, $f12, $f8 \n\t" "paddh $f14, $f14, $f10 \n\t" WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10) "pcmpgth $f24, $f24, $f28 \n\t" "pcmpgth $f26, $f26, $f30 \n\t" "and $f24, $f24, $f0 \n\t" "and $f26, $f26, $f2 \n\t" "gssqc1 $f26, $f24, 640-560(%[tmp]) \n\t" "gslqc1 $f10, $f8, 640-544(%[tmp]) \n\t" "gslqc1 $f2, $f0, 736-272(%[tmp]) \n\t" "dli %[iAlpha], 0x3 \n\t" "gslqc1 $f30, $f28, 640-368(%[tmp]) \n\t" "and $f24, $f0, $f16 \n\t" "and $f26, $f2, $f18 \n\t" "pandn $f16, $f0, $f28 \n\t" "pandn $f18, $f2, $f30 \n\t" "or $f24, $f24, $f16 \n\t" "or $f26, $f26, $f18 \n\t" "gslqc1 $f18, $f16, 640-432(%[tmp]) \n\t" "paddh $f12, $f12, $f16 \n\t" "paddh $f14, $f14, $f18 \n\t" "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t" "paddh $f12, $f12, $f28 \n\t" "paddh $f14, $f14, $f30 \n\t" "dmtc1 %[iAlpha], $f28 \n\t" "psrah $f12, $f12, $f28 \n\t" "psrah $f14, $f14, $f28 \n\t" "and $f12, $f12, $f8 \n\t" "and $f14, $f14, $f10 \n\t" "pandn $f8, $f8, $f20 \n\t" "pandn $f10, $f10, $f22 \n\t" "or $f12, $f12, $f8 \n\t" "or $f14, $f14, $f10 \n\t" "and $f28, $f4, $f12 \n\t" "and $f30, $f6, $f14 \n\t" "gslqc1 $f14, $f12, 640-64(%[tmp]) \n\t" "gslqc1 $f10, $f8, 640-336(%[tmp]) \n\t" "or $f12, $f12, $f8 \n\t" "or $f14, $f14, $f10 \n\t" "pandn $f8, $f4, $f20 \n\t" "pandn $f10, $f6, $f22 \n\t" "or $f28, $f28, $f8 \n\t" "or $f30, $f30, $f10 \n\t" "dli %[iAlpha], 0x2 \n\t" "and $f8, $f0, $f12 \n\t" "and $f10, $f2, $f14 \n\t" "gslqc1 $f14, $f12, 640-480(%[tmp]) \n\t" "pandn $f12, $f0, $f12 \n\t" "pandn 
$f14, $f2, $f14 \n\t" "or $f8, $f8, $f12 \n\t" "or $f10, $f10, $f14 \n\t" "packushb $f24, $f24, $f26 \n\t" "packushb $f26, $f28, $f30 \n\t" "gssqc1 $f10, $f8, 640-336(%[tmp]) \n\t" "gssqc1 $f26, $f24, 656-272(%[tmp]) \n\t" "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t" "gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t" "paddh $f8, $f20, $f8 \n\t" "paddh $f10, $f22, $f10 \n\t" "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t" "paddh $f28, $f28, $f16 \n\t" "paddh $f30, $f30, $f18 \n\t" "paddh $f8, $f8, $f28 \n\t" "paddh $f10, $f10, $f30 \n\t" "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t" "paddh $f8, $f8, $f28 \n\t" "paddh $f10, $f10, $f30 \n\t" "dmtc1 %[iAlpha], $f28 \n\t" "psrah $f8, $f8, $f28 \n\t" "psrah $f10, $f10, $f28 \n\t" "dli %[iAlpha], 0x1 \n\t" "gslqc1 $f30, $f28, 640-544(%[tmp]) \n\t" "and $f24, $f24, $f8 \n\t" "and $f26, $f26, $f10 \n\t" "gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t" "pandn $f28, $f28, $f8 \n\t" "pandn $f30, $f30, $f10 \n\t" "or $f24, $f24, $f28 \n\t" "or $f26, $f26, $f30 \n\t" "and $f12, $f4, $f24 \n\t" "and $f14, $f6, $f26 \n\t" "pandn $f24, $f4, $f8 \n\t" "pandn $f26, $f6, $f10 \n\t" "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t" "paddh $f8, $f8, $f28 \n\t" "paddh $f10, $f10, $f30 \n\t" "paddh $f8, $f8, $f16 \n\t" "paddh $f10, $f10, $f18 \n\t" "or $f12, $f12, $f24 \n\t" "or $f14, $f14, $f26 \n\t" "gslqc1 $f26, $f24, 640-336(%[tmp]) \n\t" "dmtc1 %[iAlpha], $f28 \n\t" "packushb $f24, $f24, $f26 \n\t" "packushb $f26, $f12, $f14 \n\t" "psllh $f8, $f8, $f28 \n\t" "psllh $f10, $f10, $f28 \n\t" "gssqc1 $f26, $f24, 672-272(%[tmp]) \n\t" "gslqc1 $f26, $f24, 640-96(%[tmp]) \n\t" "gslqc1 $f30, $f28, 640-352(%[tmp]) \n\t" "or $f24, $f24, $f28 \n\t" "or $f26, $f26, $f30 \n\t" "dli %[iAlpha], 0x3 \n\t" "and $f12, $f0, $f24 \n\t" "and $f14, $f2, $f26 \n\t" "gslqc1 $f26, $f24, 640-144(%[tmp]) \n\t" "pandn $f24, $f0, $f24 \n\t" "pandn $f26, $f2, $f26 \n\t" "or $f12, $f12, $f24 \n\t" "or $f14, $f14, $f26 \n\t" "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t" "gssqc1 $f14, $f12, 640-352(%[tmp]) 
\n\t" "gslqc1 $f14, $f12, 640-464(%[tmp]) \n\t" "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t" "paddh $f12, $f12, $f28 \n\t" "paddh $f14, $f14, $f30 \n\t" "paddh $f8, $f8, $f12 \n\t" "paddh $f10, $f10, $f14 \n\t" "gslqc1 $f14, $f12, 640-448(%[tmp]) \n\t" "paddh $f20, $f20, $f8 \n\t" "paddh $f22, $f22, $f10 \n\t" "dmtc1 %[iAlpha], $f28 \n\t" "gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t" "psrah $f20, $f20, $f28 \n\t" "psrah $f22, $f22, $f28 \n\t" "and $f24, $f24, $f20 \n\t" "and $f26, $f26, $f22 \n\t" "gslqc1 $f22, $f20, 640-464(%[tmp]) \n\t" "paddh $f8, $f8, $f20 \n\t" "paddh $f10, $f10, $f22 \n\t" "gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t" "dli %[iAlpha], 0x2 \n\t" "paddh $f20, $f20, $f28 \n\t" "paddh $f22, $f22, $f30 \n\t" "paddh $f16, $f12, $f12 \n\t" "paddh $f18, $f14, $f14 \n\t" "paddh $f16, $f16, $f8 \n\t" "paddh $f18, $f18, $f10 \n\t" "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t" "paddh $f16, $f16, $f28 \n\t" "paddh $f18, $f18, $f30 \n\t" "gslqc1 $f10, $f8, 640-544(%[tmp]) \n\t" "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t" "paddh $f12, $f12, $f28 \n\t" "paddh $f14, $f14, $f30 \n\t" "dmtc1 %[iAlpha], $f28 \n\t" "psrah $f16, $f16, $f28 \n\t" "psrah $f18, $f18, $f28 \n\t" "pandn $f8, $f8, $f16 \n\t" "pandn $f10, $f10, $f18 \n\t" "or $f24, $f24, $f8 \n\t" "or $f26, $f26, $f10 \n\t" "and $f28, $f4, $f24 \n\t" "and $f30, $f6, $f26 \n\t" "gslqc1 $f26, $f24, 640-496(%[tmp]) \n\t" "pandn $f8, $f4, $f24 \n\t" "pandn $f10, $f6, $f26 \n\t" "or $f28, $f28, $f8 \n\t" "or $f30, $f30, $f10 \n\t" "gslqc1 $f10, $f8, 640-352(%[tmp]) \n\t" "packushb $f8, $f8, $f10 \n\t" "packushb $f10, $f28, $f30 \n\t" "gssqc1 $f10, $f8, 688-272(%[tmp]) \n\t" "gslqc1 $f10, $f8, 640-128(%[tmp]) \n\t" "gslqc1 $f30, $f28, 640-288(%[tmp]) \n\t" "or $f8, $f8, $f28 \n\t" "or $f10, $f10, $f30 \n\t" "dli %[iAlpha], 0x1 \n\t" "and $f16, $f0, $f8 \n\t" "and $f18, $f2, $f10 \n\t" "paddh $f20, $f20, $f24 \n\t" "paddh $f22, $f22, $f26 \n\t" "gslqc1 $f30, $f28, 640-400(%[tmp]) \n\t" "pandn $f8, $f0, $f28 \n\t" "pandn $f10, 
$f2, $f30 \n\t" "or $f16, $f16, $f8 \n\t" "or $f18, $f18, $f10 \n\t" "dmtc1 %[iAlpha], $f28 \n\t" "gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t" "dli %[iAlpha], 0x3 \n\t" "psllh $f20, $f20, $f28 \n\t" "psllh $f22, $f22, $f28 \n\t" "paddh $f20, $f20, $f12 \n\t" "paddh $f22, $f22, $f14 \n\t" "dmtc1 %[iAlpha], $f28 \n\t" "gslqc1 $f14, $f12, 640-560(%[tmp]) \n\t" "paddh $f8, $f8, $f20 \n\t" "paddh $f10, $f10, $f22 \n\t" "psrah $f8, $f8, $f28 \n\t" "psrah $f10, $f10, $f28 \n\t" "gssqc1 $f18, $f16, 640-288(%[tmp]) \n\t" "gslqc1 $f18, $f16, 640-560(%[tmp]) \n\t" "and $f16, $f16, $f8 \n\t" "and $f18, $f18, $f10 \n\t" "gslqc1 $f10, $f8, 640-464(%[tmp]) \n\t" "paddh $f20, $f8, $f8 \n\t" "paddh $f22, $f10, $f10 \n\t" "gslqc1 $f10, $f8, 640-432(%[tmp]) \n\t" "gslqc1 $f30, $f28, 640-448(%[tmp]) \n\t" "paddh $f8, $f8, $f28 \n\t" "paddh $f10, $f10, $f30 \n\t" "dli %[iAlpha], 0x2 \n\t" "paddh $f20, $f20, $f8 \n\t" "paddh $f22, $f22, $f10 \n\t" "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t" "paddh $f20, $f20, $f28 \n\t" "paddh $f22, $f22, $f30 \n\t" "dmtc1 %[iAlpha], $f28 \n\t" "gslqc1 $f26, $f24, 640-560(%[tmp]) \n\t" "psrah $f20, $f20, $f28 \n\t" "psrah $f22, $f22, $f28 \n\t" "pandn $f12, $f12, $f20 \n\t" "pandn $f14, $f14, $f22 \n\t" "or $f16, $f16, $f12 \n\t" "or $f18, $f18, $f14 \n\t" "gslqc1 $f14, $f12, 640-32(%[tmp]) \n\t" "gslqc1 $f30, $f28, 640-304(%[tmp]) \n\t" "or $f12, $f12, $f28 \n\t" "or $f14, $f14, $f30 \n\t" "and $f28, $f4, $f16 \n\t" "and $f30, $f6, $f18 \n\t" "gslqc1 $f18, $f16, 640-432(%[tmp]) \n\t" "gslqc1 $f22, $f20, 640-464(%[tmp]) \n\t" "pandn $f8, $f4, $f16 \n\t" "pandn $f10, $f6, $f18 \n\t" "or $f28, $f28, $f8 \n\t" "or $f30, $f30, $f10 \n\t" "gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t" "paddh $f16, $f16, $f8 \n\t" "paddh $f18, $f18, $f10 \n\t" "gslqc1 $f10, $f8, 640-288(%[tmp]) \n\t" "packushb $f8, $f8, $f10 \n\t" "packushb $f10, $f28, $f30 \n\t" "dli %[iAlpha], 0x2 \n\t" "gssqc1 $f10, $f8, 704-272(%[tmp]) \n\t" "and $f8, $f0, $f12 \n\t" "and $f10, $f2, $f14 \n\t" 
"gslqc1 $f30, $f28, 640-384(%[tmp]) \n\t" "pandn $f12, $f0, $f28 \n\t" "pandn $f14, $f2, $f30 \n\t" "or $f8, $f8, $f12 \n\t" "or $f10, $f10, $f14 \n\t" "gssqc1 $f10, $f8, 640-304(%[tmp]) \n\t" "gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t" "gslqc1 $f30, $f28, 640-464(%[tmp]) \n\t" "paddh $f12, $f8, $f28 \n\t" "paddh $f14, $f10, $f30 \n\t" "paddh $f12, $f12, $f16 \n\t" "paddh $f14, $f14, $f18 \n\t" "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t" "paddh $f12, $f12, $f28 \n\t" "paddh $f14, $f14, $f30 \n\t" "dmtc1 %[iAlpha], $f28 \n\t" "psrah $f12, $f12, $f28 \n\t" "psrah $f14, $f14, $f28 \n\t" "and $f24, $f24, $f12 \n\t" "and $f26, $f26, $f14 \n\t" "gslqc1 $f14, $f12, 640-560(%[tmp]) \n\t" "pandn $f16, $f12, $f20 \n\t" "pandn $f18, $f14, $f22 \n\t" "or $f24, $f24, $f16 \n\t" "or $f26, $f26, $f18 \n\t" "and $f28, $f4, $f24 \n\t" "and $f30, $f6, $f26 \n\t" "gslqc1 $f26, $f24, 640-304(%[tmp]) \n\t" "pandn $f16, $f4, $f20 \n\t" "pandn $f18, $f6, $f22 \n\t" "or $f28, $f28, $f16 \n\t" "or $f30, $f30, $f18 \n\t" "dli %[iAlpha], 0x1 \n\t" "packushb $f24, $f24, $f26 \n\t" "packushb $f26, $f28, $f30 \n\t" "gslqc1 $f30, $f28, 640-112(%[tmp]) \n\t" "gslqc1 $f18, $f16, 640-80(%[tmp]) \n\t" "or $f28, $f28, $f16 \n\t" "or $f30, $f30, $f18 \n\t" "and $f16, $f0, $f28 \n\t" "and $f18, $f2, $f30 \n\t" "gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t" "pandn $f0, $f0, $f28 \n\t" "pandn $f2, $f2, $f30 \n\t" "or $f16, $f16, $f0 \n\t" "or $f18, $f18, $f2 \n\t" "xor $f28, $f28, $f28 \n\t" "xor $f30, $f30, $f30 \n\t" "gslqc1 $f2, $f0, 0x0($12) \n\t" "dmtc1 %[iAlpha], $f28 \n\t" "punpcklbh $f0, $f2, $f30 \n\t" "punpckhbh $f2, $f2, $f30 \n\t" "psllh $f0, $f0, $f28 \n\t" "psllh $f2, $f2, $f28 \n\t" "paddh $f0, $f0, $f8 \n\t" "paddh $f2, $f2, $f10 \n\t" "paddh $f0, $f0, $f8 \n\t" "paddh $f2, $f2, $f10 \n\t" "paddh $f0, $f0, $f8 \n\t" "paddh $f2, $f2, $f10 \n\t" "paddh $f0, $f0, $f20 \n\t" "paddh $f2, $f2, $f22 \n\t" "dli %[iAlpha], 0x3 \n\t" "gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t" "paddh $f0, $f0, $f28 \n\t" 
"paddh $f2, $f2, $f30 \n\t" "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t" "paddh $f0, $f0, $f28 \n\t" "paddh $f2, $f2, $f30 \n\t" "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t" "paddh $f0, $f0, $f28 \n\t" "paddh $f2, $f2, $f30 \n\t" "dmtc1 %[iAlpha], $f28 \n\t" "psrah $f0, $f0, $f28 \n\t" "psrah $f2, $f2, $f28 \n\t" "and $f0, $f0, $f12 \n\t" "and $f2, $f2, $f14 \n\t" "pandn $f12, $f12, $f8 \n\t" "pandn $f14, $f14, $f10 \n\t" "or $f0, $f0, $f12 \n\t" "or $f2, $f2, $f14 \n\t" "and $f28, $f4, $f0 \n\t" "and $f30, $f6, $f2 \n\t" "gslqc1 $f2, $f0, 656-272(%[tmp]) \n\t" "gssqc1 $f2, $f0, 0x0($11) \n\t" "gslqc1 $f2, $f0, 672-272(%[tmp]) \n\t" "gssqc1 $f2, $f0, 0x0($8) \n\t" "gslqc1 $f2, $f0, 688-272(%[tmp]) \n\t" "gssqc1 $f2, $f0, 0x0($9) \n\t" "gslqc1 $f2, $f0, 704-272(%[tmp]) \n\t" "pandn $f4, $f4, $f8 \n\t" "pandn $f6, $f6, $f10 \n\t" "gssqc1 $f2, $f0, 0x0(%[pPix]) \n\t" "or $f28, $f28, $f4 \n\t" "or $f30, $f30, $f6 \n\t" "packushb $f16, $f16, $f18 \n\t" "packushb $f18, $f28, $f30 \n\t" "gssqc1 $f26, $f24, 0x0($13) \n\t" "gssqc1 $f18, $f16, 0x0(%[iStride]) \n\t" : [pPix]"+&r"((unsigned char *)pPix) : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp) : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30" ); RECOVER_REG; } void DeblockChromaLt4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTC) { unsigned char tmp[256] __attribute__((aligned(32))); BACKUP_REG; __asm__ volatile ( ".set arch=loongson3a \n\t" "lb $8, 0x2(%[pTC]) \n\t" "lb $9, 0x3(%[pTC]) \n\t" "move $11, $8 \n\t" "lb $8, 0x1(%[pTC]) \n\t" "lb %[pTC], 0x0(%[pTC]) \n\t" "move $12, %[pTC] \n\t" "and %[pTC], $9, 0xFFFF \n\t" "dmtc1 %[pTC], $f4 \n\t" "and %[pTC], $9, 0xFFFF \n\t" "dmtc1 %[pTC], $f8 \n\t" "move %[pTC], $11 \n\t" "and $9, %[pTC], 0xFFFF \n\t" "and %[pTC], %[pTC], 0xFFFF \n\t" 
"dmtc1 %[pTC], $f16 \n\t" "and %[pTC], $8, 0xFFFF \n\t" "dmtc1 %[pTC], $f20 \n\t" "dmtc1 $9, $f12 \n\t" "and %[pTC], $8, 0xFFFF \n\t" "dmtc1 %[pTC], $f24 \n\t" "move %[pTC], $12 \n\t" "and $9, %[pTC], 0xFFFF \n\t" "and %[pTC], %[pTC], 0xFFFF \n\t" "punpcklhw $f24, $f24, $f8 \n\t" "xor $f0, $f0, $f0 \n\t" "xor $f2, $f2, $f2 \n\t" "gssqc1 $f2, $f0, 0x40(%[tmp]) \n\t" "dmtc1 $9, $f28 \n\t" "dmtc1 %[pTC], $f0 \n\t" "daddu %[pTC], %[iStride], %[iStride] \n\t" "dsubu $9, %[pPixCb], %[pTC] \n\t" "punpcklhw $f20, $f20, $f4 \n\t" "gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t" "punpcklhw $f0, $f0, $f16 \n\t" "gsldxc1 $f16, 0x0(%[iStride], %[pPixCr]) \n\t" "punpcklhw $f28, $f28, $f12 \n\t" "gsldxc1 $f12, 0x0(%[pPixCb], $0) \n\t" "punpcklhw $f0, $f0, $f24 \n\t" "gsldxc1 $f24, 0x0($9, $0) \n\t" "punpcklhw $f28, $f28, $f20 \n\t" "punpckhhw $f2, $f0, $f28 \n\t" "punpcklhw $f0, $f0, $f28 \n\t" "dsubu $9, %[pPixCr], %[pTC] \n\t" "psubh $f8, $f4, $f0 \n\t" "psubh $f10, $f6, $f2 \n\t" "gssqc1 $f10, $f8, 0x60(%[tmp]) \n\t" "gsldxc1 $f8, 0x0($9, $0) \n\t" "mov.d $f26, $f8 \n\t" "dsubu %[pTC], %[pPixCb], %[iStride] \n\t" "gsldxc1 $f28, 0x0(%[pTC], $0) \n\t" "dsubu $9, %[pPixCr], %[iStride] \n\t" "gsldxc1 $f8, 0x0($9, $0) \n\t" "mov.d $f30, $f8 \n\t" "gsldxc1 $f8, 0x0(%[pPixCr], $0) \n\t" "mov.d $f14, $f8 \n\t" "gsldxc1 $f8, 0x0(%[iStride], %[pPixCb]) \n\t" "mov.d $f10, $f16 \n\t" "gssqc1 $f10, $f8, 0xE0(%[tmp]) \n\t" "dmtc1 %[iAlpha], $f8 \n\t" "punpcklhw $f16, $f8, $f8 \n\t" "dmtc1 %[iBeta], $f8 \n\t" "punpcklhw $f20, $f8, $f8 \n\t" "punpcklwd $f8, $f20, $f20 \n\t" "mov.d $f10, $f8 \n\t" "gssqc1 $f10, $f8, 0x50(%[tmp]) \n\t" "punpckhbh $f10, $f24, $f4 \n\t" "punpcklbh $f8, $f24, $f4 \n\t" "gssqc1 $f14, $f12, 0xd0(%[tmp]) \n\t" "punpcklwd $f16, $f16, $f16 \n\t" "mov.d $f18, $f16 \n\t" "gssqc1 $f10, $f8, 0x30(%[tmp]) \n\t" "punpcklbh $f24, $f26, $f6 \n\t" "punpckhbh $f26, $f26, $f6 \n\t" "gssqc1 $f26, $f24, 0x80(%[tmp]) \n\t" "gslqc1 $f26, $f24, 0xd0(%[tmp]) \n\t" "punpcklbh $f24, $f26, $f6 \n\t" 
"punpckhbh $f26, $f26, $f6 \n\t" "gssqc1 $f26, $f24, 0x70(%[tmp]) \n\t" "gslqc1 $f26, $f24, 0xe0(%[tmp]) \n\t" "punpcklbh $f24, $f26, $f6 \n\t" "punpckhbh $f26, $f26, $f6 \n\t" "gssqc1 $f26, $f24, 0x90(%[tmp]) \n\t" "gslqc1 $f22, $f20, 0xe0(%[tmp]) \n\t" "mov.d $f8, $f28 \n\t" "mov.d $f10, $f30 \n\t" "punpcklbh $f28, $f30, $f6 \n\t" "punpckhbh $f30, $f30, $f6 \n\t" "punpckhbh $f22, $f20, $f4 \n\t" "punpcklbh $f20, $f20, $f4 \n\t" "gssqc1 $f30, $f28, 0xa0(%[tmp]) \n\t" "punpckhbh $f14, $f12, $f4 \n\t" "punpcklbh $f12, $f12, $f4 \n\t" "dli %[iBeta], 0x4 \n\t" "punpckhbh $f10, $f8, $f4 \n\t" "punpcklbh $f8, $f8, $f4 \n\t" "dmtc1 %[iBeta], $f24 \n\t" "punpcklhw $f28, $f24, $f24 \n\t" "punpcklwd $f24, $f28, $f28 \n\t" "mov.d $f26, $f24 \n\t" "gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t" "gssqc1 $f26, $f24, 0x20(%[tmp]) \n\t" "psubh $f28, $f28, $f20 \n\t" "psubh $f30, $f30, $f22 \n\t" "pcmpgth $f24, $f0, $f4 \n\t" "pcmpgth $f26, $f2, $f6 \n\t" "gslqc1 $f6, $f4, 0x60(%[tmp]) \n\t" "gssqc1 $f26, $f24, 0x40(%[tmp]) \n\t" "psubh $f24, $f12, $f8 \n\t" "psubh $f26, $f14, $f10 \n\t" "dmfc1 %[iAlpha], $f12 \n\t" "dmfc1 %[iBeta], $f14 \n\t" "dli $10, 0x2 \n\t" "dmtc1 $10, $f12 \n\t" "dli $10, 0x3 \n\t" "dmtc1 $10, $f14 \n\t" "psllh $f24, $f24, $f12 \n\t" "psllh $f26, $f26, $f12 \n\t" "paddh $f24, $f24, $f28 \n\t" "paddh $f26, $f26, $f30 \n\t" "gslqc1 $f30, $f28, 0x20(%[tmp]) \n\t" "paddh $f24, $f24, $f28 \n\t" "paddh $f26, $f26, $f30 \n\t" "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t" "psrah $f24, $f24, $f14 \n\t" "psrah $f26, $f26, $f14 \n\t" "dmtc1 %[iAlpha], $f12 \n\t" "dmtc1 %[iBeta], $f14 \n\t" "pmaxsh $f4, $f4, $f24 \n\t" "pmaxsh $f6, $f6, $f26 \n\t" "gssqc1 $f2, $f0, 0x10(%[tmp]) \n\t" "gslqc1 $f26, $f24, 0x10(%[tmp]) \n\t" "pminsh $f24, $f24, $f4 \n\t" "pminsh $f26, $f26, $f6 \n\t" "gssqc1 $f26, $f24, 0x10(%[tmp]) \n\t" "psubh $f4, $f8, $f12 \n\t" "psubh $f6, $f10, $f14 \n\t" WELS_AbsH($f4, $f6, $f4, $f6, $f24, $f26) "pcmpgth $f24, $f16, $f4 \n\t" "pcmpgth $f26, $f18, $f6 \n\t" "gslqc1 
$f6, $f4, 0x30(%[tmp]) \n\t" "psubh $f4, $f4, $f8 \n\t" "psubh $f6, $f6, $f10 \n\t" "dmfc1 %[iAlpha], $f8 \n\t" "dmfc1 %[iBeta], $f10 \n\t" WELS_AbsH($f4, $f6, $f4, $f6, $f8, $f10) "pcmpgth $f28, $f28, $f4 \n\t" "pcmpgth $f30, $f30, $f6 \n\t" "gslqc1 $f6, $f4, 0x50(%[tmp]) \n\t" "and $f24, $f24, $f28 \n\t" "and $f26, $f26, $f30 \n\t" "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t" "psubh $f20, $f20, $f12 \n\t" "psubh $f22, $f22, $f14 \n\t" WELS_AbsH($f20, $f22, $f20, $f22, $f8, $f10) "pcmpgth $f4, $f4, $f20 \n\t" "pcmpgth $f6, $f6, $f22 \n\t" "gslqc1 $f22, $f20, 0x80(%[tmp]) \n\t" "gslqc1 $f10, $f8, 0x90(%[tmp]) \n\t" "psubh $f20, $f20, $f8 \n\t" "psubh $f22, $f22, $f10 \n\t" "and $f24, $f24, $f4 \n\t" "and $f26, $f26, $f6 \n\t" "gslqc1 $f10, $f8, 0x40(%[tmp]) \n\t" "and $f24, $f24, $f8 \n\t" "and $f26, $f26, $f10 \n\t" "gslqc1 $f6, $f4, 0x10(%[tmp]) \n\t" "and $f4, $f4, $f24 \n\t" "and $f6, $f6, $f26 \n\t" "gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t" "gssqc1 $f6, $f4, 0x30(%[tmp]) \n\t" "gslqc1 $f6, $f4, 0xa0(%[tmp]) \n\t" "psubh $f24, $f24, $f4 \n\t" "psubh $f26, $f26, $f6 \n\t" "dli $10, 0x2 \n\t" "dmtc1 $10, $f8 \n\t" "psllh $f24, $f24, $f8 \n\t" "psllh $f26, $f26, $f8 \n\t" "paddh $f24, $f24, $f20 \n\t" "paddh $f26, $f26, $f22 \n\t" "dli $10, 0x3 \n\t" "gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t" "paddh $f24, $f24, $f8 \n\t" "paddh $f26, $f26, $f10 \n\t" "dmtc1 $10, $f8 \n\t" "gslqc1 $f22, $f20, 0x60(%[tmp]) \n\t" "psrah $f24, $f24, $f8 \n\t" "psrah $f26, $f26, $f8 \n\t" "pmaxsh $f20, $f20, $f24 \n\t" "pmaxsh $f22, $f22, $f26 \n\t" "pminsh $f0, $f0, $f20 \n\t" "pminsh $f2, $f2, $f22 \n\t" "gslqc1 $f22, $f20, 0x70(%[tmp]) \n\t" "psubh $f24, $f4, $f20 \n\t" "psubh $f26, $f6, $f22 \n\t" WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10) "pcmpgth $f16, $f16, $f24 \n\t" "pcmpgth $f18, $f18, $f26 \n\t" "gslqc1 $f26, $f24, 0x80(%[tmp]) \n\t" "psubh $f24, $f24, $f4 \n\t" "psubh $f26, $f26, $f6 \n\t" WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10) "pcmpgth $f28, $f28, $f24 \n\t" "pcmpgth $f30, $f30, 
$f26 \n\t" "gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t" "and $f16, $f16, $f28 \n\t" "and $f18, $f18, $f30 \n\t" "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t" "psubh $f24, $f24, $f20 \n\t" "psubh $f26, $f26, $f22 \n\t" WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10) "dmtc1 %[iAlpha], $f8 \n\t" "dmtc1 %[iBeta], $f10 \n\t" "pcmpgth $f28, $f28, $f24 \n\t" "pcmpgth $f30, $f30, $f26 \n\t" "and $f16, $f16, $f28 \n\t" "and $f18, $f18, $f30 \n\t" "gslqc1 $f26, $f24, 0x40(%[tmp]) \n\t" "and $f16, $f16, $f24 \n\t" "and $f18, $f18, $f26 \n\t" "and $f0, $f0, $f16 \n\t" "and $f2, $f2, $f18 \n\t" "gslqc1 $f18, $f16, 0x30(%[tmp]) \n\t" "paddh $f8, $f8, $f16 \n\t" "paddh $f10, $f10, $f18 \n\t" "paddh $f4, $f4, $f0 \n\t" "paddh $f6, $f6, $f2 \n\t" "packushb $f8, $f8, $f10 \n\t" "packushb $f10, $f4, $f6 \n\t" "gssdxc1 $f8, 0x0(%[pTC], $0) \n\t" "psubh $f12, $f12, $f16 \n\t" "psubh $f14, $f14, $f18 \n\t" "psubh $f20, $f20, $f0 \n\t" "psubh $f22, $f22, $f2 \n\t" "packushb $f12, $f12, $f14 \n\t" "packushb $f14, $f20, $f22 \n\t" "gssdxc1 $f12, 0x0(%[pPixCb], $0) \n\t" "gssdxc1 $f10, 0x0($9, $0) \n\t" "gssdxc1 $f14, 0x0(%[pPixCr], $0) \n\t" : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr) : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta), [pTC]"r"((unsigned char *)pTC), [tmp]"r"((unsigned char *)tmp) : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30" ); RECOVER_REG; } void DeblockChromaEq4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) { unsigned char tmp[128] __attribute__((aligned(32))); BACKUP_REG; __asm__ volatile ( ".set arch=loongson3a \n\t" "daddu $8, %[iStride], %[iStride] \n\t" "dsubu $9, %[pPixCb], $8 \n\t" "gsldxc1 $f16, 0x0(%[pPixCr], $0) \n\t" "gsldxc1 $f20, 0x0(%[iStride], %[pPixCr]) \n\t" "gsldxc1 $f4, 0x0($9, $0) \n\t" "dsubu $9, %[pPixCr], $8 \n\t" "gsldxc1 $f8, 0x0($9, $0) \n\t" 
"mov.d $f6, $f8 \n\t" "dsubu $8, %[pPixCb], %[iStride] \n\t" "gsldxc1 $f8, 0x0($8, $0) \n\t" "dsubu $9, %[pPixCr], %[iStride] \n\t" "gsldxc1 $f12, 0x0($9, $0) \n\t" "mov.d $f10, $f12 \n\t" "gsldxc1 $f12, 0x0(%[pPixCb], $0) \n\t" "mov.d $f14, $f16 \n\t" "gsldxc1 $f16, 0x0(%[iStride], %[pPixCb]) \n\t" "mov.d $f18, $f20 \n\t" "dmtc1 %[iAlpha], $f20 \n\t" "xor $f0, $f0, $f0 \n\t" "xor $f2, $f2, $f2 \n\t" "punpcklhw $f24, $f20, $f20 \n\t" "punpcklwd $f20, $f24, $f24 \n\t" "mov.d $f22, $f20 \n\t" "dmtc1 %[iBeta], $f24 \n\t" "punpcklhw $f28, $f24, $f24 \n\t" "punpcklwd $f24, $f28, $f28 \n\t" "mov.d $f26, $f24 \n\t" "mov.d $f28, $f4 \n\t" "punpcklbh $f4, $f6, $f2 \n\t" "punpckhbh $f6, $f6, $f2 \n\t" "punpckhbh $f30, $f28, $f0 \n\t" "punpcklbh $f28, $f28, $f0 \n\t" "gssqc1 $f6, $f4, 0x40(%[tmp]) \n\t" "gssqc1 $f30, $f28, 0x60(%[tmp]) \n\t" "punpckhbh $f30, $f8, $f0 \n\t" "punpcklbh $f28, $f8, $f0 \n\t" "gssqc1 $f30, $f28, 0x10(%[tmp]) \n\t" "punpckhbh $f30, $f12, $f0 \n\t" "punpcklbh $f28, $f12, $f0 \n\t" "punpcklbh $f12, $f14, $f2 \n\t" "punpckhbh $f14, $f14, $f2 \n\t" "gssqc1 $f30, $f28, 0x50(%[tmp]) \n\t" "mov.d $f28, $f16 \n\t" "punpcklbh $f16, $f18, $f2 \n\t" "punpckhbh $f18, $f18, $f2 \n\t" "punpcklbh $f8, $f10, $f2 \n\t" "punpckhbh $f10, $f10, $f2 \n\t" "punpckhbh $f30, $f28, $f0 \n\t" "punpcklbh $f28, $f28, $f0 \n\t" "gssqc1 $f14, $f12, 0x30(%[tmp]) \n\t" "gslqc1 $f14, $f12, 0x10(%[tmp]) \n\t" "gslqc1 $f2, $f0, 0x50(%[tmp]) \n\t" "psubh $f4, $f12, $f0 \n\t" "psubh $f6, $f14, $f2 \n\t" WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2) "gssqc1 $f18, $f16, 0x20(%[tmp]) \n\t" "pcmpgth $f0, $f20, $f4 \n\t" "pcmpgth $f2, $f22, $f6 \n\t" "gslqc1 $f6, $f4, 0x60(%[tmp]) \n\t" "psubh $f4, $f4, $f12 \n\t" "psubh $f6, $f6, $f14 \n\t" WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18) "pcmpgth $f16, $f24, $f4 \n\t" "pcmpgth $f18, $f26, $f6 \n\t" "and $f0, $f0, $f16 \n\t" "and $f2, $f2, $f18 \n\t" "gslqc1 $f18, $f16, 0x50(%[tmp]) \n\t" "psubh $f4, $f28, $f16 \n\t" "psubh $f6, $f30, $f18 \n\t" 
WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18) "pcmpgth $f16, $f24, $f4 \n\t" "pcmpgth $f18, $f26, $f6 \n\t" "gslqc1 $f6, $f4, 0x30(%[tmp]) \n\t" "psubh $f4, $f8, $f4 \n\t" "psubh $f6, $f10, $f6 \n\t" "dmfc1 %[iAlpha], $f28 \n\t" "dmfc1 %[iBeta], $f30 \n\t" WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30) "pcmpgth $f20, $f20, $f4 \n\t" "pcmpgth $f22, $f22, $f6 \n\t" "gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t" "and $f0, $f0, $f16 \n\t" "and $f2, $f2, $f18 \n\t" "psubh $f4, $f4, $f8 \n\t" "psubh $f6, $f6, $f10 \n\t" WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18) "pcmpgth $f16, $f24, $f4 \n\t" "pcmpgth $f18, $f26, $f6 \n\t" "gslqc1 $f6, $f4, 0x20(%[tmp]) \n\t" "gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t" "psubh $f4, $f4, $f28 \n\t" "psubh $f6, $f6, $f30 \n\t" "and $f20, $f20, $f16 \n\t" "and $f22, $f22, $f18 \n\t" WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30) "dmtc1 %[iAlpha], $f28 \n\t" "dmtc1 %[iBeta], $f30 \n\t" "pcmpgth $f24, $f24, $f4 \n\t" "pcmpgth $f26, $f26, $f6 \n\t" "and $f20, $f20, $f24 \n\t" "and $f22, $f22, $f26 \n\t" "dli %[iBeta], 0x2 \n\t" "dmtc1 %[iBeta], $f4 \n\t" "punpcklhw $f16, $f4, $f4 \n\t" "punpcklwd $f4, $f16, $f16 \n\t" "mov.d $f6, $f4 \n\t" "gslqc1 $f18, $f16, 0x60(%[tmp]) \n\t" "paddh $f24, $f16, $f16 \n\t" "paddh $f26, $f18, $f18 \n\t" "paddh $f24, $f24, $f12 \n\t" "paddh $f26, $f26, $f14 \n\t" "paddh $f24, $f24, $f28 \n\t" "paddh $f26, $f26, $f30 \n\t" "gssqc1 $f6, $f4, 0x10(%[tmp]) \n\t" "gslqc1 $f18, $f16, 0x10(%[tmp]) \n\t" "paddh $f24, $f24, $f16 \n\t" "paddh $f26, $f26, $f18 \n\t" "dmtc1 %[iBeta], $f16 \n\t" "psrah $f24, $f24, $f16 \n\t" "psrah $f26, $f26, $f16 \n\t" "pandn $f16, $f0, $f12 \n\t" "pandn $f18, $f2, $f14 \n\t" "gslqc1 $f14, $f12, 0x40(%[tmp]) \n\t" "and $f4, $f0, $f24 \n\t" "and $f6, $f2, $f26 \n\t" "or $f4, $f4, $f16 \n\t" "or $f6, $f6, $f18 \n\t" "paddh $f24, $f12, $f12 \n\t" "paddh $f26, $f14, $f14 \n\t" "gslqc1 $f14, $f12, 0x10(%[tmp]) \n\t" "paddh $f24, $f24, $f8 \n\t" "paddh $f26, $f26, $f10 \n\t" "gslqc1 $f18, $f16, 0x20(%[tmp]) \n\t" "paddh 
$f24, $f24, $f16 \n\t" "paddh $f26, $f26, $f18 \n\t" "dmtc1 %[iBeta], $f16 \n\t" "paddh $f24, $f24, $f12 \n\t" "paddh $f26, $f26, $f14 \n\t" "psrah $f24, $f24, $f16 \n\t" "psrah $f26, $f26, $f16 \n\t" "and $f16, $f20, $f24 \n\t" "and $f18, $f22, $f26 \n\t" "pandn $f24, $f20, $f8 \n\t" "pandn $f26, $f22, $f10 \n\t" "or $f16, $f16, $f24 \n\t" "or $f18, $f18, $f26 \n\t" "packushb $f4, $f4, $f6 \n\t" "packushb $f6, $f16, $f18 \n\t" "gslqc1 $f18, $f16, 0x50(%[tmp]) \n\t" "paddh $f24, $f28, $f28 \n\t" "paddh $f26, $f30, $f30 \n\t" "paddh $f24, $f24, $f16 \n\t" "paddh $f26, $f26, $f18 \n\t" "gslqc1 $f10, $f8, 0x60(%[tmp]) \n\t" "paddh $f24, $f24, $f8 \n\t" "paddh $f26, $f26, $f10 \n\t" "dmtc1 %[iBeta], $f28 \n\t" "paddh $f24, $f24, $f12 \n\t" "paddh $f26, $f26, $f14 \n\t" "psrah $f24, $f24, $f28 \n\t" "psrah $f26, $f26, $f28 \n\t" "and $f8, $f0, $f24 \n\t" "and $f10, $f2, $f26 \n\t" "pandn $f0, $f0, $f16 \n\t" "pandn $f2, $f2, $f18 \n\t" "or $f8, $f8, $f0 \n\t" "or $f10, $f10, $f2 \n\t" "gslqc1 $f2, $f0, 0x20(%[tmp]) \n\t" "paddh $f24, $f0, $f0 \n\t" "paddh $f26, $f2, $f2 \n\t" "gslqc1 $f2, $f0, 0x30(%[tmp]) \n\t" "paddh $f24, $f24, $f0 \n\t" "paddh $f26, $f26, $f2 \n\t" "gslqc1 $f18, $f16, 0x40(%[tmp]) \n\t" "paddh $f24, $f24, $f16 \n\t" "paddh $f26, $f26, $f18 \n\t" "paddh $f24, $f24, $f12 \n\t" "paddh $f26, $f26, $f14 \n\t" "gssdxc1 $f4, 0x0($8, $0) \n\t" "psrah $f24, $f24, $f28 \n\t" "psrah $f26, $f26, $f28 \n\t" "and $f16, $f20, $f24 \n\t" "and $f18, $f22, $f26 \n\t" "pandn $f20, $f20, $f0 \n\t" "pandn $f22, $f22, $f2 \n\t" "or $f16, $f16, $f20 \n\t" "or $f18, $f18, $f22 \n\t" "packushb $f8, $f8, $f10 \n\t" "packushb $f10, $f16, $f18 \n\t" "gssdxc1 $f8, 0x0(%[pPixCb], $0) \n\t" "gssdxc1 $f6, 0x0($9, $0) \n\t" "gssdxc1 $f10, 0x0(%[pPixCr], $0) \n\t" : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr) : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp) : "memory", "$8", "$9", "$10", "$11", 
"$12", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30" ); RECOVER_REG; } void DeblockChromaEq4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) { unsigned char tmp[256] __attribute__((aligned(32))); BACKUP_REG; __asm__ volatile ( ".set arch=loongson3a \n\t" "daddiu %[pPixCb], %[pPixCb], -0x2 \n\t" "daddiu %[pPixCr], %[pPixCr], -0x2 \n\t" "move $9, %[pPixCb] \n\t" "move $10, %[pPixCr] \n\t" "dsll $11, %[iStride], 0x2 \n\t" "daddu %[pPixCb], %[pPixCb], $11 \n\t" "daddu %[pPixCr], %[pPixCr], $11 \n\t" "daddiu $11, %[tmp], 0x80 \n\t" "gsldlc1 $f0, 0x7($9) \n\t" "gsldrc1 $f0, 0x0($9) \n\t" "daddu $12, $9, %[iStride] \n\t" "gsldlc1 $f4, 0x7($12) \n\t" "gsldrc1 $f4, 0x0($12) \n\t" "daddu $12, $12, %[iStride] \n\t" "gsldlc1 $f8, 0x7($12) \n\t" "gsldrc1 $f8, 0x0($12) \n\t" "daddu $12, $12, %[iStride] \n\t" "gsldlc1 $f12, 0x7($12) \n\t" "gsldlc1 $f16, 0x7($10) \n\t" "gsldrc1 $f12, 0x0($12) \n\t" "gsldrc1 $f16, 0x0($10) \n\t" "daddu $12, $10, %[iStride] \n\t" "gsldlc1 $f20, 0x7($12) \n\t" "gsldrc1 $f20, 0x0($12) \n\t" "daddu $12, $12, %[iStride] \n\t" "gsldlc1 $f24, 0x7($12) \n\t" "gsldrc1 $f24, 0x0($12) \n\t" "daddu $12, $12, %[iStride] \n\t" "gsldlc1 $f28, 0x7($12) \n\t" "gsldrc1 $f28, 0x0($12) \n\t" "punpcklwd $f0, $f0, $f16 \n\t" "punpcklwd $f4, $f4, $f20 \n\t" "punpcklwd $f8, $f8, $f24 \n\t" "punpcklwd $f12, $f12, $f28 \n\t" "gsldlc1 $f16, 0x7(%[pPixCb]) \n\t" "gsldlc1 $f20, 0x7(%[pPixCr]) \n\t" "gsldrc1 $f16, 0x0(%[pPixCb]) \n\t" "gsldrc1 $f20, 0x0(%[pPixCr]) \n\t" "punpcklwd $f16, $f16, $f20 \n\t" "mov.d $f2, $f16 \n\t" "daddu $12, %[pPixCb], %[iStride] \n\t" "daddu $13, %[pPixCr], %[iStride] \n\t" "gsldlc1 $f16, 0x7($12) \n\t" "gsldlc1 $f20, 0x7($13) \n\t" "gsldrc1 $f16, 0x0($12) \n\t" "gsldrc1 $f20, 0x0($13) \n\t" "punpcklwd $f16, $f16, $f20 \n\t" "mov.d $f6, $f16 \n\t" "daddu $12, $12, %[iStride] \n\t" "daddu $13, $13, %[iStride] \n\t" "gsldlc1 $f16, 
0x7($12) \n\t" "gsldlc1 $f20, 0x7($13) \n\t" "gsldrc1 $f16, 0x0($12) \n\t" "gsldrc1 $f20, 0x0($13) \n\t" "punpcklwd $f16, $f16, $f20 \n\t" "mov.d $f10, $f16 \n\t" "daddu $12, $12, %[iStride] \n\t" "daddu $13, $13, %[iStride] \n\t" "gsldlc1 $f16, 0x7($12) \n\t" "gsldlc1 $f20, 0x7($13) \n\t" "gsldrc1 $f16, 0x0($12) \n\t" "gsldrc1 $f20, 0x0($13) \n\t" "punpcklwd $f16, $f16, $f20 \n\t" "mov.d $f14, $f16 \n\t" "punpcklbh $f24, $f2, $f6 \n\t" "punpckhbh $f26, $f2, $f6 \n\t" "punpckhbh $f2, $f0, $f4 \n\t" "punpcklbh $f0, $f0, $f4 \n\t" "punpcklbh $f28, $f10, $f14 \n\t" "punpckhbh $f30, $f10, $f14 \n\t" "punpckhbh $f10, $f8, $f12 \n\t" "punpcklbh $f8, $f8, $f12 \n\t" "punpcklhw $f16, $f2, $f10 \n\t" "punpckhhw $f18, $f2, $f10 \n\t" "punpckhhw $f2, $f0, $f8 \n\t" "punpcklhw $f0, $f0, $f8 \n\t" "punpcklhw $f20, $f26, $f30 \n\t" "punpckhhw $f22, $f26, $f30 \n\t" "punpckhhw $f26, $f24, $f28 \n\t" "punpcklhw $f24, $f24, $f28 \n\t" "punpcklwd $f4, $f2, $f26 \n\t" "punpckhwd $f6, $f2, $f26 \n\t" "punpckhwd $f2, $f0, $f24 \n\t" "punpcklwd $f0, $f0, $f24 \n\t" "punpcklwd $f8, $f18, $f22 \n\t" "punpckhwd $f10, $f18, $f22 \n\t" "punpckhwd $f18, $f16, $f20 \n\t" "punpcklwd $f16, $f16, $f20 \n\t" "mov.d $f20, $f2 \n\t" "mov.d $f22, $f18 \n\t" "mov.d $f2, $f16 \n\t" "mov.d $f24, $f6 \n\t" "mov.d $f26, $f10 \n\t" "mov.d $f6, $f8 \n\t" "gssqc1 $f2, $f0, 0x0($11) \n\t" "gssqc1 $f22, $f20, 0x10($11) \n\t" "gssqc1 $f6, $f4, 0x20($11) \n\t" "gssqc1 $f26, $f24, 0x30($11) \n\t" "gslqc1 $f26, $f24, 0x80(%[tmp]) \n\t" "gslqc1 $f18, $f16, 0x90(%[tmp]) \n\t" "gslqc1 $f22, $f20, 0xa0(%[tmp]) \n\t" "gslqc1 $f30, $f28, 0xb0(%[tmp]) \n\t" "xor $f0, $f0, $f0 \n\t" "dmtc1 %[iAlpha], $f4 \n\t" "punpcklhw $f8, $f4, $f4 \n\t" "punpcklwd $f4, $f8, $f8 \n\t" "mov.d $f6, $f4 \n\t" "dmtc1 %[iBeta], $f8 \n\t" "punpcklhw $f12, $f8, $f8 \n\t" "punpcklwd $f8, $f12, $f12 \n\t" "mov.d $f10, $f8 \n\t" "mov.d $f12, $f24 \n\t" "punpcklbh $f24, $f26, $f0 \n\t" "punpckhbh $f26, $f26, $f0 \n\t" "gssqc1 $f26, $f24, 
0x60(%[tmp]) \n\t" "gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t" "punpcklbh $f24, $f26, $f0 \n\t" "punpckhbh $f26, $f26, $f0 \n\t" "gssqc1 $f26, $f24, 0x30(%[tmp]) \n\t" "gslqc1 $f26, $f24, 0xa0(%[tmp]) \n\t" "punpcklbh $f24, $f26, $f0 \n\t" "punpckhbh $f26, $f26, $f0 \n\t" "gssqc1 $f26, $f24, 0x40(%[tmp]) \n\t" "gslqc1 $f26, $f24, 0xb0(%[tmp]) \n\t" "punpcklbh $f24, $f26, $f0 \n\t" "punpckhbh $f26, $f26, $f0 \n\t" "gssqc1 $f26, $f24, 0x70(%[tmp]) \n\t" "punpckhbh $f30, $f28, $f0 \n\t" "punpcklbh $f28, $f28, $f0 \n\t" "punpckhbh $f18, $f16, $f0 \n\t" "punpcklbh $f16, $f16, $f0 \n\t" "punpckhbh $f22, $f20, $f0 \n\t" "punpcklbh $f20, $f20, $f0 \n\t" "punpckhbh $f14, $f12, $f0 \n\t" "punpcklbh $f12, $f12, $f0 \n\t" "gssqc1 $f30, $f28, 0x50(%[tmp]) \n\t" "psubh $f24, $f16, $f20 \n\t" "psubh $f26, $f18, $f22 \n\t" WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2) "pcmpgth $f0, $f4, $f24 \n\t" "pcmpgth $f2, $f6, $f26 \n\t" "psubh $f24, $f12, $f16 \n\t" "psubh $f26, $f14, $f18 \n\t" WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30) "pcmpgth $f28, $f8, $f24 \n\t" "pcmpgth $f30, $f10, $f26 \n\t" "gslqc1 $f26, $f24, 0x50(%[tmp]) \n\t" "psubh $f24, $f24, $f20 \n\t" "psubh $f26, $f26, $f22 \n\t" "and $f0, $f0, $f28 \n\t" "and $f2, $f2, $f30 \n\t" WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30) "dmfc1 %[iAlpha], $f20 \n\t" "dmfc1 %[iBeta], $f22 \n\t" "pcmpgth $f28, $f8, $f24 \n\t" "pcmpgth $f30, $f10, $f26 \n\t" "gslqc1 $f26, $f24, 0x30(%[tmp]) \n\t" "gslqc1 $f22, $f20, 0x40(%[tmp]) \n\t" "psubh $f24, $f24, $f20 \n\t" "psubh $f26, $f26, $f22 \n\t" WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22) "pcmpgth $f4, $f4, $f24 \n\t" "pcmpgth $f6, $f6, $f26 \n\t" "gslqc1 $f26, $f24, 0x60(%[tmp]) \n\t" "gslqc1 $f22, $f20, 0x30(%[tmp]) \n\t" "psubh $f24, $f24, $f20 \n\t" "psubh $f26, $f26, $f22 \n\t" WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22) "and $f0, $f0, $f28 \n\t" "and $f2, $f2, $f30 \n\t" "pcmpgth $f28, $f8, $f24 \n\t" "pcmpgth $f30, $f10, $f26 \n\t" "gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t" "gslqc1 $f22, 
$f20, 0x40(%[tmp]) \n\t" "psubh $f24, $f24, $f20 \n\t" "psubh $f26, $f26, $f22 \n\t" WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22) "dli $8, 0x2 \n\t" "and $f4, $f4, $f28 \n\t" "and $f6, $f6, $f30 \n\t" "pcmpgth $f8, $f8, $f24 \n\t" "pcmpgth $f10, $f10, $f26 \n\t" "and $f4, $f4, $f8 \n\t" "and $f6, $f6, $f10 \n\t" "dmtc1 $8, $f8 \n\t" "punpcklhw $f24, $f8, $f8 \n\t" "punpcklwd $f8, $f24, $f24 \n\t" "mov.d $f10, $f8 \n\t" "gssqc1 $f10, $f8, 0x20(%[tmp]) \n\t" "paddh $f8, $f12, $f12 \n\t" "paddh $f10, $f14, $f14 \n\t" "paddh $f8, $f8, $f16 \n\t" "paddh $f10, $f10, $f18 \n\t" "gslqc1 $f22, $f20, 0x50(%[tmp]) \n\t" "paddh $f8, $f8, $f20 \n\t" "paddh $f10, $f10, $f22 \n\t" "gslqc1 $f26, $f24, 0x20(%[tmp]) \n\t" "paddh $f8, $f8, $f24 \n\t" "paddh $f10, $f10, $f26 \n\t" "dmtc1 $8, $f20 \n\t" "psrah $f8, $f8, $f20 \n\t" "psrah $f10, $f10, $f20 \n\t" "and $f24, $f0, $f8 \n\t" "and $f26, $f2, $f10 \n\t" "pandn $f8, $f0, $f16 \n\t" "pandn $f10, $f2, $f18 \n\t" "or $f24, $f24, $f8 \n\t" "or $f26, $f26, $f10 \n\t" "gslqc1 $f10, $f8, 0x60(%[tmp]) \n\t" "paddh $f28, $f8, $f8 \n\t" "paddh $f30, $f10, $f10 \n\t" "gslqc1 $f22, $f20, 0x30(%[tmp]) \n\t" "paddh $f28, $f28, $f20 \n\t" "paddh $f30, $f30, $f22 \n\t" "gslqc1 $f18, $f16, 0x70(%[tmp]) \n\t" "paddh $f28, $f28, $f16 \n\t" "paddh $f30, $f30, $f18 \n\t" "gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t" "paddh $f28, $f28, $f8 \n\t" "paddh $f30, $f30, $f10 \n\t" "pandn $f8, $f4, $f20 \n\t" "pandn $f10, $f6, $f22 \n\t" "dmtc1 $8, $f20 \n\t" "psrah $f28, $f28, $f20 \n\t" "psrah $f30, $f30, $f20 \n\t" "and $f16, $f4, $f28 \n\t" "and $f18, $f6, $f30 \n\t" "or $f16, $f16, $f8 \n\t" "or $f18, $f18, $f10 \n\t" "gslqc1 $f10, $f8, 0x50(%[tmp]) \n\t" "packushb $f24, $f24, $f26 \n\t" "packushb $f26, $f16, $f18 \n\t" "gssqc1 $f26, $f24, 0x90(%[tmp]) \n\t" "paddh $f24, $f8, $f8 \n\t" "paddh $f26, $f10, $f10 \n\t" "dmtc1 %[iAlpha], $f20 \n\t" "dmtc1 %[iBeta], $f22 \n\t" "gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t" "paddh $f24, $f24, $f20 \n\t" "paddh $f26, $f26, 
$f22 \n\t" "paddh $f24, $f24, $f12 \n\t" "paddh $f26, $f26, $f14 \n\t" "mov.d $f16, $f0 \n\t" "mov.d $f18, $f2 \n\t" "pandn $f0, $f0, $f20 \n\t" "pandn $f2, $f2, $f22 \n\t" "dmtc1 $8, $f20 \n\t" "paddh $f24, $f24, $f8 \n\t" "paddh $f26, $f26, $f10 \n\t" "psrah $f24, $f24, $f20 \n\t" "psrah $f26, $f26, $f20 \n\t" "and $f16, $f16, $f24 \n\t" "and $f18, $f18, $f26 \n\t" "or $f16, $f16, $f0 \n\t" "or $f18, $f18, $f2 \n\t" "gslqc1 $f2, $f0, 0x70(%[tmp]) \n\t" "paddh $f20, $f0, $f0 \n\t" "paddh $f22, $f2, $f2 \n\t" "gslqc1 $f2, $f0, 0x40(%[tmp]) \n\t" "paddh $f20, $f20, $f0 \n\t" "paddh $f22, $f22, $f2 \n\t" "gslqc1 $f14, $f12, 0x60(%[tmp]) \n\t" "paddh $f20, $f20, $f12 \n\t" "paddh $f22, $f22, $f14 \n\t" "paddh $f20, $f20, $f8 \n\t" "paddh $f22, $f22, $f10 \n\t" "dmtc1 $8, $f8 \n\t" "psrah $f20, $f20, $f8 \n\t" "psrah $f22, $f22, $f8 \n\t" "and $f12, $f4, $f20 \n\t" "and $f14, $f6, $f22 \n\t" "pandn $f4, $f4, $f0 \n\t" "pandn $f6, $f6, $f2 \n\t" "or $f12, $f12, $f4 \n\t" "or $f14, $f14, $f6 \n\t" "packushb $f16, $f16, $f18 \n\t" "packushb $f18, $f12, $f14 \n\t" "gssqc1 $f18, $f16, 0xa0(%[tmp]) \n\t" "gslqc1 $f2, $f0, 0x0($11) \n\t" "gslqc1 $f6, $f4, 0x10($11) \n\t" "gslqc1 $f10, $f8, 0x20($11) \n\t" "gslqc1 $f14, $f12, 0x30($11) \n\t" "mov.d $f26, $f2 \n\t" "punpckhbh $f2, $f0, $f4 \n\t" "punpcklbh $f0, $f0, $f4 \n\t" "punpcklbh $f24, $f26, $f6 \n\t" "punpckhbh $f26, $f26, $f6 \n\t" "mov.d $f30, $f10 \n\t" "punpckhbh $f10, $f8, $f12 \n\t" "punpcklbh $f8, $f8, $f12 \n\t" "punpcklbh $f28, $f30, $f14 \n\t" "punpckhbh $f30, $f30, $f14 \n\t" "punpcklhw $f16, $f2, $f10 \n\t" "punpckhhw $f18, $f2, $f10 \n\t" "punpcklhw $f20, $f26, $f30 \n\t" "punpckhhw $f22, $f26, $f30 \n\t" "punpckhhw $f2, $f0, $f8 \n\t" "punpcklhw $f0, $f0, $f8 \n\t" "punpckhhw $f26, $f24, $f28 \n\t" "punpcklhw $f24, $f24, $f28 \n\t" "punpcklwd $f4, $f2, $f26 \n\t" "punpckhwd $f6, $f2, $f26 \n\t" "punpcklwd $f8, $f18, $f22 \n\t" "punpckhwd $f10, $f18, $f22 \n\t" "punpckhwd $f2, $f0, $f24 \n\t" "punpcklwd 
$f0, $f0, $f24 \n\t" "punpckhwd $f18, $f16, $f20 \n\t" "punpcklwd $f16, $f16, $f20 \n\t" "mov.d $f20, $f2 \n\t" "mov.d $f24, $f6 \n\t" "mov.d $f2, $f16 \n\t" "mov.d $f22, $f18 \n\t" "mov.d $f6, $f8 \n\t" "mov.d $f26, $f10 \n\t" "dli %[iAlpha], 0x20 \n\t" "dmtc1 %[iAlpha], $f8 \n\t" "gsswlc1 $f0, 0x3($9) \n\t" "gsswrc1 $f0, 0x0($9) \n\t" "daddu $12, $9, %[iStride] \n\t" "gsswlc1 $f20, 0x3($12) \n\t" "gsswrc1 $f20, 0x0($12) \n\t" "daddu $12, $12, %[iStride] \n\t" "gsswlc1 $f4, 0x3($12) \n\t" "gsswrc1 $f4, 0x0($12) \n\t" "daddu $12, $12, %[iStride] \n\t" "gsswlc1 $f24, 0x3($12) \n\t" "gsswrc1 $f24, 0x0($12) \n\t" "dsrl $f0, $f0, $f8 \n\t" "dsrl $f20, $f20, $f8 \n\t" "dsrl $f4, $f4, $f8 \n\t" "dsrl $f24, $f24, $f8 \n\t" "gsswlc1 $f0, 0x3($10) \n\t" "gsswrc1 $f0, 0x0($10) \n\t" "daddu $13, $10, %[iStride] \n\t" "daddu $8, $13, %[iStride] \n\t" "gsswlc1 $f20, 0x3($13) \n\t" "gsswrc1 $f20, 0x0($13) \n\t" "daddu $13, $8, %[iStride] \n\t" "gsswlc1 $f4, 0x3($8) \n\t" "gsswrc1 $f4, 0x0($8) \n\t" "gsswlc1 $f24, 0x3($13) \n\t" "gsswrc1 $f24, 0x0($13) \n\t" "gsswlc1 $f2, 0x3(%[pPixCb]) \n\t" "gsswrc1 $f2, 0x0(%[pPixCb]) \n\t" "daddu $12, %[pPixCb], %[iStride] \n\t" "gsswlc1 $f22, 0x3($12) \n\t" "gsswrc1 $f22, 0x0($12) \n\t" "daddu $12, $12, %[iStride] \n\t" "gsswlc1 $f6, 0x3($12) \n\t" "gsswrc1 $f6, 0x0($12) \n\t" "daddu $12, $12, %[iStride] \n\t" "gsswlc1 $f26, 0x3($12) \n\t" "gsswrc1 $f26, 0x0($12) \n\t" "dsrl $f2, $f2, $f8 \n\t" "dsrl $f22, $f22, $f8 \n\t" "dsrl $f6, $f6, $f8 \n\t" "dsrl $f26, $f26, $f8 \n\t" "gsswlc1 $f2, 0x3(%[pPixCr]) \n\t" "gsswrc1 $f2, 0x0(%[pPixCr]) \n\t" "daddu $13, %[pPixCr], %[iStride] \n\t" "daddu $8, $13, %[iStride] \n\t" "gsswlc1 $f22, 0x3($13) \n\t" "gsswrc1 $f22, 0x0($13) \n\t" "daddu $13, $8, %[iStride] \n\t" "gsswlc1 $f6, 0x3($8) \n\t" "gsswrc1 $f6, 0x0($8) \n\t" "gsswlc1 $f26, 0x3($13) \n\t" "gsswrc1 $f26, 0x0($13) \n\t" : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr) : [iStride]"r"((int)iStride), 
[iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp) : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30" ); RECOVER_REG; } void DeblockChromaLt4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTC) { unsigned char tmp[320] __attribute__((aligned(32))); BACKUP_REG; __asm__ volatile ( ".set arch=loongson3a \n\t" "daddiu %[pPixCb], %[pPixCb], -0x2 \n\t" "daddiu %[pPixCr], %[pPixCr], -0x2 \n\t" "daddu $8, %[pPixCb], %[iStride] \n\t" "gsldlc1 $f0, 0x7(%[pPixCb]) \n\t" "gsldlc1 $f4, 0x7($8) \n\t" "gsldrc1 $f0, 0x0(%[pPixCb]) \n\t" "gsldrc1 $f4, 0x0($8) \n\t" "daddu $9, $8, %[iStride] \n\t" "daddu $8, $9, %[iStride] \n\t" "gsldlc1 $f8, 0x7($9) \n\t" "gsldlc1 $f12, 0x7($8) \n\t" "gsldrc1 $f8, 0x0($9) \n\t" "gsldrc1 $f12, 0x0($8) \n\t" "daddu $9, $8, %[iStride] \n\t" "daddu $10, %[pPixCr], %[iStride] \n\t" "gsldlc1 $f16, 0x7(%[pPixCr]) \n\t" "gsldlc1 $f20, 0x7($10) \n\t" "gsldrc1 $f16, 0x0(%[pPixCr]) \n\t" "gsldrc1 $f20, 0x0($10) \n\t" "daddu $11, $10, %[iStride] \n\t" "daddu $10, $11, %[iStride] \n\t" "gsldlc1 $f24, 0x7($11) \n\t" "gsldlc1 $f28, 0x7($10) \n\t" "gsldrc1 $f24, 0x0($11) \n\t" "gsldrc1 $f28, 0x0($10) \n\t" "daddu $11, $10, %[iStride] \n\t" "punpcklwd $f0, $f0, $f16 \n\t" "punpcklwd $f4, $f4, $f20 \n\t" "punpcklwd $f8, $f8, $f24 \n\t" "punpcklwd $f12, $f12, $f28 \n\t" "gsldlc1 $f16, 0x7($9) \n\t" "gsldlc1 $f20, 0x7($11) \n\t" "gsldrc1 $f16, 0x0($9) \n\t" "gsldrc1 $f20, 0x0($11) \n\t" "punpcklwd $f16, $f16, $f20 \n\t" "mov.d $f2, $f16 \n\t" "daddu $8, $9, %[iStride] \n\t" "daddu $10, $11, %[iStride] \n\t" "gsldlc1 $f16, 0x7($8) \n\t" "gsldlc1 $f20, 0x7($10) \n\t" "gsldrc1 $f16, 0x0($8) \n\t" "gsldrc1 $f20, 0x0($10) \n\t" "punpcklwd $f16, $f16, $f20 \n\t" "mov.d $f6, $f16 \n\t" "daddu $9, $8, %[iStride] \n\t" "daddu $11, $10, %[iStride] \n\t" "gsldlc1 $f16, 0x7($9) \n\t" "gsldlc1 
$f20, 0x7($11) \n\t" "gsldrc1 $f16, 0x0($9) \n\t" "gsldrc1 $f20, 0x0($11) \n\t" "punpcklwd $f16, $f16, $f20 \n\t" "mov.d $f10, $f16 \n\t" "daddu $8, $9, %[iStride] \n\t" "daddu $10, $11, %[iStride] \n\t" "gsldlc1 $f16, 0x7($8) \n\t" "gsldlc1 $f20, 0x7($10) \n\t" "gsldrc1 $f16, 0x0($8) \n\t" "gsldrc1 $f20, 0x0($10) \n\t" "punpcklwd $f16, $f16, $f20 \n\t" "mov.d $f14, $f16 \n\t" "punpcklbh $f24, $f2, $f6 \n\t" "punpckhbh $f26, $f2, $f6 \n\t" "punpckhbh $f2, $f0, $f4 \n\t" "punpcklbh $f0, $f0, $f4 \n\t" "punpcklbh $f28, $f10, $f14 \n\t" "punpckhbh $f30, $f10, $f14 \n\t" "punpckhbh $f10, $f8, $f12 \n\t" "punpcklbh $f8, $f8, $f12 \n\t" "punpcklhw $f16, $f2, $f10 \n\t" "punpckhhw $f18, $f2, $f10 \n\t" "punpckhhw $f2, $f0, $f8 \n\t" "punpcklhw $f0, $f0, $f8 \n\t" "punpcklhw $f20, $f26, $f30 \n\t" "punpckhhw $f22, $f26, $f30 \n\t" "punpckhhw $f26, $f24, $f28 \n\t" "punpcklhw $f24, $f24, $f28 \n\t" "punpcklwd $f4, $f2, $f26 \n\t" "punpckhwd $f6, $f2, $f26 \n\t" "punpckhwd $f2, $f0, $f24 \n\t" "punpcklwd $f0, $f0, $f24 \n\t" "punpcklwd $f8, $f18, $f22 \n\t" "punpckhwd $f10, $f18, $f22 \n\t" "punpckhwd $f18, $f16, $f20 \n\t" "punpcklwd $f16, $f16, $f20 \n\t" "mov.d $f20, $f2 \n\t" "mov.d $f22, $f18 \n\t" "mov.d $f2, $f16 \n\t" "mov.d $f24, $f6 \n\t" "mov.d $f26, $f10 \n\t" "mov.d $f6, $f8 \n\t" "daddiu $11, %[tmp], 0x70 \n\t" "gssqc1 $f2, $f0, 0x0($11) \n\t" "gssqc1 $f22, $f20, 0x10($11) \n\t" "gssqc1 $f6, $f4, 0x20($11) \n\t" "gssqc1 $f26, $f24, 0x30($11) \n\t" "lb $8, 0x3(%[pTC]) \n\t" "lb $9, 0x2(%[pTC]) \n\t" "lb $10, 0x1(%[pTC]) \n\t" "lb $11, 0x0(%[pTC]) \n\t" "and $12, $8, 0xFFFF \n\t" "dmtc1 $12, $f8 \n\t" "and $9, $9, 0xFFFF \n\t" "dmtc1 $9, $f12 \n\t" "mov.d $f16, $f12 \n\t" "and $9, $10, 0xFFFF \n\t" "dmtc1 $9, $f20 \n\t" "xor $f0, $f0, $f0 \n\t" "mov.d $f24, $f20 \n\t" "and $9, $11, 0xFFFF \n\t" "punpcklhw $f24, $f24, $f8 \n\t" "mov.d $f4, $f8 \n\t" "dmtc1 $9, $f28 \n\t" "mov.d $f0, $f28 \n\t" "punpcklhw $f28, $f28, $f12 \n\t" "punpcklhw $f20, $f20, $f4 \n\t" "xor 
$f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "punpcklhw $f28, $f28, $f20 \n\t" "gslqc1 $f22, $f20, 0xA0(%[tmp]) \n\t" "punpcklhw $f0, $f0, $f16 \n\t" "punpcklhw $f0, $f0, $f24 \n\t" "gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t" "punpckhhw $f2, $f0, $f28 \n\t" "punpcklhw $f0, $f0, $f28 \n\t" "gslqc1 $f30, $f28, 0x80(%[tmp]) \n\t" "psubh $f8, $f4, $f0 \n\t" "psubh $f10, $f6, $f2 \n\t" "gssqc1 $f10, $f8, 0xD0(%[tmp]) \n\t" "dmtc1 %[iAlpha], $f8 \n\t" "punpcklhw $f12, $f8, $f8 \n\t" "punpcklwd $f16, $f12, $f12 \n\t" "mov.d $f18, $f16 \n\t" "dmtc1 %[iBeta], $f8 \n\t" "punpcklhw $f12, $f8, $f8 \n\t" "punpcklwd $f8, $f12, $f12 \n\t" "mov.d $f10, $f8 \n\t" "gslqc1 $f14, $f12, 0x90(%[tmp]) \n\t" "gssqc1 $f10, $f8, 0x50(%[tmp]) \n\t" "punpckhbh $f10, $f24, $f4 \n\t" "punpcklbh $f8, $f24, $f4 \n\t" "punpcklbh $f24, $f26, $f6 \n\t" "punpckhbh $f26, $f26, $f6 \n\t" "gssqc1 $f10, $f8, 0x40(%[tmp]) \n\t" "gssqc1 $f26, $f24, 0xB0(%[tmp]) \n\t" "gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t" "punpcklbh $f8, $f28, $f4 \n\t" "punpckhbh $f10, $f28, $f4 \n\t" "punpcklbh $f28, $f30, $f6 \n\t" "punpckhbh $f30, $f30, $f6 \n\t" "punpcklbh $f24, $f26, $f6 \n\t" "punpckhbh $f26, $f26, $f6 \n\t" "punpckhbh $f14, $f12, $f4 \n\t" "punpcklbh $f12, $f12, $f4 \n\t" "punpckhbh $f22, $f20, $f4 \n\t" "punpcklbh $f20, $f20, $f4 \n\t" "gssqc1 $f30, $f28, 0xF0(%[tmp]) \n\t" "gssqc1 $f26, $f24, 0xC0(%[tmp]) \n\t" "gslqc1 $f26, $f24, 0xA0(%[tmp]) \n\t" "punpcklbh $f24, $f26, $f6 \n\t" "punpckhbh $f26, $f26, $f6 \n\t" "dli $13, 0x4 \n\t" "gssqc1 $f26, $f24, 0xE0(%[tmp]) \n\t" "dmtc1 $13, $f24 \n\t" "punpcklhw $f28, $f24, $f24 \n\t" "punpcklwd $f24, $f28, $f28 \n\t" "mov.d $f26, $f24 \n\t" "dli $12, 0x2 \n\t" "dli $13, 0x3 \n\t" "gssqc1 $f2, $f0, 0x20(%[tmp]) \n\t" "dmfc1 %[iAlpha], $f0 \n\t" "dmfc1 %[iBeta], $f2 \n\t" "gssqc1 $f26, $f24, 0x30(%[tmp]) \n\t" "gslqc1 $f30, $f28, 0x40(%[tmp]) \n\t" "psubh $f28, $f28, $f20 \n\t" "psubh $f30, $f30, $f22 \n\t" "pcmpgth $f24, $f0, $f4 \n\t" "pcmpgth $f26, $f2, $f6 \n\t" "dmtc1 
$12, $f0 \n\t" "dmtc1 $13, $f2 \n\t" "gssqc1 $f26, $f24, 0x60(%[tmp]) \n\t" "gslqc1 $f6, $f4, 0xD0(%[tmp]) \n\t" "psubh $f24, $f12, $f8 \n\t" "psubh $f26, $f14, $f10 \n\t" "psllh $f24, $f24, $f0 \n\t" "psllh $f26, $f26, $f0 \n\t" "paddh $f24, $f24, $f28 \n\t" "paddh $f26, $f26, $f30 \n\t" "gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t" "paddh $f24, $f24, $f28 \n\t" "paddh $f26, $f26, $f30 \n\t" "psrah $f24, $f24, $f2 \n\t" "psrah $f26, $f26, $f2 \n\t" "pmaxsh $f4, $f4, $f24 \n\t" "pmaxsh $f6, $f6, $f26 \n\t" "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t" "gslqc1 $f26, $f24, 0x20(%[tmp]) \n\t" "pminsh $f24, $f24, $f4 \n\t" "pminsh $f26, $f26, $f6 \n\t" "gssqc1 $f26, $f24, 0x20(%[tmp]) \n\t" "psubh $f4, $f8, $f12 \n\t" "psubh $f6, $f10, $f14 \n\t" WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2) "pcmpgth $f24, $f16, $f4 \n\t" "pcmpgth $f26, $f18, $f6 \n\t" "gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t" "psubh $f4, $f4, $f8 \n\t" "psubh $f6, $f6, $f10 \n\t" WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2) "pcmpgth $f28, $f28, $f4 \n\t" "pcmpgth $f30, $f30, $f6 \n\t" "gslqc1 $f6, $f4, 0x50(%[tmp]) \n\t" "and $f24, $f24, $f28 \n\t" "and $f26, $f26, $f30 \n\t" "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t" "psubh $f20, $f20, $f12 \n\t" "psubh $f22, $f22, $f14 \n\t" WELS_AbsH($f20, $f22, $f20, $f22, $f0, $f2) "pcmpgth $f4, $f4, $f20 \n\t" "pcmpgth $f6, $f6, $f22 \n\t" "gslqc1 $f22, $f20, 0xB0(%[tmp]) \n\t" "gslqc1 $f2, $f0, 0xE0(%[tmp]) \n\t" "psubh $f20, $f20, $f0 \n\t" "psubh $f22, $f22, $f2 \n\t" "and $f24, $f24, $f4 \n\t" "and $f26, $f26, $f6 \n\t" "gslqc1 $f2, $f0, 0x60(%[tmp]) \n\t" "and $f24, $f24, $f0 \n\t" "and $f26, $f26, $f2 \n\t" "gslqc1 $f6, $f4, 0x20(%[tmp]) \n\t" "and $f4, $f4, $f24 \n\t" "and $f6, $f6, $f26 \n\t" "gslqc1 $f26, $f24, 0xC0(%[tmp]) \n\t" "gssqc1 $f6, $f4, 0x40(%[tmp]) \n\t" "gslqc1 $f6, $f4, 0xF0(%[tmp]) \n\t" "dmtc1 $12, $f0 \n\t" "psubh $f24, $f24, $f4 \n\t" "psubh $f26, $f26, $f6 \n\t" "psllh $f24, $f24, $f0 \n\t" "psllh $f26, $f26, $f0 \n\t" "paddh $f24, $f24, $f20 \n\t" "paddh $f26, $f26, 
$f22 \n\t" "gslqc1 $f2, $f0, 0x30(%[tmp]) \n\t" "paddh $f24, $f24, $f0 \n\t" "paddh $f26, $f26, $f2 \n\t" "dmtc1 %[iBeta], $f2 \n\t" "dmtc1 $13, $f0 \n\t" "gslqc1 $f22, $f20, 0xD0(%[tmp]) \n\t" "psrah $f24, $f24, $f0 \n\t" "psrah $f26, $f26, $f0 \n\t" "dmtc1 %[iAlpha], $f0 \n\t" "pmaxsh $f20, $f20, $f24 \n\t" "pmaxsh $f22, $f22, $f26 \n\t" "pminsh $f0, $f0, $f20 \n\t" "pminsh $f2, $f2, $f22 \n\t" "dmfc1 %[iAlpha], $f0 \n\t" "dmfc1 %[iBeta], $f2 \n\t" "gslqc1 $f22, $f20, 0xC0(%[tmp]) \n\t" "psubh $f24, $f4, $f20 \n\t" "psubh $f26, $f6, $f22 \n\t" WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2) "pcmpgth $f16, $f16, $f24 \n\t" "pcmpgth $f18, $f18, $f26 \n\t" "gslqc1 $f26, $f24, 0xB0(%[tmp]) \n\t" "psubh $f24, $f24, $f4 \n\t" "psubh $f26, $f26, $f6 \n\t" WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2) "pcmpgth $f28, $f28, $f24 \n\t" "pcmpgth $f30, $f30, $f26 \n\t" "gslqc1 $f26, $f24, 0xE0(%[tmp]) \n\t" "and $f16, $f16, $f28 \n\t" "and $f18, $f18, $f30 \n\t" "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t" "psubh $f24, $f24, $f20 \n\t" "psubh $f26, $f26, $f22 \n\t" WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2) "pcmpgth $f28, $f28, $f24 \n\t" "pcmpgth $f30, $f30, $f26 \n\t" "and $f16, $f16, $f28 \n\t" "and $f18, $f18, $f30 \n\t" "gslqc1 $f30, $f28, 0x60(%[tmp]) \n\t" "dmtc1 %[iAlpha], $f0 \n\t" "dmtc1 %[iBeta], $f2 \n\t" "and $f16, $f16, $f28 \n\t" "and $f18, $f18, $f30 \n\t" "and $f0, $f0, $f16 \n\t" "and $f2, $f2, $f18 \n\t" "gslqc1 $f18, $f16, 0x40(%[tmp]) \n\t" "paddh $f8, $f8, $f16 \n\t" "paddh $f10, $f10, $f18 \n\t" "paddh $f4, $f4, $f0 \n\t" "paddh $f6, $f6, $f2 \n\t" "psubh $f12, $f12, $f16 \n\t" "psubh $f14, $f14, $f18 \n\t" "psubh $f20, $f20, $f0 \n\t" "psubh $f22, $f22, $f2 \n\t" "packushb $f8, $f8, $f10 \n\t" "packushb $f10, $f4, $f6 \n\t" "packushb $f12, $f12, $f14 \n\t" "packushb $f14, $f20, $f22 \n\t" "gssqc1 $f10, $f8, 0x80(%[tmp]) \n\t" "gssqc1 $f14, $f12, 0x90(%[tmp]) \n\t" "daddiu $11, %[tmp], 0x70 \n\t" "gslqc1 $f2, $f0, 0x0($11) \n\t" "gslqc1 $f6, $f4, 0x10($11) \n\t" 
"gslqc1 $f10, $f8, 0x20($11) \n\t" "gslqc1 $f14, $f12, 0x30($11) \n\t" "punpcklbh $f24, $f2, $f6 \n\t" "punpckhbh $f26, $f2, $f6 \n\t" "punpckhbh $f2, $f0, $f4 \n\t" "punpcklbh $f0, $f0, $f4 \n\t" "punpcklbh $f28, $f10, $f14 \n\t" "punpckhbh $f30, $f10, $f14 \n\t" "punpckhbh $f10, $f8, $f12 \n\t" "punpcklbh $f8, $f8, $f12 \n\t" "punpcklhw $f16, $f2, $f10 \n\t" "punpckhhw $f18, $f2, $f10 \n\t" "punpckhhw $f2, $f0, $f8 \n\t" "punpcklhw $f0, $f0, $f8 \n\t" "punpcklhw $f20, $f26, $f30 \n\t" "punpckhhw $f22, $f26, $f30 \n\t" "punpckhhw $f26, $f24, $f28 \n\t" "punpcklhw $f24, $f24, $f28 \n\t" "punpcklwd $f4, $f2, $f26 \n\t" "punpckhwd $f6, $f2, $f26 \n\t" "punpckhwd $f2, $f0, $f24 \n\t" "punpcklwd $f0, $f0, $f24 \n\t" "punpcklwd $f8, $f18, $f22 \n\t" "punpckhwd $f10, $f18, $f22 \n\t" "punpckhwd $f18, $f16, $f20 \n\t" "punpcklwd $f16, $f16, $f20 \n\t" "mov.d $f20, $f2 \n\t" "mov.d $f22, $f18 \n\t" "mov.d $f2, $f16 \n\t" "mov.d $f24, $f6 \n\t" "mov.d $f26, $f10 \n\t" "mov.d $f6, $f8 \n\t" "dli %[iAlpha], 0x20 \n\t" "daddu $8, %[pPixCb], %[iStride] \n\t" "gsswlc1 $f0, 0x3(%[pPixCb]) \n\t" "gsswlc1 $f20, 0x3($8) \n\t" "gsswrc1 $f0, 0x0(%[pPixCb]) \n\t" "gsswrc1 $f20, 0x0($8) \n\t" "daddu $9, $8, %[iStride] \n\t" "daddu $8, $9, %[iStride] \n\t" "gsswlc1 $f4, 0x3($9) \n\t" "gsswlc1 $f24, 0x3($8) \n\t" "gsswrc1 $f4, 0x0($9) \n\t" "gsswrc1 $f24, 0x0($8) \n\t" "daddu $9, $8, %[iStride] \n\t" "dmtc1 %[iAlpha], $f8 \n\t" "dsrl $f0, $f0, $f8 \n\t" "dsrl $f20, $f20, $f8 \n\t" "dsrl $f4, $f4, $f8 \n\t" "dsrl $f24, $f24, $f8 \n\t" "daddu $10, %[pPixCr], %[iStride] \n\t" "gsswlc1 $f0, 0x3(%[pPixCr]) \n\t" "gsswlc1 $f20, 0x3($10) \n\t" "gsswrc1 $f0, 0x0(%[pPixCr]) \n\t" "gsswrc1 $f20, 0x0($10) \n\t" "daddu $11, $10, %[iStride] \n\t" "daddu $10, $11, %[iStride] \n\t" "gsswlc1 $f4, 0x3($11) \n\t" "gsswlc1 $f24, 0x3($10) \n\t" "gsswrc1 $f4, 0x0($11) \n\t" "gsswrc1 $f24, 0x0($10) \n\t" "daddu $11, $10, %[iStride] \n\t" "daddu $8, $9, %[iStride] \n\t" "gsswlc1 $f2, 0x3($9) \n\t" "gsswlc1 
$f22, 0x3($8) \n\t" "gsswrc1 $f2, 0x0($9) \n\t" "gsswrc1 $f22, 0x0($8) \n\t" "daddu $9, $8, %[iStride] \n\t" "daddu $8, $9, %[iStride] \n\t" "gsswlc1 $f6, 0x3($9) \n\t" "gsswlc1 $f26, 0x3($8) \n\t" "gsswrc1 $f6, 0x0($9) \n\t" "gsswrc1 $f26, 0x0($8) \n\t" "dsrl $f2, $f2, $f8 \n\t" "dsrl $f22, $f22, $f8 \n\t" "dsrl $f6, $f6, $f8 \n\t" "dsrl $f26, $f26, $f8 \n\t" "daddu $10, $11, %[iStride] \n\t" "gsswlc1 $f2, 0x3($11) \n\t" "gsswlc1 $f22, 0x3($10) \n\t" "gsswrc1 $f2, 0x0($11) \n\t" "gsswrc1 $f22, 0x0($10) \n\t" "daddu $11, $10, %[iStride] \n\t" "daddu $10, $11, %[iStride] \n\t" "gsswlc1 $f6, 0x3($11) \n\t" "gsswlc1 $f26, 0x3($10) \n\t" "gsswrc1 $f6, 0x0($11) \n\t" "gsswrc1 $f26, 0x0($10) \n\t" : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr) : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp), [pTC]"r"((char *)pTC) : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12","$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30" ); RECOVER_REG; } void WelsNonZeroCount_mmi(int8_t *pNonZeroCount) { __asm__ volatile( ".set arch=loongson3a \n\t" "gsldlc1 $f0, 0x7(%[pNonZeroCount]) \n\t" "gsldlc1 $f2, 0xF(%[pNonZeroCount]) \n\t" "gsldlc1 $f4, 0x17(%[pNonZeroCount]) \n\t" "gsldrc1 $f4, 0x10(%[pNonZeroCount]) \n\t" "gsldrc1 $f0, 0x0(%[pNonZeroCount]) \n\t" "gsldrc1 $f2, 0x8(%[pNonZeroCount]) \n\t" "pcmpeqh $f8, $f8, $f8 \n\t" "dli $8, 0xF \n\t" "dmtc1 $8, $f6 \n\t" "psrlh $f8, $f8, $f6 \n\t" "packushb $f8, $f8, $f8 \n\t" "pminub $f0, $f0, $f8 \n\t" "pminub $f2, $f2, $f8 \n\t" "pminub $f4, $f4, $f8 \n\t" "gssdlc1 $f0, 0x7(%[pNonZeroCount]) \n\t" "gssdlc1 $f2, 0xF(%[pNonZeroCount]) \n\t" "gssdlc1 $f4, 0x17(%[pNonZeroCount]) \n\t" "gssdrc1 $f0, 0x0(%[pNonZeroCount]) \n\t" "gssdrc1 $f2, 0x8(%[pNonZeroCount]) \n\t" "gssdrc1 $f4, 0x10(%[pNonZeroCount]) \n\t" : : [pNonZeroCount] "r"((unsigned char *)pNonZeroCount) : "memory", "$8", "$f0", "$f2", 
"$f4", "$f6", "$f8" ); }