ref: 77406e6a66ae9c58b5ef3e7a37ac2714b8a0db9f
dir: /codec/common/mips/deblock_mmi.c/
/*!
* \copy
* Copyright (c) 2009-2018, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*
* \file deblock_mmi.c
*
* \brief Loongson optimize
*
* \date 20/07/2018 Created
*
*************************************************************************************
*/
#include <stdint.h>
#include "asmdefs_mmi.h"
/*!
 * \brief  In-loop deblocking of a luma edge perpendicular to iStride
 *         (the bS < 4 path), Loongson-3A MMI implementation.
 *
 * \param pPix     pointer to the first row at/after the edge; rows at
 *                 pPix-3*iStride .. pPix+2*iStride are read, and the rows at
 *                 pPix-2*iStride, pPix-iStride, pPix, pPix+iStride are
 *                 conditionally rewritten (p1, p0, q0, q1).
 * \param iStride  distance in bytes between vertically adjacent pixels.
 * \param iAlpha   edge-activity threshold for |p0-q0| (H.264 alpha).
 * \param iBeta    threshold for |p1-p0| and |q1-q0| (H.264 beta).
 * \param pTC      4 signed tc clipping values, one per group of 4 pixels
 *                 along the 16-pixel edge; a negative tc disables filtering
 *                 for that group.
 *
 * NOTE(review): the asm keeps intermediates in a 512-byte aligned scratch
 * area (tmp) addressed with constant-expression offsets such as
 * "432-112(%[tmp])"; the expressions are folded by the assembler, the odd
 * forms (e.g. "428-256+4") are numerically equivalent spills/reloads of the
 * same slots. BACKUP_REG / RECOVER_REG (asmdefs_mmi.h) preserve the
 * callee-saved FP registers that the clobber list touches.
 */
void DeblockLumaLt4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
int32_t iBeta, int8_t *pTC) {
unsigned char tmp[512] __attribute__((aligned(32)));
BACKUP_REG;
__asm__ volatile (
".set arch=loongson3a \n\t"
/* Compute row pointers: $14 = pPix-3*iStride, $9 = pPix-2*iStride,
 * $13 = pPix-iStride, iStride reg = pPix+iStride, $12 = pPix+2*iStride. */
"dsll $8, %[iStride], 0x1 \n\t"
"daddu $8, $8, %[iStride] \n\t"
"dsubu $14, %[pPix], $8 \n\t"
"dsll $8, %[iStride], 0x1 \n\t"
"dsubu $9, %[pPix], $8 \n\t"
"dmtc1 %[iAlpha], $f0 \n\t"
"dsubu $13, %[pPix], %[iStride] \n\t"
"daddu %[iStride], %[iStride], %[pPix] \n\t"
"daddu $12, $8, %[pPix] \n\t"
/* Splat iAlpha across 8 halfwords and spill to tmp[432-112]. */
"punpcklhw $f0, $f0, $f0 \n\t"
"lb $8, 0x0(%[pTC]) \n\t"
"punpcklwd $f0, $f0, $f0 \n\t"
"mov.d $f2, $f0 \n\t"
"gssqc1 $f2, $f0, 432-112(%[tmp]) \n\t"
/* Splat iBeta likewise (spilled below to tmp[432-336]). */
"dmtc1 %[iBeta], $f0 \n\t"
"lb %[iAlpha], 0x1(%[pTC]) \n\t"
"dli %[iBeta], 0xFFFF \n\t"
"punpcklhw $f0, $f0, $f0 \n\t"
"and $10, %[iAlpha], %[iBeta] \n\t"
"punpcklwd $f0, $f0, $f0 \n\t"
"mov.d $f2, $f0 \n\t"
"and %[iAlpha], %[iAlpha], %[iBeta] \n\t"
/* Expand the four per-group tc bytes (masked to 16 bits) into a vector
 * of 8 halfwords, 4 pixels per tc value; result spilled to tmp[432-400]. */
"dmtc1 $10, $f4 \n\t"
"mov.d $f8, $f4 \n\t"
"dmtc1 %[iAlpha], $f16 \n\t"
"and %[iAlpha], $8, %[iBeta] \n\t"
"dmtc1 %[iAlpha], $f20 \n\t"
"mov.d $f24, $f20 \n\t"
"mov.d $f28, $f20 \n\t"
"gssqc1 $f2, $f0, 432-336(%[tmp]) \n\t"
"dmtc1 %[iAlpha], $f0 \n\t"
"lb %[iAlpha], 0x3(%[pTC]) \n\t"
"lb %[pTC], 0x2(%[pTC]) \n\t"
"dmtc1 $10, $f12 \n\t"
"punpcklhw $f0, $f0, $f16 \n\t"
"and $8, %[iAlpha], %[iBeta] \n\t"
"punpcklhw $f24, $f24, $f8 \n\t"
"punpcklhw $f20, $f20, $f4 \n\t"
"punpcklhw $f0, $f0, $f24 \n\t"
"punpcklhw $f28, $f28, $f12 \n\t"
"punpcklhw $f28, $f28, $f20 \n\t"
"punpckhhw $f2, $f0, $f28 \n\t"
"punpcklhw $f0, $f0, $f28 \n\t"
"gssqc1 $f2, $f0, 432-400(%[tmp]) \n\t"
"dmtc1 $8, $f0 \n\t"
"and %[iAlpha], %[iAlpha], %[iBeta] \n\t"
"mov.d $f8, $f0 \n\t"
"dmtc1 %[iAlpha], $f16 \n\t"
"and %[iAlpha], %[pTC], %[iBeta] \n\t"
"dmtc1 $8, $f12 \n\t"
"dmtc1 %[iAlpha], $f20 \n\t"
"punpcklhw $f20, $f20, $f0 \n\t"
"xor $f0, $f0, $f0 \n\t"
"dmtc1 %[iAlpha], $f24 \n\t"
"and %[pTC], %[pTC], %[iBeta] \n\t"
"punpcklhw $f24, $f24, $f8 \n\t"
"dmtc1 %[iAlpha], $f28 \n\t"
"dmtc1 %[pTC], $f4 \n\t"
/* Load p1 row ($9 = pPix-2*iStride) and unpack bytes to halfwords
 * ($f8/$f10 hold p1 low/high halves from here on). */
"gslqc1 $f10, $f8, 0x0($9) \n\t"
"punpckhbh $f10, $f8, $f0 \n\t"
"punpcklbh $f8, $f8, $f0 \n\t"
"dli %[iAlpha], 0x4 \n\t"
"seh %[pTC], %[iAlpha] \n\t"
"punpcklhw $f28, $f28, $f12 \n\t"
"punpcklhw $f28, $f28, $f20 \n\t"
/* Load q1, p0, q2 rows; q1 unpacked → tmp[432-240],
 * q2 unpacked → tmp[432-352]. */
"gslqc1 $f22, $f20, 0x0(%[iStride]) \n\t"
"gslqc1 $f14, $f12, 0x0($13) \n\t"
"gsldxc1 $f2, 0x0($12, $0) \n\t"
"punpckhbh $f22, $f20, $f0 \n\t"
"punpcklbh $f20, $f20, $f0 \n\t"
"gssqc1 $f22, $f20, 432-240(%[tmp]) \n\t"
"punpckhbh $f22, $f2, $f0 \n\t"
"punpcklbh $f20, $f2, $f0 \n\t"
"gssqc1 $f22, $f20, 432-352(%[tmp]) \n\t"
"punpcklhw $f4, $f4, $f16 \n\t"
/* Load p2 ($14) and q0 (pPix) rows. */
"gslqc1 $f18, $f16, 0x0($14) \n\t"
"punpcklhw $f4, $f4, $f24 \n\t"
"gslqc1 $f26, $f24, 0x0(%[pPix]) \n\t"
"punpckhhw $f6, $f4, $f28 \n\t"
"punpcklhw $f4, $f4, $f28 \n\t"
"punpckhbh $f26, $f24, $f0 \n\t"
"punpcklbh $f24, $f24, $f0 \n\t"
"punpckhbh $f14, $f12, $f0 \n\t"
"punpcklbh $f12, $f12, $f0 \n\t"
"punpckhbh $f18, $f16, $f0 \n\t"
"punpcklbh $f16, $f16, $f0 \n\t"
/* |p0 - p2| < beta  → mask in tmp[432-288]. */
"psubh $f28, $f12, $f16 \n\t"
"psubh $f30, $f14, $f18 \n\t"
"gssqc1 $f18, $f16, 432-272(%[tmp]) \n\t"
WELS_AbsH($f28, $f30, $f28, $f30, $f16, $f18)
"gslqc1 $f18, $f16, 432-336(%[tmp]) \n\t"
"gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t"
"pcmpgth $f20, $f16, $f28 \n\t"
"pcmpgth $f22, $f18, $f30 \n\t"
"gssqc1 $f22, $f20, 432-288(%[tmp]) \n\t"
/* |q0 - q2| < beta  → mask in tmp[432-256]. */
"psubh $f28, $f24, $f0 \n\t"
"psubh $f30, $f26, $f2 \n\t"
WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
"pcmpgth $f20, $f16, $f28 \n\t"
"pcmpgth $f22, $f18, $f30 \n\t"
"gssqc1 $f22, $f20, 432-256(%[tmp]) \n\t"
/* avg(p0, q0) → tmp[432-304]. */
"pavgh $f20, $f12, $f24 \n\t"
"pavgh $f22, $f14, $f26 \n\t"
"gssqc1 $f22, $f20, 432-304(%[tmp]) \n\t"
/* tc adjusted by the two inner-sample masks (tc0 + masks, two's
 * complement arithmetic) → tmp[432-224]. */
"gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 432-288(%[tmp]) \n\t"
"gslqc1 $f2, $f0, 432-256(%[tmp]) \n\t"
"psubh $f20, $f20, $f28 \n\t"
"psubh $f22, $f22, $f30 \n\t"
"psubh $f20, $f20, $f0 \n\t"
"psubh $f22, $f22, $f2 \n\t"
"gssqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
/* q0-p0 spilled to tmp[432-384]; p0/q0 also saved for later writes. */
"gslqc1 $f2, $f0, 432-240(%[tmp]) \n\t"
"psubh $f20, $f24, $f12 \n\t"
"psubh $f22, $f26, $f14 \n\t"
"gssqc1 $f26, $f24, 432-32(%[tmp]) \n\t"
"psubh $f24, $f24, $f0 \n\t"
"psubh $f26, $f26, $f2 \n\t"
"gssqc1 $f22, $f20, 432-384(%[tmp]) \n\t"
/* Edge-activity mask: |p0-q0|<alpha & |q1-q0|<beta & |p1-p0|<beta,
 * further AND-ed with (tc >= 0) → tmp[432-320]. */
WELS_AbsH($f28, $f30, $f20, $f22, $f28, $f30)
"gslqc1 $f22, $f20, 432-112(%[tmp]) \n\t"
"pcmpgth $f20, $f20, $f28 \n\t"
"pcmpgth $f22, $f22, $f30 \n\t"
WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
"pcmpgth $f28, $f16, $f24 \n\t"
"pcmpgth $f30, $f18, $f26 \n\t"
"xor $f0, $f0, $f0 \n\t"
"and $f20, $f20, $f28 \n\t"
"and $f22, $f22, $f30 \n\t"
"psubh $f24, $f12, $f8 \n\t"
"psubh $f26, $f14, $f10 \n\t"
WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
"pcmpgth $f28, $f16, $f24 \n\t"
"pcmpgth $f30, $f18, $f26 \n\t"
"gslqc1 $f26, $f24, 432-400(%[tmp]) \n\t"
"and $f20, $f20, $f28 \n\t"
"and $f22, $f22, $f30 \n\t"
"pcmpgth $f28, $f24, $f0 \n\t"
"pcmpgth $f30, $f26, $f0 \n\t"
"pcmpeqh $f24, $f24, $f0 \n\t"
"pcmpeqh $f26, $f26, $f0 \n\t"
"or $f28, $f28, $f24 \n\t"
"or $f30, $f30, $f26 \n\t"
"and $f20, $f20, $f28 \n\t"
"and $f22, $f22, $f30 \n\t"
"gssqc1 $f22, $f20, 432-320(%[tmp]) \n\t"
/* Splat the rounding constant 4 (held in %[pTC] after seh above). */
"dmtc1 %[pTC], $f20 \n\t"
"punpckhhw $f26, $f20, $f20 \n\t"
"punpcklhw $f24, $f20, $f20 \n\t"
"punpcklwd $f20, $f24, $f24 \n\t"
"mov.d $f22, $f20 \n\t"
"gssqc1 $f22, $f20, 432-336(%[tmp]) \n\t"
/* delta = clip(((q0-p0)<<2 + (p1-q1) + 4) >> 3, -tc, tc), gated by the
 * activity mask → tmp[432-64]. */
"gslqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
"psubh $f24, $f0, $f20 \n\t"
"dli $11, 0x2 \n\t"
"psubh $f26, $f0, $f22 \n\t"
"dmtc1 $11, $f28 \n\t"
"gslqc1 $f22, $f20, 432-384(%[tmp]) \n\t"
"gslqc1 $f2, $f0, 432-240(%[tmp]) \n\t"
"psllh $f20, $f20, $f28 \n\t"
"psllh $f22, $f22, $f28 \n\t"
"psubh $f28, $f8, $f0 \n\t"
"psubh $f30, $f10, $f2 \n\t"
"paddh $f28, $f28, $f20 \n\t"
"paddh $f30, $f30, $f22 \n\t"
"gslqc1 $f22, $f20, 432-336(%[tmp]) \n\t"
"paddh $f28, $f28, $f20 \n\t"
"paddh $f30, $f30, $f22 \n\t"
"dli $11, 0x3 \n\t"
"dmtc1 $11, $f20 \n\t"
"psrah $f28, $f28, $f20 \n\t"
"psrah $f30, $f30, $f20 \n\t"
"gslqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
"pmaxsh $f24, $f24, $f28 \n\t"
"pmaxsh $f26, $f26, $f30 \n\t"
"gslqc1 $f2, $f0, 432-320(%[tmp]) \n\t"
"pminsh $f20, $f20, $f24 \n\t"
"pminsh $f22, $f22, $f26 \n\t"
"and $f20, $f20, $f0 \n\t"
"and $f22, $f22, $f2 \n\t"
"gslqc1 $f26, $f24, 432-400(%[tmp]) \n\t"
"gssqc1 $f22, $f20, 432-64(%[tmp]) \n\t"
"xor $f0, $f0, $f0 \n\t"
"gssqc1 $f26, $f24, 432-384(%[tmp]) \n\t"
/* -tc0 → tmp[432-368] (clip bound for the p1/q1 corrections). */
"psubh $f20, $f0, $f24 \n\t"
"psubh $f22, $f0, $f26 \n\t"
"gssqc1 $f22, $f20, 432-368(%[tmp]) \n\t"
"mov.d $f24, $f20 \n\t"
"mov.d $f26, $f22 \n\t"
/* p1 correction: clip((p2 + avg(p0,q0) - 2*p1) >> 1, -tc0, tc0),
 * gated by mask & |p0-p2|<beta → tmp[432-96]. */
"gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t"
"paddh $f20, $f20, $f28 \n\t"
"paddh $f22, $f22, $f30 \n\t"
"paddh $f28, $f8, $f8 \n\t"
"paddh $f30, $f10, $f10 \n\t"
"psubh $f20, $f20, $f28 \n\t"
"psubh $f22, $f22, $f30 \n\t"
"dli $11, 0x1 \n\t"
"dmtc1 $11, $f28 \n\t"
"psrah $f20, $f20, $f28 \n\t"
"psrah $f22, $f22, $f28 \n\t"
"pmaxsh $f24, $f24, $f20 \n\t"
"pmaxsh $f26, $f26, $f22 \n\t"
"gslqc1 $f22, $f20, 432-384(%[tmp]) \n\t"
"pminsh $f20, $f20, $f24 \n\t"
"pminsh $f22, $f22, $f26 \n\t"
"gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 432-288(%[tmp]) \n\t"
"and $f20, $f20, $f24 \n\t"
"and $f22, $f22, $f26 \n\t"
"and $f20, $f20, $f28 \n\t"
"and $f22, $f22, $f30 \n\t"
"gslqc1 $f26, $f24, 432-240(%[tmp]) \n\t"
"gssqc1 $f22, $f20, 432-96(%[tmp]) \n\t"
/* q1 correction: clip((q2 + avg(p0,q0) - 2*q1) >> 1, -tc0, tc0),
 * gated by mask & |q0-q2|<beta → tmp[432-48]. */
"gslqc1 $f22, $f20, 432-352(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t"
"paddh $f20, $f20, $f28 \n\t"
"paddh $f22, $f22, $f30 \n\t"
"paddh $f28, $f24, $f24 \n\t"
"paddh $f30, $f26, $f26 \n\t"
"gslqc1 $f26, $f24, 432-368(%[tmp]) \n\t"
"dli $11, 0x1 \n\t"
"psubh $f20, $f20, $f28 \n\t"
"dmtc1 $11, $f28 \n\t"
"psubh $f22, $f22, $f30 \n\t"
"psrah $f20, $f20, $f28 \n\t"
"psrah $f22, $f22, $f28 \n\t"
"gslqc1 $f30, $f28, 0x0(%[iStride]) \n\t"
"pmaxsh $f24, $f24, $f20 \n\t"
"pmaxsh $f26, $f26, $f22 \n\t"
"gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
"pminsh $f20, $f20, $f24 \n\t"
"pminsh $f22, $f22, $f26 \n\t"
"gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t"
"and $f20, $f20, $f24 \n\t"
"and $f22, $f22, $f26 \n\t"
"gslqc1 $f26, $f24, 432-256(%[tmp]) \n\t"
"and $f20, $f20, $f24 \n\t"
"and $f22, $f22, $f26 \n\t"
/* Second half of the 16-pixel edge: reload the raw rows and unpack the
 * high 8 pixels, then repeat the same mask/delta computation. */
"gslqc1 $f26, $f24, 0x0($9) \n\t"
"punpcklbh $f28, $f30, $f0 \n\t"
"punpckhbh $f30, $f30, $f0 \n\t"
"gssqc1 $f30, $f28, 432-352(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 0x0($12) \n\t"
"punpcklbh $f24, $f26, $f0 \n\t"
"punpckhbh $f26, $f26, $f0 \n\t"
"gssqc1 $f22, $f20, 432-48(%[tmp]) \n\t"
"gslqc1 $f22, $f20, 0x0($14) \n\t"
"gssqc1 $f26, $f24, 432-368(%[tmp]) \n\t"
"gslqc1 $f26, $f24, 0x0($13) \n\t"
"punpcklbh $f28, $f30, $f0 \n\t"
"punpckhbh $f30, $f30, $f0 \n\t"
"punpcklbh $f20, $f22, $f0 \n\t"
"punpckhbh $f22, $f22, $f0 \n\t"
"gssqc1 $f30, $f28, 432-384(%[tmp]) \n\t"
"punpcklbh $f24, $f26, $f0 \n\t"
"punpckhbh $f26, $f26, $f0 \n\t"
"gssqc1 $f26, $f24, 432-400(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 432-400(%[tmp]) \n\t"
"gslqc1 $f26, $f24, 0x0(%[pPix]) \n\t"
"psubh $f28, $f28, $f20 \n\t"
"psubh $f30, $f30, $f22 \n\t"
"gssqc1 $f22, $f20, 432-16(%[tmp]) \n\t"
WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
"punpcklbh $f24, $f26, $f0 \n\t"
"punpckhbh $f26, $f26, $f0 \n\t"
"pcmpgth $f20, $f16, $f28 \n\t"
"pcmpgth $f22, $f18, $f30 \n\t"
"gslqc1 $f30, $f28, 432-384(%[tmp]) \n\t"
"gssqc1 $f22, $f20, 432-288(%[tmp]) \n\t"
"psubh $f28, $f24, $f28 \n\t"
"psubh $f30, $f26, $f30 \n\t"
WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
"pcmpgth $f20, $f16, $f28 \n\t"
"pcmpgth $f22, $f18, $f30 \n\t"
"gssqc1 $f22, $f20, 432-256(%[tmp]) \n\t"
"gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
"gssqc1 $f26, $f24, 432-80(%[tmp]) \n\t"
"pavgh $f20, $f20, $f24 \n\t"
"pavgh $f22, $f22, $f26 \n\t"
"gssqc1 $f22, $f20, 432-304(%[tmp]) \n\t"
"gslqc1 $f22, $f20, 432-288(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 432-256(%[tmp]) \n\t"
"psubh $f20, $f4, $f20 \n\t"
"psubh $f22, $f6, $f22 \n\t"
"psubh $f20, $f20, $f28 \n\t"
"psubh $f22, $f22, $f30 \n\t"
"gssqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
"gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 432-352(%[tmp]) \n\t"
"psubh $f20, $f24, $f20 \n\t"
"psubh $f22, $f26, $f22 \n\t"
"psubh $f24, $f24, $f28 \n\t"
"psubh $f26, $f26, $f30 \n\t"
"gssqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
"mov.d $f28, $f20 \n\t"
"mov.d $f30, $f22 \n\t"
WELS_AbsH($f28, $f30, $f20, $f22, $f0, $f2)
"gslqc1 $f22, $f20, 432-112(%[tmp]) \n\t"
"pcmpgth $f20, $f20, $f28 \n\t"
"pcmpgth $f22, $f22, $f30 \n\t"
WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
"pcmpgth $f28, $f16, $f24 \n\t"
"pcmpgth $f30, $f18, $f26 \n\t"
"gslqc1 $f26, $f24, 432-368(%[tmp]) \n\t"
"and $f20, $f20, $f28 \n\t"
"and $f22, $f22, $f30 \n\t"
"gslqc1 $f30, $f28, 432-400(%[tmp]) \n\t"
"psubh $f28, $f28, $f24 \n\t"
"psubh $f30, $f30, $f26 \n\t"
"gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t"
"psubh $f24, $f24, $f0 \n\t"
"psubh $f26, $f26, $f2 \n\t"
WELS_AbsH($f28, $f30, $f28, $f30, $f0, $f2)
"pcmpgth $f16, $f16, $f28 \n\t"
"pcmpgth $f18, $f18, $f30 \n\t"
"gslqc1 $f30, $f28, 432-96(%[tmp]) \n\t"
"and $f20, $f20, $f16 \n\t"
"and $f22, $f22, $f18 \n\t"
"xor $f0, $f0, $f0 \n\t"
/* Apply the p1 correction computed earlier to the first-half p1. */
"paddh $f8, $f8, $f28 \n\t"
"paddh $f10, $f10, $f30 \n\t"
"pcmpgth $f16, $f4, $f0 \n\t"
"pcmpgth $f18, $f6, $f0 \n\t"
"pcmpeqh $f28, $f4, $f0 \n\t"
"pcmpeqh $f30, $f6, $f0 \n\t"
"or $f16, $f16, $f28 \n\t"
"or $f18, $f18, $f30 \n\t"
"and $f20, $f20, $f16 \n\t"
"and $f22, $f22, $f18 \n\t"
"gslqc1 $f18, $f16, 432-224(%[tmp]) \n\t"
"gssqc1 $f22, $f20, 432-320(%[tmp]) \n\t"
"gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
"dli $11, 0x2 \n\t"
"psubh $f28, $f0, $f16 \n\t"
"psubh $f30, $f0, $f18 \n\t"
"psubh $f2, $f0, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"dmfc1 %[iAlpha], $f28 \n\t"
"dmtc1 $11, $f28 \n\t"
"psllh $f20, $f20, $f28 \n\t"
"psllh $f22, $f22, $f28 \n\t"
"dmtc1 %[iAlpha], $f28 \n\t"
"paddh $f24, $f24, $f20 \n\t"
"paddh $f26, $f26, $f22 \n\t"
"gslqc1 $f22, $f20, 432-336(%[tmp]) \n\t"
"paddh $f24, $f24, $f20 \n\t"
"paddh $f26, $f26, $f22 \n\t"
"gslqc1 $f22, $f20, 432-368(%[tmp]) \n\t"
"dli $11, 0x3 \n\t"
"gssqc1 $f2, $f0, 432-336(%[tmp]) \n\t"
"dmfc1 %[iAlpha], $f0 \n\t"
"dmtc1 $11, $f0 \n\t"
"psrah $f24, $f24, $f0 \n\t"
"psrah $f26, $f26, $f0 \n\t"
"dmtc1 %[iAlpha], $f0 \n\t"
"pmaxsh $f28, $f28, $f24 \n\t"
"pmaxsh $f30, $f30, $f26 \n\t"
"pminsh $f16, $f16, $f28 \n\t"
"pminsh $f18, $f18, $f30 \n\t"
"gslqc1 $f30, $f28, 432-320(%[tmp]) \n\t"
"and $f16, $f16, $f28 \n\t"
"and $f18, $f18, $f30 \n\t"
"mov.d $f24, $f0 \n\t"
"mov.d $f26, $f2 \n\t"
"gslqc1 $f2, $f0, 432-16(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t"
"paddh $f0, $f0, $f28 \n\t"
"paddh $f2, $f2, $f30 \n\t"
"gssqc1 $f18, $f16, 432-272(%[tmp]) \n\t"
"gslqc1 $f18, $f16, 432-368(%[tmp]) \n\t"
"dli $11, 0x1 \n\t"
"paddh $f16, $f16, $f16 \n\t"
"paddh $f18, $f18, $f18 \n\t"
"psubh $f0, $f0, $f16 \n\t"
"psubh $f2, $f2, $f18 \n\t"
"dmtc1 $11, $f28 \n\t"
"gslqc1 $f18, $f16, 432-64(%[tmp]) \n\t"
"psrah $f0, $f0, $f28 \n\t"
"psrah $f2, $f2, $f28 \n\t"
"pmaxsh $f24, $f24, $f0 \n\t"
"pmaxsh $f26, $f26, $f2 \n\t"
"gslqc1 $f2, $f0, 432-400(%[tmp]) \n\t"
"pminsh $f28, $f4, $f24 \n\t"
"pminsh $f30, $f6, $f26 \n\t"
"gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t"
"and $f28, $f28, $f24 \n\t"
"and $f30, $f30, $f26 \n\t"
"dmfc1 %[iAlpha], $f24 \n\t"
"dmfc1 %[iBeta], $f26 \n\t"
"gslqc1 $f26, $f24, 432-288(%[tmp]) \n\t"
"and $f28, $f28, $f24 \n\t"
"and $f30, $f30, $f26 \n\t"
"paddh $f20, $f20, $f28 \n\t"
"paddh $f22, $f22, $f30 \n\t"
/* Pack filtered halfword lanes back to bytes and write out the four
 * modified rows: p1 ($9), p0 ($13), q0 (pPix), q1 (iStride reg). */
"packushb $f8, $f8, $f10 \n\t"
"packushb $f10, $f20, $f22 \n\t"
"gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
"paddh $f0, $f0, $f20 \n\t"
"paddh $f2, $f2, $f22 \n\t"
"paddh $f12, $f12, $f16 \n\t"
"paddh $f14, $f14, $f18 \n\t"
"packushb $f12, $f12, $f14 \n\t"
"packushb $f14, $f0, $f2 \n\t"
"gslqc1 $f2, $f0, 432-32(%[tmp]) \n\t"
"psubh $f0, $f0, $f16 \n\t"
"psubh $f2, $f2, $f18 \n\t"
"gslqc1 $f18, $f16, 432-80(%[tmp]) \n\t"
"psubh $f16, $f16, $f20 \n\t"
"gslqc1 $f26, $f24, 432-48(%[tmp]) \n\t"
"psubh $f18, $f18, $f22 \n\t"
"gslqc1 $f22, $f20, 432-240(%[tmp]) \n\t"
"paddh $f20, $f20, $f24 \n\t"
"paddh $f22, $f22, $f26 \n\t"
"gslqc1 $f26, $f24, 432-304(%[tmp]) \n\t"
"packushb $f0, $f0, $f2 \n\t"
"packushb $f2, $f16, $f18 \n\t"
"gslqc1 $f18, $f16, 432-384(%[tmp]) \n\t"
"paddh $f16, $f16, $f24 \n\t"
"paddh $f18, $f18, $f26 \n\t"
"gssqc1 $f2, $f0, 480-208(%[tmp]) \n\t"
"gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t"
"mov.d $f28, $f0 \n\t"
"mov.d $f30, $f2 \n\t"
"paddh $f0, $f0, $f0 \n\t"
"paddh $f2, $f2, $f2 \n\t"
"dmtc1 %[iAlpha], $f24 \n\t"
"dmtc1 %[iBeta], $f26 \n\t"
"psubh $f16, $f16, $f0 \n\t"
"psubh $f18, $f18, $f2 \n\t"
"dli $11, 0x1 \n\t"
"gslqc1 $f2, $f0, 432-336(%[tmp]) \n\t"
"gssqc1 $f10, $f8, 0x0($9) \n\t"
"dmtc1 $11, $f8 \n\t"
"psrah $f16, $f16, $f8 \n\t"
"psrah $f18, $f18, $f8 \n\t"
"pmaxsh $f0, $f0, $f16 \n\t"
"pmaxsh $f2, $f2, $f18 \n\t"
"pminsh $f4, $f4, $f0 \n\t"
"pminsh $f6, $f6, $f2 \n\t"
"gslqc1 $f2, $f0, 480-208(%[tmp]) \n\t"
/* 428-256+4 folds to the same offset as the 432-256 slot stored above. */
"gslqc1 $f10, $f8, 428-256+4(%[tmp]) \n\t"
"and $f4, $f4, $f24 \n\t"
"and $f6, $f6, $f26 \n\t"
"and $f4, $f4, $f8 \n\t"
"and $f6, $f6, $f10 \n\t"
"gssqc1 $f14, $f12, 0x0($13) \n\t"
"paddh $f28, $f28, $f4 \n\t"
"paddh $f30, $f30, $f6 \n\t"
"packushb $f20, $f20, $f22 \n\t"
"packushb $f22, $f28, $f30 \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPix]) \n\t"
"gssqc1 $f22, $f20, 0x0(%[iStride]) \n\t"
: [pPix]"+&r"((unsigned char *)pPix)
: [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta),
[pTC]"r"((unsigned char *)pTC), [tmp]"r"((unsigned char *)tmp)
: "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
"$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
"$f22", "$f24", "$f26", "$f28", "$f30"
);
RECOVER_REG;
}
/*!
 * \brief  Gather a 16-row x 8-column strided luma block and store it
 *         transposed (8 rows x 16 bytes) into a contiguous buffer, so a
 *         horizontal edge can be filtered with the vertical-edge kernel.
 *
 * \param pPixY    top-left of the 16x8 source block (clobbered internally;
 *                 declared "+&r").
 * \param iStride  byte distance between source rows.
 * \param pDst     128-byte contiguous destination (written at offsets
 *                 0x00..0x70, 16 bytes per store).
 *
 * Rows are loaded in an interleaved order (row i and row i+8 pairwise) via
 * unaligned gsldlc1/gsldrc1 pairs; MMI_TransTwo8x8B (asmdefs_mmi.h)
 * performs the two 8x8 byte transposes, which dictates the permuted
 * register order of the final stores.
 */
void DeblockLumaTransposeH2V_mmi(uint8_t *pPixY, int32_t iStride,
uint8_t *pDst) {
BACKUP_REG;
__asm__ volatile(
".set arch=loongson3a \n\t"
/* $8 = pPixY + 8*iStride; each iteration below advances both cursors
 * so rows 0..7 and 8..15 are fetched in lockstep. */
"dsll $8, %[iStride], 0x3 \n\t"
"daddu $8, $8, %[pPixY] \n\t"
"daddu $9, %[pPixY], %[iStride] \n\t"
"daddu $10, $8, %[iStride] \n\t"
/* Rows 0,8,1,9 → $f0,$f2,$f4,$f6 (left/right halves of each 8-byte
 * load joined by the ldl/ldr pair). */
"gsldlc1 $f0, 0x7(%[pPixY]) \n\t"
"gsldlc1 $f2, 0x7($8) \n\t"
"gsldlc1 $f4, 0x7($9) \n\t"
"gsldlc1 $f6, 0x7($10) \n\t"
"gsldrc1 $f0, 0x0(%[pPixY]) \n\t"
"gsldrc1 $f2, 0x0($8) \n\t"
"gsldrc1 $f4, 0x0($9) \n\t"
"gsldrc1 $f6, 0x0($10) \n\t"
"daddu %[pPixY], $9, %[iStride] \n\t"
"daddu $8, $10, %[iStride] \n\t"
"daddu $9, %[pPixY], %[iStride] \n\t"
"daddu $10, $8, %[iStride] \n\t"
/* Rows 2,10,3,11 → $f8..$f14. */
"gsldlc1 $f8, 0x7(%[pPixY]) \n\t"
"gsldlc1 $f10, 0x7($8) \n\t"
"gsldlc1 $f12, 0x7($9) \n\t"
"gsldlc1 $f14, 0x7($10) \n\t"
"gsldrc1 $f8, 0x0(%[pPixY]) \n\t"
"gsldrc1 $f10, 0x0($8) \n\t"
"gsldrc1 $f12, 0x0($9) \n\t"
"gsldrc1 $f14, 0x0($10) \n\t"
"daddu %[pPixY], $9, %[iStride] \n\t"
"daddu $8, $10, %[iStride] \n\t"
"daddu $9, %[pPixY], %[iStride] \n\t"
"daddu $10, $8, %[iStride] \n\t"
/* Rows 4,12,5,13 → $f16..$f22. */
"gsldlc1 $f16, 0x7(%[pPixY]) \n\t"
"gsldlc1 $f18, 0x7($8) \n\t"
"gsldlc1 $f20, 0x7($9) \n\t"
"gsldlc1 $f22, 0x7($10) \n\t"
"gsldrc1 $f16, 0x0(%[pPixY]) \n\t"
"gsldrc1 $f18, 0x0($8) \n\t"
"gsldrc1 $f20, 0x0($9) \n\t"
"gsldrc1 $f22, 0x0($10) \n\t"
"daddu %[pPixY], $9, %[iStride] \n\t"
"daddu $8, $10, %[iStride] \n\t"
"daddu $9, %[pPixY], %[iStride] \n\t"
"daddu $10, $8, %[iStride] \n\t"
/* Rows 6,14,7,15 → $f24..$f30. */
"gsldlc1 $f24, 0x7(%[pPixY]) \n\t"
"gsldlc1 $f26, 0x7($8) \n\t"
"gsldlc1 $f28, 0x7($9) \n\t"
"gsldlc1 $f30, 0x7($10) \n\t"
"gsldrc1 $f24, 0x0(%[pPixY]) \n\t"
"gsldrc1 $f26, 0x0($8) \n\t"
"gsldrc1 $f28, 0x0($9) \n\t"
"gsldrc1 $f30, 0x0($10) \n\t"
MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
$f14, $f16, $f18, $f20, $f22, $f24,
$f26, $f28, $f30, $9, $10)
/* Store the 8 transposed 16-byte rows; register order follows the
 * output layout produced by MMI_TransTwo8x8B. */
"gssqc1 $f18, $f16, 0x0(%[pDst]) \n\t"
"gssqc1 $f10, $f8, 0x10(%[pDst]) \n\t"
"gssqc1 $f14, $f12, 0x20(%[pDst]) \n\t"
"gssqc1 $f30, $f28, 0x30(%[pDst]) \n\t"
"gssqc1 $f22, $f20, 0x40(%[pDst]) \n\t"
"gssqc1 $f6, $f4, 0x50(%[pDst]) \n\t"
"gssqc1 $f26, $f24, 0x60(%[pDst]) \n\t"
"gssqc1 $f2, $f0, 0x70(%[pDst]) \n\t"
: [pPixY] "+&r"((unsigned char *)pPixY)
: [iStride] "r"((int)iStride), [pDst] "r"((unsigned char *)pDst)
: "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
"$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
"$f30"
);
RECOVER_REG;
}
/*!
 * \brief  Inverse of DeblockLumaTransposeH2V_mmi: read a contiguous
 *         8-row x 16-byte buffer, transpose it back, and scatter the
 *         resulting 16 rows of 8 bytes into the strided picture.
 *
 * \param pPixY    top-left destination in the picture (clobbered
 *                 internally; declared "+&r").
 * \param iStride  byte distance between destination rows.
 * \param pSrc     128-byte contiguous source (read at offsets 0x00..0x70).
 *
 * The permuted FP-register order of the stores mirrors the output layout
 * of MMI_TransTwo8x8B (asmdefs_mmi.h); rows are written two at a time via
 * unaligned gssdlc1/gssdrc1 pairs while the row cursors leapfrog by
 * iStride.
 */
void DeblockLumaTransposeV2H_mmi(uint8_t *pPixY, int32_t iStride,
uint8_t *pSrc) {
BACKUP_REG;
__asm__ volatile(
".set arch=loongson3a \n\t"
/* Load the 8 contiguous 16-byte rows into $f0..$f30. */
"gslqc1 $f2, $f0, 0x0(%[pSrc]) \n\t"
"gslqc1 $f6, $f4, 0x10(%[pSrc]) \n\t"
"gslqc1 $f10, $f8, 0x20(%[pSrc]) \n\t"
"gslqc1 $f14, $f12, 0x30(%[pSrc]) \n\t"
"gslqc1 $f18, $f16, 0x40(%[pSrc]) \n\t"
"gslqc1 $f22, $f20, 0x50(%[pSrc]) \n\t"
"gslqc1 $f26, $f24, 0x60(%[pSrc]) \n\t"
"gslqc1 $f30, $f28, 0x70(%[pSrc]) \n\t"
MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
$f14, $f16, $f18, $f20, $f22, $f24,
$f26, $f28, $f30, $9, $10)
/* Scatter 16 rows of 8 bytes, two rows per group, advancing by
 * iStride; sdl/sdr pairs handle unaligned destinations. */
"daddu $8, %[pPixY], %[iStride] \n\t"
"gssdlc1 $f16, 0x7(%[pPixY]) \n\t"
"gssdlc1 $f8, 0x7($8) \n\t"
"gssdrc1 $f16, 0x0(%[pPixY]) \n\t"
"gssdrc1 $f8, 0x0($8) \n\t"
"daddu %[pPixY], $8, %[iStride] \n\t"
"daddu $8, %[pPixY], %[iStride] \n\t"
"gssdlc1 $f12, 0x7(%[pPixY]) \n\t"
"gssdlc1 $f28, 0x7($8) \n\t"
"gssdrc1 $f12, 0x0(%[pPixY]) \n\t"
"gssdrc1 $f28, 0x0($8) \n\t"
"daddu %[pPixY], $8, %[iStride] \n\t"
"daddu $8, %[pPixY], %[iStride] \n\t"
"gssdlc1 $f20, 0x7(%[pPixY]) \n\t"
"gssdlc1 $f4, 0x7($8) \n\t"
"gssdrc1 $f20, 0x0(%[pPixY]) \n\t"
"gssdrc1 $f4, 0x0($8) \n\t"
"daddu %[pPixY], $8, %[iStride] \n\t"
"daddu $8, %[pPixY], %[iStride] \n\t"
"gssdlc1 $f24, 0x7(%[pPixY]) \n\t"
"gssdlc1 $f0, 0x7($8) \n\t"
"gssdrc1 $f24, 0x0(%[pPixY]) \n\t"
"gssdrc1 $f0, 0x0($8) \n\t"
"daddu %[pPixY], $8, %[iStride] \n\t"
"daddu $8, %[pPixY], %[iStride] \n\t"
"gssdlc1 $f18, 0x7(%[pPixY]) \n\t"
"gssdlc1 $f10, 0x7($8) \n\t"
"gssdrc1 $f18, 0x0(%[pPixY]) \n\t"
"gssdrc1 $f10, 0x0($8) \n\t"
"daddu %[pPixY], $8, %[iStride] \n\t"
"daddu $8, %[pPixY], %[iStride] \n\t"
"gssdlc1 $f14, 0x7(%[pPixY]) \n\t"
"gssdlc1 $f30, 0x7($8) \n\t"
"gssdrc1 $f14, 0x0(%[pPixY]) \n\t"
"gssdrc1 $f30, 0x0($8) \n\t"
"daddu %[pPixY], $8, %[iStride] \n\t"
"daddu $8, %[pPixY], %[iStride] \n\t"
"gssdlc1 $f22, 0x7(%[pPixY]) \n\t"
"gssdlc1 $f6, 0x7($8) \n\t"
"gssdrc1 $f22, 0x0(%[pPixY]) \n\t"
"gssdrc1 $f6, 0x0($8) \n\t"
"daddu %[pPixY], $8, %[iStride] \n\t"
"daddu $8, %[pPixY], %[iStride] \n\t"
"gssdlc1 $f26, 0x7(%[pPixY]) \n\t"
"gssdlc1 $f2, 0x7($8) \n\t"
"gssdrc1 $f26, 0x0(%[pPixY]) \n\t"
"gssdrc1 $f2, 0x0($8) \n\t"
: [pPixY] "+&r"((unsigned char *)pPixY)
: [iStride] "r"((int)iStride), [pSrc] "r"((unsigned char *)pSrc)
: "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
"$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
"$f30"
);
RECOVER_REG;
}
void DeblockLumaEq4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
int32_t iBeta) {
unsigned char tmp[720] __attribute__((aligned(32)));
BACKUP_REG;
__asm__ volatile (
".set arch=loongson3a \n\t"
"dsll $11, %[iStride], 0x2 \n\t"
"xor $f8, $f8, $f8 \n\t"
"daddu $14, %[iStride], %[pPix] \n\t"
"dsubu $8, %[pPix], $11 \n\t"
"gslqc1 $f14, $f12, 0x0($8) \n\t"
"gslqc1 $f22, $f20, 0x0(%[pPix]) \n\t"
"daddu $9, %[iStride], %[iStride] \n\t"
"daddu $10, $9, %[iStride] \n\t"
"move $12, $9 \n\t"
"dsubu $8, %[pPix], $9 \n\t"
"gslqc1 $f6, $f4, 0x0($8) \n\t"
"dsubu $9, %[pPix], %[iStride] \n\t"
"gslqc1 $f18, $f16, 0x0($9) \n\t"
"daddu $13, %[iStride], %[pPix] \n\t"
"move %[iStride], $12 \n\t"
"daddu $15, $12, %[pPix] \n\t"
"daddu $12, %[pPix], $10 \n\t"
"dsubu $11, %[pPix], $10 \n\t"
"gslqc1 $f26, $f24, 0x0($11) \n\t"
"daddu %[iStride], %[iStride], %[pPix] \n\t"
"dmtc1 %[iAlpha], $f0 \n\t"
"punpcklhw $f28, $f0, $f0 \n\t"
"punpcklwd $f0, $f28, $f28 \n\t"
"mov.d $f2, $f0 \n\t"
"gssqc1 $f2, $f0, 640-320(%[tmp]) \n\t"
"dmtc1 %[iBeta], $f0 \n\t"
"gsldxc1 $f10, 0x0($15, $0) \n\t"
"punpcklhw $f28, $f0, $f0 \n\t"
"punpcklwd $f0, $f28, $f28 \n\t"
"punpckhbh $f30, $f10, $f8 \n\t"
"mov.d $f2, $f0 \n\t"
"punpcklbh $f28, $f10, $f8 \n\t"
"gssqc1 $f2, $f0, 640-512(%[tmp]) \n\t"
"gssqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
"mov.d $f0, $f4 \n\t"
"gssqc1 $f22, $f20, 704-272(%[tmp]) \n\t"
"gssqc1 $f6, $f4, 672-272(%[tmp]) \n\t"
"mov.d $f4, $f16 \n\t"
"punpckhbh $f22, $f20, $f8 \n\t"
"punpcklbh $f20, $f20, $f8 \n\t"
"punpckhbh $f6, $f4, $f8 \n\t"
"punpcklbh $f4, $f4, $f8 \n\t"
"psubh $f28, $f20, $f4 \n\t"
"psubh $f30, $f22, $f6 \n\t"
WELS_AbsH($f28, $f30, $f28, $f30, $f2, $f10)
"gssqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
"punpckhbh $f2, $f0, $f8 \n\t"
"punpcklbh $f0, $f0, $f8 \n\t"
"gssqc1 $f18, $f16, 688-272(%[tmp]) \n\t"
"gslqc1 $f18, $f16, 0x0($14) \n\t"
"gssqc1 $f2, $f0, 640-480(%[tmp]) \n\t"
"psubh $f28, $f4, $f0 \n\t"
"psubh $f30, $f6, $f2 \n\t"
"gslqc1 $f2, $f0, 640-512(%[tmp]) \n\t"
WELS_AbsH($f28, $f30, $f28, $f30, $f18, $f10)
"punpckhbh $f18, $f16, $f8 \n\t"
"punpcklbh $f16, $f16, $f8 \n\t"
"pcmpgth $f0, $f0, $f28 \n\t"
"pcmpgth $f2, $f2, $f30 \n\t"
"gssqc1 $f18, $f16, 640-384(%[tmp]) \n\t"
"psubh $f28, $f20, $f16 \n\t"
"psubh $f30, $f22, $f18 \n\t"
"gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t"
"gssqc1 $f26, $f24, 656-272(%[tmp]) \n\t"
"punpckhbh $f26, $f24, $f8 \n\t"
"punpcklbh $f24, $f24, $f8 \n\t"
WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
"gssqc1 $f26, $f24, 640-368(%[tmp]) \n\t"
"gssqc1 $f6, $f4, 640-144(%[tmp]) \n\t"
"gssqc1 $f22, $f20, 640-400(%[tmp]) \n\t"
"pcmpgth $f16, $f16, $f28 \n\t"
"pcmpgth $f18, $f18, $f30 \n\t"
"and $f0, $f0, $f16 \n\t"
"and $f2, $f2, $f18 \n\t"
"gslqc1 $f18, $f16, 640-320(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
"dli %[iAlpha], 0x2 \n\t"
"dli %[iBeta], 0x2 \n\t"
"pcmpgth $f16, $f16, $f28 \n\t"
"pcmpgth $f18, $f18, $f30 \n\t"
"and $f0, $f0, $f16 \n\t"
"and $f2, $f2, $f18 \n\t"
"dmtc1 %[iAlpha], $f16 \n\t"
"dmtc1 %[iBeta], $f10 \n\t"
"gssqc1 $f2, $f0, 736-272(%[tmp]) \n\t"
"gslqc1 $f2, $f0, 640-320(%[tmp]) \n\t"
"punpcklhw $f28, $f16, $f16 \n\t"
"psrah $f16, $f0, $f10 \n\t"
"psrah $f18, $f2, $f10 \n\t"
"punpcklwd $f28, $f28, $f28 \n\t"
"mov.d $f30, $f28 \n\t"
"gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
"paddh $f16, $f16, $f28 \n\t"
"paddh $f18, $f18, $f30 \n\t"
"gssqc1 $f18, $f16, 640-576(%[tmp]) \n\t"
"pcmpgth $f16, $f16, $f8 \n\t"
"pcmpgth $f18, $f18, $f10 \n\t"
"gssqc1 $f18, $f16, 640-560(%[tmp]) \n\t"
"gssqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
"gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t"
"psubh $f28, $f4, $f24 \n\t"
"psubh $f30, $f6, $f26 \n\t"
WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
"gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
"pcmpgth $f16, $f16, $f28 \n\t"
"pcmpgth $f18, $f18, $f30 \n\t"
"gslqc1 $f2, $f0, 640-416(%[tmp]) \n\t"
"and $f16, $f16, $f8 \n\t"
"and $f18, $f18, $f10 \n\t"
"gssqc1 $f18, $f16, 640-544(%[tmp]) \n\t"
"gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t"
"psubh $f28, $f20, $f0 \n\t"
"psubh $f30, $f22, $f2 \n\t"
WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
"gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
"pcmpgth $f16, $f16, $f28 \n\t"
"pcmpgth $f18, $f18, $f30 \n\t"
"and $f16, $f16, $f8 \n\t"
"and $f18, $f18, $f10 \n\t"
"gssqc1 $f18, $f16, 640-560(%[tmp]) \n\t"
"gslqc1 $f18, $f16, 640-544(%[tmp]) \n\t"
"xor $f8, $f8, $f8 \n\t"
"pandn $f16, $f16, $f24 \n\t"
"dli %[iAlpha], 0x4 \n\t"
"pandn $f18, $f18, $f26 \n\t"
"gssqc1 $f18, $f16, 640-16(%[tmp]) \n\t"
"dmtc1 %[iAlpha], $f16 \n\t"
"punpcklhw $f28, $f16, $f16 \n\t"
"dli %[iAlpha], 0x1 \n\t"
"punpckhbh $f18, $f12, $f8 \n\t"
"dmtc1 %[iAlpha], $f30 \n\t"
"punpcklbh $f16, $f12, $f8 \n\t"
"psllh $f16, $f16, $f30 \n\t"
"psllh $f18, $f18, $f30 \n\t"
"paddh $f16, $f16, $f24 \n\t"
"paddh $f18, $f18, $f26 \n\t"
"gslqc1 $f2, $f0, 640-480(%[tmp]) \n\t"
"paddh $f16, $f16, $f24 \n\t"
"paddh $f18, $f18, $f26 \n\t"
"paddh $f16, $f16, $f24 \n\t"
"paddh $f18, $f18, $f26 \n\t"
"paddh $f16, $f16, $f0 \n\t"
"paddh $f18, $f18, $f2 \n\t"
"gslqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
"punpcklwd $f28, $f28, $f28 \n\t"
"mov.d $f30, $f28 \n\t"
"paddh $f16, $f16, $f4 \n\t"
"paddh $f18, $f18, $f6 \n\t"
"gssqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
"paddh $f16, $f16, $f20 \n\t"
"paddh $f18, $f18, $f22 \n\t"
"paddh $f16, $f16, $f28 \n\t"
"paddh $f18, $f18, $f30 \n\t"
"gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
"gslqc1 $f2, $f0, 640-384(%[tmp]) \n\t"
"pandn $f24, $f24, $f28 \n\t"
"pandn $f26, $f26, $f30 \n\t"
"gssqc1 $f26, $f24, 640-80(%[tmp]) \n\t"
"gslqc1 $f26, $f24, 0x0($12) \n\t"
"dmtc1 %[iAlpha], $f10 \n\t"
"punpckhbh $f26, $f24, $f8 \n\t"
"punpcklbh $f24, $f24, $f8 \n\t"
"psllh $f24, $f24, $f10 \n\t"
"psllh $f26, $f26, $f10 \n\t"
"paddh $f24, $f24, $f28 \n\t"
"paddh $f26, $f26, $f30 \n\t"
"paddh $f24, $f24, $f28 \n\t"
"paddh $f26, $f26, $f30 \n\t"
"paddh $f24, $f24, $f28 \n\t"
"paddh $f26, $f26, $f30 \n\t"
"paddh $f24, $f24, $f0 \n\t"
"paddh $f26, $f26, $f2 \n\t"
"dli %[iAlpha], 0x3 \n\t"
"gslqc1 $f30, $f28, 640-480(%[tmp]) \n\t"
"gslqc1 $f2, $f0, 640-592(%[tmp]) \n\t"
"paddh $f24, $f24, $f20 \n\t"
"paddh $f26, $f26, $f22 \n\t"
"paddh $f24, $f24, $f4 \n\t"
"paddh $f26, $f26, $f6 \n\t"
"paddh $f24, $f24, $f0 \n\t"
"paddh $f26, $f26, $f2 \n\t"
"gslqc1 $f2, $f0, 640-560(%[tmp]) \n\t"
"dmtc1 %[iAlpha], $f10 \n\t"
"psrah $f24, $f24, $f10 \n\t"
"psrah $f26, $f26, $f10 \n\t"
"and $f24, $f24, $f0 \n\t"
"and $f26, $f26, $f2 \n\t"
"gssqc1 $f26, $f24, 640-112(%[tmp]) \n\t"
"gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
"pandn $f24, $f24, $f28 \n\t"
"pandn $f26, $f26, $f30 \n\t"
"gssqc1 $f26, $f24, 640-336(%[tmp]) \n\t"
"gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
"gssqc1 $f26, $f24, 640-528(%[tmp]) \n\t"
"gslqc1 $f26, $f24, 640-368(%[tmp]) \n\t"
"gslqc1 $f2, $f0, 640-544(%[tmp]) \n\t"
"dmtc1 %[iAlpha], $f10 \n\t"
"paddh $f24, $f24, $f28 \n\t"
"paddh $f26, $f26, $f30 \n\t"
"psrah $f16, $f16, $f10 \n\t"
"psrah $f18, $f18, $f10 \n\t"
"and $f16, $f16, $f0 \n\t"
"and $f18, $f18, $f2 \n\t"
"gslqc1 $f2, $f0, 640-624(%[tmp]) \n\t"
"paddh $f28, $f4, $f20 \n\t"
"paddh $f30, $f6, $f22 \n\t"
"paddh $f24, $f24, $f28 \n\t"
"paddh $f26, $f26, $f30 \n\t"
"paddh $f24, $f24, $f0 \n\t"
"paddh $f26, $f26, $f2 \n\t"
"gslqc1 $f30, $f28, 640-528(%[tmp]) \n\t"
"dli %[iAlpha], 0x2 \n\t"
"dmtc1 %[iAlpha], $f10 \n\t"
"paddh $f20, $f20, $f4 \n\t"
"paddh $f22, $f22, $f6 \n\t"
"psrah $f24, $f24, $f10 \n\t"
"psrah $f26, $f26, $f10 \n\t"
"and $f28, $f28, $f24 \n\t"
"and $f30, $f30, $f26 \n\t"
"gslqc1 $f26, $f24, 640-384(%[tmp]) \n\t"
"gssqc1 $f30, $f28, 640-64(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
"pandn $f28, $f28, $f24 \n\t"
"pandn $f30, $f30, $f26 \n\t"
"gssqc1 $f30, $f28, 640-304(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
"gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t"
"paddh $f28, $f28, $f24 \n\t"
"paddh $f30, $f30, $f26 \n\t"
"paddh $f28, $f28, $f20 \n\t"
"paddh $f30, $f30, $f22 \n\t"
"paddh $f28, $f28, $f8 \n\t"
"paddh $f30, $f30, $f10 \n\t"
"dmtc1 %[iAlpha], $f10 \n\t"
"gslqc1 $f22, $f20, 640-560(%[tmp]) \n\t"
"psrah $f28, $f28, $f10 \n\t"
"psrah $f30, $f30, $f10 \n\t"
"and $f20, $f20, $f28 \n\t"
"and $f22, $f22, $f30 \n\t"
"gssqc1 $f22, $f20, 640-32(%[tmp]) \n\t"
"gslqc1 $f22, $f20, 640-480(%[tmp]) \n\t"
"gslqc1 $f2, $f0, 640-592(%[tmp]) \n\t"
"gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t"
"paddh $f28, $f20, $f20 \n\t"
"paddh $f30, $f22, $f22 \n\t"
"paddh $f20, $f4, $f24 \n\t"
"paddh $f22, $f6, $f26 \n\t"
"paddh $f24, $f24, $f0 \n\t"
"paddh $f26, $f26, $f2 \n\t"
"paddh $f28, $f28, $f20 \n\t"
"paddh $f30, $f30, $f22 \n\t"
"paddh $f28, $f28, $f8 \n\t"
"paddh $f30, $f30, $f10 \n\t"
"dmtc1 %[iAlpha], $f10 \n\t"
"gslqc1 $f22, $f20, 640-544(%[tmp]) \n\t"
"psrah $f28, $f28, $f10 \n\t"
"psrah $f30, $f30, $f10 \n\t"
"dli %[iAlpha], 0x1 \n\t"
"pandn $f20, $f20, $f28 \n\t"
"pandn $f22, $f22, $f30 \n\t"
"gslqc1 $f30, $f28, 640-480(%[tmp]) \n\t"
"paddh $f28, $f28, $f4 \n\t"
"paddh $f30, $f30, $f6 \n\t"
"gslqc1 $f6, $f4, 640-400(%[tmp]) \n\t"
"paddh $f28, $f28, $f4 \n\t"
"paddh $f30, $f30, $f6 \n\t"
"gslqc1 $f6, $f4, 640-544(%[tmp]) \n\t"
"dmtc1 %[iAlpha], $f10 \n\t"
"gssqc1 $f22, $f20, 640-352(%[tmp]) \n\t"
"gslqc1 $f22, $f20, 640-368(%[tmp]) \n\t"
"psllh $f28, $f28, $f10 \n\t"
"psllh $f30, $f30, $f10 \n\t"
"dli %[iAlpha], 0x3 \n\t"
"paddh $f28, $f28, $f24 \n\t"
"paddh $f30, $f30, $f26 \n\t"
"paddh $f20, $f20, $f28 \n\t"
"paddh $f22, $f22, $f30 \n\t"
"dmtc1 %[iAlpha], $f10 \n\t"
"dli %[iAlpha], 0x2 \n\t"
"gslqc1 $f30, $f28, 640-400(%[tmp]) \n\t"
"psrah $f20, $f20, $f10 \n\t"
"psrah $f22, $f22, $f10 \n\t"
"and $f4, $f4, $f20 \n\t"
"and $f6, $f6, $f22 \n\t"
"gslqc1 $f22, $f20, 640-480(%[tmp]) \n\t"
"gssqc1 $f6, $f4, 640-96(%[tmp]) \n\t"
"gslqc1 $f6, $f4, 640-384(%[tmp]) \n\t"
"gslqc1 $f10, $f8, 640-400(%[tmp]) \n\t"
"paddh $f24, $f4, $f4 \n\t"
"paddh $f26, $f6, $f6 \n\t"
"paddh $f4, $f4, $f8 \n\t"
"paddh $f6, $f6, $f10 \n\t"
"gslqc1 $f10, $f8, 640-144(%[tmp]) \n\t"
"paddh $f28, $f28, $f20 \n\t"
"paddh $f30, $f30, $f22 \n\t"
"paddh $f4, $f4, $f8 \n\t"
"paddh $f6, $f6, $f10 \n\t"
"gslqc1 $f10, $f8, 640-592(%[tmp]) \n\t"
"paddh $f24, $f24, $f28 \n\t"
"paddh $f26, $f26, $f30 \n\t"
"paddh $f20, $f20, $f8 \n\t"
"paddh $f22, $f22, $f10 \n\t"
"gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t"
"paddh $f24, $f24, $f8 \n\t"
"dmtc1 %[iAlpha], $f8 \n\t"
"paddh $f26, $f26, $f10 \n\t"
"dli %[iAlpha], 0x1 \n\t"
"gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
"dmtc1 %[iAlpha], $f10 \n\t"
"psrah $f24, $f24, $f8 \n\t"
"psrah $f26, $f26, $f8 \n\t"
"psllh $f4, $f4, $f10 \n\t"
"psllh $f6, $f6, $f10 \n\t"
"paddh $f4, $f4, $f20 \n\t"
"paddh $f6, $f6, $f22 \n\t"
"dli %[iAlpha], 0x3 \n\t"
"gslqc1 $f22, $f20, 656-272(%[tmp]) \n\t"
"pandn $f28, $f28, $f24 \n\t"
"pandn $f30, $f30, $f26 \n\t"
"gslqc1 $f26, $f24, 640-416(%[tmp]) \n\t"
"dmtc1 %[iAlpha], $f10 \n\t"
"paddh $f24, $f24, $f4 \n\t"
"paddh $f26, $f26, $f6 \n\t"
"gslqc1 $f6, $f4, 640-560(%[tmp]) \n\t"
"psrah $f24, $f24, $f10 \n\t"
"psrah $f26, $f26, $f10 \n\t"
"and $f4, $f4, $f24 \n\t"
"and $f6, $f6, $f26 \n\t"
"xor $f8, $f8, $f8 \n\t"
"gslqc1 $f26, $f24, 704-272(%[tmp]) \n\t"
"gssqc1 $f6, $f4, 640-128(%[tmp]) \n\t"
"gslqc1 $f6, $f4, 672-272(%[tmp]) \n\t"
"punpcklbh $f4, $f6, $f8 \n\t"
"punpckhbh $f6, $f6, $f8 \n\t"
"gssqc1 $f6, $f4, 640-448(%[tmp]) \n\t"
"gslqc1 $f6, $f4, 688-272(%[tmp]) \n\t"
"punpcklbh $f4, $f6, $f8 \n\t"
"punpckhbh $f6, $f6, $f8 \n\t"
"punpcklbh $f24, $f26, $f8 \n\t"
"punpckhbh $f26, $f26, $f8 \n\t"
"gssqc1 $f30, $f28, 640-288(%[tmp]) \n\t"
"punpcklbh $f20, $f22, $f8 \n\t"
"punpckhbh $f22, $f22, $f8 \n\t"
"gslqc1 $f30, $f28, 0x0($14) \n\t"
"gssqc1 $f6, $f4, 640-496(%[tmp]) \n\t"
"gssqc1 $f26, $f24, 640-432(%[tmp]) \n\t"
"gsldxc1 $f0, 0x8($15, $0) \n\t"
"punpcklbh $f28, $f30, $f8 \n\t"
"punpckhbh $f30, $f30, $f8 \n\t"
"gssqc1 $f30, $f28, 640-464(%[tmp]) \n\t"
"punpcklbh $f28, $f0, $f8 \n\t"
"punpckhbh $f30, $f0, $f8 \n\t"
"gslqc1 $f10, $f8, 640-464(%[tmp]) \n\t"
"gssqc1 $f30, $f28, 640-528(%[tmp]) \n\t"
"psubh $f28, $f24, $f4 \n\t"
"psubh $f30, $f26, $f6 \n\t"
"psubh $f24, $f24, $f8 \n\t"
"psubh $f26, $f26, $f10 \n\t"
WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
"gslqc1 $f10, $f8, 640-16(%[tmp]) \n\t"
"gssqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
"or $f16, $f16, $f8 \n\t"
"or $f18, $f18, $f10 \n\t"
WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
"gslqc1 $f30, $f28, 640-448(%[tmp]) \n\t"
"psubh $f28, $f4, $f28 \n\t"
"psubh $f30, $f6, $f30 \n\t"
"gslqc1 $f2, $f0, 640-512(%[tmp]) \n\t"
WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
"pcmpgth $f4, $f0, $f28 \n\t"
"pcmpgth $f6, $f2, $f30 \n\t"
"pcmpgth $f28, $f0, $f24 \n\t"
"pcmpgth $f30, $f2, $f26 \n\t"
"gslqc1 $f26, $f24, 640-320(%[tmp]) \n\t"
"and $f4, $f4, $f28 \n\t"
"and $f6, $f6, $f30 \n\t"
"gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
"pcmpgth $f24, $f24, $f28 \n\t"
"pcmpgth $f26, $f26, $f30 \n\t"
"and $f4, $f4, $f24 \n\t"
"and $f6, $f6, $f26 \n\t"
"gslqc1 $f26, $f24, 640-576(%[tmp]) \n\t"
"pcmpgth $f24, $f24, $f28 \n\t"
"pcmpgth $f26, $f26, $f30 \n\t"
"xor $f8, $f8, $f8 \n\t"
"gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
"punpcklbh $f12, $f14, $f8 \n\t"
"punpckhbh $f14, $f14, $f8 \n\t"
"gssqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
"gslqc1 $f26, $f24, 640-512(%[tmp]) \n\t"
"psubh $f28, $f28, $f20 \n\t"
"psubh $f30, $f30, $f22 \n\t"
WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
"pcmpgth $f24, $f24, $f28 \n\t"
"pcmpgth $f26, $f26, $f30 \n\t"
"dli %[iAlpha], 0x1 \n\t"
"gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
"and $f24, $f24, $f8 \n\t"
"and $f26, $f26, $f10 \n\t"
"gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t"
"gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t"
"psubh $f28, $f28, $f8 \n\t"
"psubh $f30, $f30, $f10 \n\t"
"dmtc1 %[iAlpha], $f10 \n\t"
"psllh $f12, $f12, $f10 \n\t"
"psllh $f14, $f14, $f10 \n\t"
"gssqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
"gslqc1 $f26, $f24, 640-512(%[tmp]) \n\t"
"gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t"
"paddh $f12, $f12, $f20 \n\t"
"paddh $f14, $f14, $f22 \n\t"
"paddh $f12, $f12, $f20 \n\t"
"paddh $f14, $f14, $f22 \n\t"
"paddh $f12, $f12, $f20 \n\t"
"paddh $f14, $f14, $f22 \n\t"
"paddh $f12, $f12, $f8 \n\t"
"paddh $f14, $f14, $f10 \n\t"
"gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t"
"gslqc1 $f2, $f0, 640-560(%[tmp]) \n\t"
"paddh $f12, $f12, $f8 \n\t"
"paddh $f14, $f14, $f10 \n\t"
WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
"pcmpgth $f24, $f24, $f28 \n\t"
"pcmpgth $f26, $f26, $f30 \n\t"
"and $f24, $f24, $f0 \n\t"
"and $f26, $f26, $f2 \n\t"
"gssqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
"gslqc1 $f10, $f8, 640-544(%[tmp]) \n\t"
"gslqc1 $f2, $f0, 736-272(%[tmp]) \n\t"
"dli %[iAlpha], 0x3 \n\t"
"gslqc1 $f30, $f28, 640-368(%[tmp]) \n\t"
"and $f24, $f0, $f16 \n\t"
"and $f26, $f2, $f18 \n\t"
"pandn $f16, $f0, $f28 \n\t"
"pandn $f18, $f2, $f30 \n\t"
"or $f24, $f24, $f16 \n\t"
"or $f26, $f26, $f18 \n\t"
"gslqc1 $f18, $f16, 640-432(%[tmp]) \n\t"
"paddh $f12, $f12, $f16 \n\t"
"paddh $f14, $f14, $f18 \n\t"
"gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
"paddh $f12, $f12, $f28 \n\t"
"paddh $f14, $f14, $f30 \n\t"
"dmtc1 %[iAlpha], $f28 \n\t"
"psrah $f12, $f12, $f28 \n\t"
"psrah $f14, $f14, $f28 \n\t"
"and $f12, $f12, $f8 \n\t"
"and $f14, $f14, $f10 \n\t"
"pandn $f8, $f8, $f20 \n\t"
"pandn $f10, $f10, $f22 \n\t"
"or $f12, $f12, $f8 \n\t"
"or $f14, $f14, $f10 \n\t"
"and $f28, $f4, $f12 \n\t"
"and $f30, $f6, $f14 \n\t"
"gslqc1 $f14, $f12, 640-64(%[tmp]) \n\t"
"gslqc1 $f10, $f8, 640-336(%[tmp]) \n\t"
"or $f12, $f12, $f8 \n\t"
"or $f14, $f14, $f10 \n\t"
"pandn $f8, $f4, $f20 \n\t"
"pandn $f10, $f6, $f22 \n\t"
"or $f28, $f28, $f8 \n\t"
"or $f30, $f30, $f10 \n\t"
"dli %[iAlpha], 0x2 \n\t"
"and $f8, $f0, $f12 \n\t"
"and $f10, $f2, $f14 \n\t"
"gslqc1 $f14, $f12, 640-480(%[tmp]) \n\t"
"pandn $f12, $f0, $f12 \n\t"
"pandn $f14, $f2, $f14 \n\t"
"or $f8, $f8, $f12 \n\t"
"or $f10, $f10, $f14 \n\t"
"packushb $f24, $f24, $f26 \n\t"
"packushb $f26, $f28, $f30 \n\t"
"gssqc1 $f10, $f8, 640-336(%[tmp]) \n\t"
"gssqc1 $f26, $f24, 656-272(%[tmp]) \n\t"
"gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
"gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t"
"paddh $f8, $f20, $f8 \n\t"
"paddh $f10, $f22, $f10 \n\t"
"gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
"paddh $f28, $f28, $f16 \n\t"
"paddh $f30, $f30, $f18 \n\t"
"paddh $f8, $f8, $f28 \n\t"
"paddh $f10, $f10, $f30 \n\t"
"gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
"paddh $f8, $f8, $f28 \n\t"
"paddh $f10, $f10, $f30 \n\t"
"dmtc1 %[iAlpha], $f28 \n\t"
"psrah $f8, $f8, $f28 \n\t"
"psrah $f10, $f10, $f28 \n\t"
"dli %[iAlpha], 0x1 \n\t"
"gslqc1 $f30, $f28, 640-544(%[tmp]) \n\t"
"and $f24, $f24, $f8 \n\t"
"and $f26, $f26, $f10 \n\t"
"gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t"
"pandn $f28, $f28, $f8 \n\t"
"pandn $f30, $f30, $f10 \n\t"
"or $f24, $f24, $f28 \n\t"
"or $f26, $f26, $f30 \n\t"
"and $f12, $f4, $f24 \n\t"
"and $f14, $f6, $f26 \n\t"
"pandn $f24, $f4, $f8 \n\t"
"pandn $f26, $f6, $f10 \n\t"
"gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
"paddh $f8, $f8, $f28 \n\t"
"paddh $f10, $f10, $f30 \n\t"
"paddh $f8, $f8, $f16 \n\t"
"paddh $f10, $f10, $f18 \n\t"
"or $f12, $f12, $f24 \n\t"
"or $f14, $f14, $f26 \n\t"
"gslqc1 $f26, $f24, 640-336(%[tmp]) \n\t"
"dmtc1 %[iAlpha], $f28 \n\t"
"packushb $f24, $f24, $f26 \n\t"
"packushb $f26, $f12, $f14 \n\t"
"psllh $f8, $f8, $f28 \n\t"
"psllh $f10, $f10, $f28 \n\t"
"gssqc1 $f26, $f24, 672-272(%[tmp]) \n\t"
"gslqc1 $f26, $f24, 640-96(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 640-352(%[tmp]) \n\t"
"or $f24, $f24, $f28 \n\t"
"or $f26, $f26, $f30 \n\t"
"dli %[iAlpha], 0x3 \n\t"
"and $f12, $f0, $f24 \n\t"
"and $f14, $f2, $f26 \n\t"
"gslqc1 $f26, $f24, 640-144(%[tmp]) \n\t"
"pandn $f24, $f0, $f24 \n\t"
"pandn $f26, $f2, $f26 \n\t"
"or $f12, $f12, $f24 \n\t"
"or $f14, $f14, $f26 \n\t"
"gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
"gssqc1 $f14, $f12, 640-352(%[tmp]) \n\t"
"gslqc1 $f14, $f12, 640-464(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
"paddh $f12, $f12, $f28 \n\t"
"paddh $f14, $f14, $f30 \n\t"
"paddh $f8, $f8, $f12 \n\t"
"paddh $f10, $f10, $f14 \n\t"
"gslqc1 $f14, $f12, 640-448(%[tmp]) \n\t"
"paddh $f20, $f20, $f8 \n\t"
"paddh $f22, $f22, $f10 \n\t"
"dmtc1 %[iAlpha], $f28 \n\t"
"gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t"
"psrah $f20, $f20, $f28 \n\t"
"psrah $f22, $f22, $f28 \n\t"
"and $f24, $f24, $f20 \n\t"
"and $f26, $f26, $f22 \n\t"
"gslqc1 $f22, $f20, 640-464(%[tmp]) \n\t"
"paddh $f8, $f8, $f20 \n\t"
"paddh $f10, $f10, $f22 \n\t"
"gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t"
"dli %[iAlpha], 0x2 \n\t"
"paddh $f20, $f20, $f28 \n\t"
"paddh $f22, $f22, $f30 \n\t"
"paddh $f16, $f12, $f12 \n\t"
"paddh $f18, $f14, $f14 \n\t"
"paddh $f16, $f16, $f8 \n\t"
"paddh $f18, $f18, $f10 \n\t"
"gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
"paddh $f16, $f16, $f28 \n\t"
"paddh $f18, $f18, $f30 \n\t"
"gslqc1 $f10, $f8, 640-544(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
"paddh $f12, $f12, $f28 \n\t"
"paddh $f14, $f14, $f30 \n\t"
"dmtc1 %[iAlpha], $f28 \n\t"
"psrah $f16, $f16, $f28 \n\t"
"psrah $f18, $f18, $f28 \n\t"
"pandn $f8, $f8, $f16 \n\t"
"pandn $f10, $f10, $f18 \n\t"
"or $f24, $f24, $f8 \n\t"
"or $f26, $f26, $f10 \n\t"
"and $f28, $f4, $f24 \n\t"
"and $f30, $f6, $f26 \n\t"
"gslqc1 $f26, $f24, 640-496(%[tmp]) \n\t"
"pandn $f8, $f4, $f24 \n\t"
"pandn $f10, $f6, $f26 \n\t"
"or $f28, $f28, $f8 \n\t"
"or $f30, $f30, $f10 \n\t"
"gslqc1 $f10, $f8, 640-352(%[tmp]) \n\t"
"packushb $f8, $f8, $f10 \n\t"
"packushb $f10, $f28, $f30 \n\t"
"gssqc1 $f10, $f8, 688-272(%[tmp]) \n\t"
"gslqc1 $f10, $f8, 640-128(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 640-288(%[tmp]) \n\t"
"or $f8, $f8, $f28 \n\t"
"or $f10, $f10, $f30 \n\t"
"dli %[iAlpha], 0x1 \n\t"
"and $f16, $f0, $f8 \n\t"
"and $f18, $f2, $f10 \n\t"
"paddh $f20, $f20, $f24 \n\t"
"paddh $f22, $f22, $f26 \n\t"
"gslqc1 $f30, $f28, 640-400(%[tmp]) \n\t"
"pandn $f8, $f0, $f28 \n\t"
"pandn $f10, $f2, $f30 \n\t"
"or $f16, $f16, $f8 \n\t"
"or $f18, $f18, $f10 \n\t"
"dmtc1 %[iAlpha], $f28 \n\t"
"gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t"
"dli %[iAlpha], 0x3 \n\t"
"psllh $f20, $f20, $f28 \n\t"
"psllh $f22, $f22, $f28 \n\t"
"paddh $f20, $f20, $f12 \n\t"
"paddh $f22, $f22, $f14 \n\t"
"dmtc1 %[iAlpha], $f28 \n\t"
"gslqc1 $f14, $f12, 640-560(%[tmp]) \n\t"
"paddh $f8, $f8, $f20 \n\t"
"paddh $f10, $f10, $f22 \n\t"
"psrah $f8, $f8, $f28 \n\t"
"psrah $f10, $f10, $f28 \n\t"
"gssqc1 $f18, $f16, 640-288(%[tmp]) \n\t"
"gslqc1 $f18, $f16, 640-560(%[tmp]) \n\t"
"and $f16, $f16, $f8 \n\t"
"and $f18, $f18, $f10 \n\t"
"gslqc1 $f10, $f8, 640-464(%[tmp]) \n\t"
"paddh $f20, $f8, $f8 \n\t"
"paddh $f22, $f10, $f10 \n\t"
"gslqc1 $f10, $f8, 640-432(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 640-448(%[tmp]) \n\t"
"paddh $f8, $f8, $f28 \n\t"
"paddh $f10, $f10, $f30 \n\t"
"dli %[iAlpha], 0x2 \n\t"
"paddh $f20, $f20, $f8 \n\t"
"paddh $f22, $f22, $f10 \n\t"
"gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
"paddh $f20, $f20, $f28 \n\t"
"paddh $f22, $f22, $f30 \n\t"
"dmtc1 %[iAlpha], $f28 \n\t"
"gslqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
"psrah $f20, $f20, $f28 \n\t"
"psrah $f22, $f22, $f28 \n\t"
"pandn $f12, $f12, $f20 \n\t"
"pandn $f14, $f14, $f22 \n\t"
"or $f16, $f16, $f12 \n\t"
"or $f18, $f18, $f14 \n\t"
"gslqc1 $f14, $f12, 640-32(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 640-304(%[tmp]) \n\t"
"or $f12, $f12, $f28 \n\t"
"or $f14, $f14, $f30 \n\t"
"and $f28, $f4, $f16 \n\t"
"and $f30, $f6, $f18 \n\t"
"gslqc1 $f18, $f16, 640-432(%[tmp]) \n\t"
"gslqc1 $f22, $f20, 640-464(%[tmp]) \n\t"
"pandn $f8, $f4, $f16 \n\t"
"pandn $f10, $f6, $f18 \n\t"
"or $f28, $f28, $f8 \n\t"
"or $f30, $f30, $f10 \n\t"
"gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t"
"paddh $f16, $f16, $f8 \n\t"
"paddh $f18, $f18, $f10 \n\t"
"gslqc1 $f10, $f8, 640-288(%[tmp]) \n\t"
"packushb $f8, $f8, $f10 \n\t"
"packushb $f10, $f28, $f30 \n\t"
"dli %[iAlpha], 0x2 \n\t"
"gssqc1 $f10, $f8, 704-272(%[tmp]) \n\t"
"and $f8, $f0, $f12 \n\t"
"and $f10, $f2, $f14 \n\t"
"gslqc1 $f30, $f28, 640-384(%[tmp]) \n\t"
"pandn $f12, $f0, $f28 \n\t"
"pandn $f14, $f2, $f30 \n\t"
"or $f8, $f8, $f12 \n\t"
"or $f10, $f10, $f14 \n\t"
"gssqc1 $f10, $f8, 640-304(%[tmp]) \n\t"
"gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 640-464(%[tmp]) \n\t"
"paddh $f12, $f8, $f28 \n\t"
"paddh $f14, $f10, $f30 \n\t"
"paddh $f12, $f12, $f16 \n\t"
"paddh $f14, $f14, $f18 \n\t"
"gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
"paddh $f12, $f12, $f28 \n\t"
"paddh $f14, $f14, $f30 \n\t"
"dmtc1 %[iAlpha], $f28 \n\t"
"psrah $f12, $f12, $f28 \n\t"
"psrah $f14, $f14, $f28 \n\t"
"and $f24, $f24, $f12 \n\t"
"and $f26, $f26, $f14 \n\t"
"gslqc1 $f14, $f12, 640-560(%[tmp]) \n\t"
"pandn $f16, $f12, $f20 \n\t"
"pandn $f18, $f14, $f22 \n\t"
"or $f24, $f24, $f16 \n\t"
"or $f26, $f26, $f18 \n\t"
"and $f28, $f4, $f24 \n\t"
"and $f30, $f6, $f26 \n\t"
"gslqc1 $f26, $f24, 640-304(%[tmp]) \n\t"
"pandn $f16, $f4, $f20 \n\t"
"pandn $f18, $f6, $f22 \n\t"
"or $f28, $f28, $f16 \n\t"
"or $f30, $f30, $f18 \n\t"
"dli %[iAlpha], 0x1 \n\t"
"packushb $f24, $f24, $f26 \n\t"
"packushb $f26, $f28, $f30 \n\t"
"gslqc1 $f30, $f28, 640-112(%[tmp]) \n\t"
"gslqc1 $f18, $f16, 640-80(%[tmp]) \n\t"
"or $f28, $f28, $f16 \n\t"
"or $f30, $f30, $f18 \n\t"
"and $f16, $f0, $f28 \n\t"
"and $f18, $f2, $f30 \n\t"
"gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
"pandn $f0, $f0, $f28 \n\t"
"pandn $f2, $f2, $f30 \n\t"
"or $f16, $f16, $f0 \n\t"
"or $f18, $f18, $f2 \n\t"
"xor $f28, $f28, $f28 \n\t"
"xor $f30, $f30, $f30 \n\t"
"gslqc1 $f2, $f0, 0x0($12) \n\t"
"dmtc1 %[iAlpha], $f28 \n\t"
"punpcklbh $f0, $f2, $f30 \n\t"
"punpckhbh $f2, $f2, $f30 \n\t"
"psllh $f0, $f0, $f28 \n\t"
"psllh $f2, $f2, $f28 \n\t"
"paddh $f0, $f0, $f8 \n\t"
"paddh $f2, $f2, $f10 \n\t"
"paddh $f0, $f0, $f8 \n\t"
"paddh $f2, $f2, $f10 \n\t"
"paddh $f0, $f0, $f8 \n\t"
"paddh $f2, $f2, $f10 \n\t"
"paddh $f0, $f0, $f20 \n\t"
"paddh $f2, $f2, $f22 \n\t"
"dli %[iAlpha], 0x3 \n\t"
"gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t"
"paddh $f0, $f0, $f28 \n\t"
"paddh $f2, $f2, $f30 \n\t"
"gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
"paddh $f0, $f0, $f28 \n\t"
"paddh $f2, $f2, $f30 \n\t"
"gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
"paddh $f0, $f0, $f28 \n\t"
"paddh $f2, $f2, $f30 \n\t"
"dmtc1 %[iAlpha], $f28 \n\t"
"psrah $f0, $f0, $f28 \n\t"
"psrah $f2, $f2, $f28 \n\t"
"and $f0, $f0, $f12 \n\t"
"and $f2, $f2, $f14 \n\t"
"pandn $f12, $f12, $f8 \n\t"
"pandn $f14, $f14, $f10 \n\t"
"or $f0, $f0, $f12 \n\t"
"or $f2, $f2, $f14 \n\t"
"and $f28, $f4, $f0 \n\t"
"and $f30, $f6, $f2 \n\t"
"gslqc1 $f2, $f0, 656-272(%[tmp]) \n\t"
"gssqc1 $f2, $f0, 0x0($11) \n\t"
"gslqc1 $f2, $f0, 672-272(%[tmp]) \n\t"
"gssqc1 $f2, $f0, 0x0($8) \n\t"
"gslqc1 $f2, $f0, 688-272(%[tmp]) \n\t"
"gssqc1 $f2, $f0, 0x0($9) \n\t"
"gslqc1 $f2, $f0, 704-272(%[tmp]) \n\t"
"pandn $f4, $f4, $f8 \n\t"
"pandn $f6, $f6, $f10 \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPix]) \n\t"
"or $f28, $f28, $f4 \n\t"
"or $f30, $f30, $f6 \n\t"
"packushb $f16, $f16, $f18 \n\t"
"packushb $f18, $f28, $f30 \n\t"
"gssqc1 $f26, $f24, 0x0($13) \n\t"
"gssqc1 $f18, $f16, 0x0(%[iStride]) \n\t"
: [pPix]"+&r"((unsigned char *)pPix)
: [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
[iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
: "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0",
"$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
"$f22", "$f24", "$f26", "$f28", "$f30"
);
RECOVER_REG;
}
/*!
 * \brief Vertical chroma deblocking filter, normal (bS < 4, tc-clipped) path,
 *        Loongson-3A MMI implementation.
 *
 * Filters one horizontal edge in BOTH chroma planes at once: the rows
 * p1/p0 above the edge and q0/q1 below it are loaded (8 bytes each) from
 * pPixCb and pPixCr, packed side by side into 128-bit register pairs,
 * widened to 16-bit lanes, filtered with a tc-clipped delta, and the
 * updated p0/q0 rows are stored back. Cb and Cr share iStride, iAlpha,
 * iBeta and pTC.
 *
 * \param pPixCb   pointer to the Cb sample on the q0 row of the edge
 * \param pPixCr   pointer to the Cr sample on the q0 row of the edge
 * \param iStride  chroma plane stride in bytes
 * \param iAlpha   alpha threshold (compared against |p0 - q0|)
 * \param iBeta    beta threshold (compared against |p1 - p0|, |q1 - q0|)
 * \param pTC      four per-partition tc clipping values (int8), bytes 0..3
 *
 * NOTE(review): the body is hand-scheduled inline assembly. The phase
 * comments below describe the apparent intent; verify against the portable
 * C reference (DeblockChromaLt4V_c) rather than treating them as spec.
 */
void DeblockChromaLt4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
/* Aligned scratch area: intermediate 128-bit vectors are spilled here
 * (gssqc1) and reloaded (gslqc1) because only 16 even f-registers are
 * usable as 128-bit pairs. */
unsigned char tmp[256] __attribute__((aligned(32)));
BACKUP_REG;
__asm__ volatile (
".set arch=loongson3a \n\t"
/* Phase 1: load the four tc bytes pTC[0..3] into GPRs and interleave
 * them (punpcklhw chains below) into 16-bit lanes so each pixel lane
 * gets its partition's tc value. */
"lb $8, 0x2(%[pTC]) \n\t"
"lb $9, 0x3(%[pTC]) \n\t"
"move $11, $8 \n\t"
"lb $8, 0x1(%[pTC]) \n\t"
"lb %[pTC], 0x0(%[pTC]) \n\t"
"move $12, %[pTC] \n\t"
"and %[pTC], $9, 0xFFFF \n\t"
"dmtc1 %[pTC], $f4 \n\t"
"and %[pTC], $9, 0xFFFF \n\t"
"dmtc1 %[pTC], $f8 \n\t"
"move %[pTC], $11 \n\t"
"and $9, %[pTC], 0xFFFF \n\t"
"and %[pTC], %[pTC], 0xFFFF \n\t"
"dmtc1 %[pTC], $f16 \n\t"
"and %[pTC], $8, 0xFFFF \n\t"
"dmtc1 %[pTC], $f20 \n\t"
"dmtc1 $9, $f12 \n\t"
"and %[pTC], $8, 0xFFFF \n\t"
"dmtc1 %[pTC], $f24 \n\t"
"move %[pTC], $12 \n\t"
"and $9, %[pTC], 0xFFFF \n\t"
"and %[pTC], %[pTC], 0xFFFF \n\t"
"punpcklhw $f24, $f24, $f8 \n\t"
/* Zero vector, kept at tmp+0x40 for the byte->halfword unpacks below. */
"xor $f0, $f0, $f0 \n\t"
"xor $f2, $f2, $f2 \n\t"
"gssqc1 $f2, $f0, 0x40(%[tmp]) \n\t"
"dmtc1 $9, $f28 \n\t"
"dmtc1 %[pTC], $f0 \n\t"
/* Phase 2: compute row addresses (p1 = pPix - 2*stride, p0 = pPix -
 * stride, q0 = pPix, q1 = pPix + stride) and load 8 bytes per row for
 * each plane, interleaved with finishing the tc lane shuffle. */
"daddu %[pTC], %[iStride], %[iStride] \n\t"
"dsubu $9, %[pPixCb], %[pTC] \n\t"
"punpcklhw $f20, $f20, $f4 \n\t"
"gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
"punpcklhw $f0, $f0, $f16 \n\t"
"gsldxc1 $f16, 0x0(%[iStride], %[pPixCr]) \n\t"
"punpcklhw $f28, $f28, $f12 \n\t"
"gsldxc1 $f12, 0x0(%[pPixCb], $0) \n\t"
"punpcklhw $f0, $f0, $f28 \n\t"
"gsldxc1 $f24, 0x0($9, $0) \n\t"
"punpcklhw $f28, $f28, $f20 \n\t"
"punpckhhw $f2, $f0, $f28 \n\t"
"punpcklhw $f0, $f0, $f28 \n\t"
"dsubu $9, %[pPixCr], %[pTC] \n\t"
/* tmp+0x60 := 0 - tc (the negative clip bound, i.e. -tc per lane). */
"psubh $f8, $f4, $f0 \n\t"
"psubh $f10, $f6, $f2 \n\t"
"gssqc1 $f10, $f8, 0x60(%[tmp]) \n\t"
"gsldxc1 $f8, 0x0($9, $0) \n\t"
"mov.d $f26, $f8 \n\t"
"dsubu %[pTC], %[pPixCb], %[iStride] \n\t"
"gsldxc1 $f28, 0x0(%[pTC], $0) \n\t"
"dsubu $9, %[pPixCr], %[iStride] \n\t"
"gsldxc1 $f8, 0x0($9, $0) \n\t"
"mov.d $f30, $f8 \n\t"
"gsldxc1 $f8, 0x0(%[pPixCr], $0) \n\t"
"mov.d $f14, $f8 \n\t"
"gsldxc1 $f8, 0x0(%[iStride], %[pPixCb]) \n\t"
"mov.d $f10, $f16 \n\t"
"gssqc1 $f10, $f8, 0xE0(%[tmp]) \n\t"
/* Phase 3: broadcast iAlpha (-> $f16/$f18) and iBeta (-> tmp+0x50)
 * into all 16-bit lanes for the threshold compares. */
"dmtc1 %[iAlpha], $f8 \n\t"
"punpcklhw $f16, $f8, $f8 \n\t"
"dmtc1 %[iBeta], $f8 \n\t"
"punpcklhw $f20, $f8, $f8 \n\t"
"punpcklwd $f8, $f20, $f20 \n\t"
"mov.d $f10, $f8 \n\t"
"gssqc1 $f10, $f8, 0x50(%[tmp]) \n\t"
/* Phase 4: unpack the loaded 8-bit rows against zero into 16-bit lanes
 * and spill each widened row into its tmp slot (0x30 = p1, 0x70/0x80 =
 * q0/p1 halves, 0x90 = q1, 0xa0 = p0 ... see loads below). */
"punpckhbh $f10, $f24, $f4 \n\t"
"punpcklbh $f8, $f24, $f4 \n\t"
"gssqc1 $f14, $f12, 0xd0(%[tmp]) \n\t"
"punpcklwd $f16, $f16, $f16 \n\t"
"mov.d $f18, $f16 \n\t"
"gssqc1 $f10, $f8, 0x30(%[tmp]) \n\t"
"punpcklbh $f24, $f26, $f6 \n\t"
"punpckhbh $f26, $f26, $f6 \n\t"
"gssqc1 $f26, $f24, 0x80(%[tmp]) \n\t"
"gslqc1 $f26, $f24, 0xd0(%[tmp]) \n\t"
"punpcklbh $f24, $f26, $f6 \n\t"
"punpckhbh $f26, $f26, $f6 \n\t"
"gssqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
"gslqc1 $f26, $f24, 0xe0(%[tmp]) \n\t"
"punpcklbh $f24, $f26, $f6 \n\t"
"punpckhbh $f26, $f26, $f6 \n\t"
"gssqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
"gslqc1 $f22, $f20, 0xe0(%[tmp]) \n\t"
"mov.d $f8, $f28 \n\t"
"mov.d $f10, $f30 \n\t"
"punpcklbh $f28, $f30, $f6 \n\t"
"punpckhbh $f30, $f30, $f6 \n\t"
"punpckhbh $f22, $f20, $f4 \n\t"
"punpcklbh $f20, $f20, $f4 \n\t"
"gssqc1 $f30, $f28, 0xa0(%[tmp]) \n\t"
"punpckhbh $f14, $f12, $f4 \n\t"
"punpcklbh $f12, $f12, $f4 \n\t"
/* Broadcast the rounding constant 4 into all lanes (tmp+0x20); used
 * as the +4 term before the >>3 in the delta computation. */
"dli %[iBeta], 0x4 \n\t"
"punpckhbh $f10, $f8, $f4 \n\t"
"punpcklbh $f8, $f8, $f4 \n\t"
"dmtc1 %[iBeta], $f24 \n\t"
"punpcklhw $f28, $f24, $f24 \n\t"
"punpcklwd $f24, $f28, $f28 \n\t"
"mov.d $f26, $f24 \n\t"
"gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t"
"gssqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
"psubh $f28, $f28, $f20 \n\t"
"psubh $f30, $f30, $f22 \n\t"
/* tmp+0x40 := (0 > tc) mask — lanes whose tc is negative are excluded
 * from filtering further down. */
"pcmpgth $f24, $f0, $f4 \n\t"
"pcmpgth $f26, $f2, $f6 \n\t"
"gslqc1 $f6, $f4, 0x60(%[tmp]) \n\t"
"gssqc1 $f26, $f24, 0x40(%[tmp]) \n\t"
/* Phase 5 (Cb half): delta = clip(((q0 - p0) << 2 + (p1 - q1) + 4) >> 3,
 * -tc, tc) — apparent intent of the psllh/paddh/psrah + pmaxsh/pminsh
 * sequence; NOTE(review): confirm against DeblockChromaLt4V_c. */
"psubh $f24, $f12, $f8 \n\t"
"psubh $f26, $f14, $f10 \n\t"
"dmfc1 %[iAlpha], $f12 \n\t"
"dmfc1 %[iBeta], $f14 \n\t"
"dli $10, 0x2 \n\t"
"dmtc1 $10, $f12 \n\t"
"dli $10, 0x3 \n\t"
"dmtc1 $10, $f14 \n\t"
"psllh $f24, $f24, $f12 \n\t"
"psllh $f26, $f26, $f12 \n\t"
"paddh $f24, $f24, $f28 \n\t"
"paddh $f26, $f26, $f30 \n\t"
"gslqc1 $f30, $f28, 0x20(%[tmp]) \n\t"
"paddh $f24, $f24, $f28 \n\t"
"paddh $f26, $f26, $f30 \n\t"
"gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
"psrah $f24, $f24, $f14 \n\t"
"psrah $f26, $f26, $f14 \n\t"
"dmtc1 %[iAlpha], $f12 \n\t"
"dmtc1 %[iBeta], $f14 \n\t"
"pmaxsh $f4, $f4, $f24 \n\t"
"pmaxsh $f6, $f6, $f26 \n\t"
"gssqc1 $f2, $f0, 0x10(%[tmp]) \n\t"
"gslqc1 $f26, $f24, 0x10(%[tmp]) \n\t"
"pminsh $f24, $f24, $f4 \n\t"
"pminsh $f26, $f26, $f6 \n\t"
"gssqc1 $f26, $f24, 0x10(%[tmp]) \n\t"
/* Phase 6 (Cb half): build the filter-enable mask:
 * |p0-q0| < alpha  AND  |p1-p0| < beta  AND  |q1-q0| < beta
 * AND tc >= 0 (mask saved at tmp+0x40 above). */
"psubh $f4, $f8, $f12 \n\t"
"psubh $f6, $f10, $f14 \n\t"
WELS_AbsH($f4, $f6, $f4, $f6, $f24, $f26)
"pcmpgth $f24, $f16, $f4 \n\t"
"pcmpgth $f26, $f18, $f6 \n\t"
"gslqc1 $f6, $f4, 0x30(%[tmp]) \n\t"
"psubh $f4, $f4, $f8 \n\t"
"psubh $f6, $f6, $f10 \n\t"
/* $f8/$f10 are needed as scratch by WELS_AbsH; park their pixel values
 * in the integer registers meanwhile. */
"dmfc1 %[iAlpha], $f8 \n\t"
"dmfc1 %[iBeta], $f10 \n\t"
WELS_AbsH($f4, $f6, $f4, $f6, $f8, $f10)
"pcmpgth $f28, $f28, $f4 \n\t"
"pcmpgth $f30, $f30, $f6 \n\t"
"gslqc1 $f6, $f4, 0x50(%[tmp]) \n\t"
"and $f24, $f24, $f28 \n\t"
"and $f26, $f26, $f30 \n\t"
"gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
"psubh $f20, $f20, $f12 \n\t"
"psubh $f22, $f22, $f14 \n\t"
WELS_AbsH($f20, $f22, $f20, $f22, $f8, $f10)
"pcmpgth $f4, $f4, $f20 \n\t"
"pcmpgth $f6, $f6, $f22 \n\t"
"gslqc1 $f22, $f20, 0x80(%[tmp]) \n\t"
"gslqc1 $f10, $f8, 0x90(%[tmp]) \n\t"
"psubh $f20, $f20, $f8 \n\t"
"psubh $f22, $f22, $f10 \n\t"
"and $f24, $f24, $f4 \n\t"
"and $f26, $f26, $f6 \n\t"
"gslqc1 $f10, $f8, 0x40(%[tmp]) \n\t"
"and $f24, $f24, $f8 \n\t"
"and $f26, $f26, $f10 \n\t"
"gslqc1 $f6, $f4, 0x10(%[tmp]) \n\t"
"and $f4, $f4, $f24 \n\t"
"and $f6, $f6, $f26 \n\t"
"gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
"gssqc1 $f6, $f4, 0x30(%[tmp]) \n\t"
/* Phase 7 (Cr half): repeat the delta computation and clamp for the
 * second plane's lanes. */
"gslqc1 $f6, $f4, 0xa0(%[tmp]) \n\t"
"psubh $f24, $f24, $f4 \n\t"
"psubh $f26, $f26, $f6 \n\t"
"dli $10, 0x2 \n\t"
"dmtc1 $10, $f8 \n\t"
"psllh $f24, $f24, $f8 \n\t"
"psllh $f26, $f26, $f8 \n\t"
"paddh $f24, $f24, $f20 \n\t"
"paddh $f26, $f26, $f22 \n\t"
"dli $10, 0x3 \n\t"
"gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
"paddh $f24, $f24, $f8 \n\t"
"paddh $f26, $f26, $f10 \n\t"
"dmtc1 $10, $f8 \n\t"
"gslqc1 $f22, $f20, 0x60(%[tmp]) \n\t"
"psrah $f24, $f24, $f8 \n\t"
"psrah $f26, $f26, $f8 \n\t"
"pmaxsh $f20, $f20, $f24 \n\t"
"pmaxsh $f22, $f22, $f26 \n\t"
"pminsh $f0, $f0, $f20 \n\t"
"pminsh $f2, $f2, $f22 \n\t"
/* Phase 8 (Cr half): filter-enable mask, same three threshold tests. */
"gslqc1 $f22, $f20, 0x70(%[tmp]) \n\t"
"psubh $f24, $f4, $f20 \n\t"
"psubh $f26, $f6, $f22 \n\t"
WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
"pcmpgth $f16, $f16, $f24 \n\t"
"pcmpgth $f18, $f18, $f26 \n\t"
"gslqc1 $f26, $f24, 0x80(%[tmp]) \n\t"
"psubh $f24, $f24, $f4 \n\t"
"psubh $f26, $f26, $f6 \n\t"
WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
"pcmpgth $f28, $f28, $f24 \n\t"
"pcmpgth $f30, $f30, $f26 \n\t"
"gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
"and $f16, $f16, $f28 \n\t"
"and $f18, $f18, $f30 \n\t"
"gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
"psubh $f24, $f24, $f20 \n\t"
"psubh $f26, $f26, $f22 \n\t"
WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
"dmtc1 %[iAlpha], $f8 \n\t"
"dmtc1 %[iBeta], $f10 \n\t"
"pcmpgth $f28, $f28, $f24 \n\t"
"pcmpgth $f30, $f30, $f26 \n\t"
"and $f16, $f16, $f28 \n\t"
"and $f18, $f18, $f30 \n\t"
"gslqc1 $f26, $f24, 0x40(%[tmp]) \n\t"
"and $f16, $f16, $f24 \n\t"
"and $f18, $f18, $f26 \n\t"
"and $f0, $f0, $f16 \n\t"
"and $f2, $f2, $f18 \n\t"
/* Phase 9: apply the masked deltas (p0 += d, q0 -= d), saturate back
 * to 8 bits with packushb, and store the two filtered rows per plane.
 * $9 / %[pTC] still hold the "row above" addresses computed earlier. */
"gslqc1 $f18, $f16, 0x30(%[tmp]) \n\t"
"paddh $f8, $f8, $f16 \n\t"
"paddh $f10, $f10, $f18 \n\t"
"paddh $f4, $f4, $f0 \n\t"
"paddh $f6, $f6, $f2 \n\t"
"packushb $f8, $f8, $f10 \n\t"
"packushb $f10, $f4, $f6 \n\t"
"gssdxc1 $f8, 0x0(%[pTC], $0) \n\t"
"psubh $f12, $f12, $f16 \n\t"
"psubh $f14, $f14, $f18 \n\t"
"psubh $f20, $f20, $f0 \n\t"
"psubh $f22, $f22, $f2 \n\t"
"packushb $f12, $f12, $f14 \n\t"
"packushb $f14, $f20, $f22 \n\t"
"gssdxc1 $f12, 0x0(%[pPixCb], $0) \n\t"
"gssdxc1 $f10, 0x0($9, $0) \n\t"
"gssdxc1 $f14, 0x0(%[pPixCr], $0) \n\t"
: [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
: [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta),
[pTC]"r"((unsigned char *)pTC), [tmp]"r"((unsigned char *)tmp)
: "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
"$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
"$f28", "$f30"
);
RECOVER_REG;
}
/*!
 * \brief Vertical chroma deblocking filter, strong (bS == 4) path,
 *        Loongson-3A MMI implementation.
 *
 * Filters one horizontal edge in both the Cb and Cr planes at once: rows
 * p1/p0 above the edge and q0/q1 below it are loaded (8 bytes per plane),
 * widened to 16-bit lanes, and where the alpha/beta threshold tests pass
 * the p0/q0 rows are replaced by the strong chroma filter output
 * (apparently p0' = (2*p1 + p0 + q1 + 2) >> 2 and the mirrored form for
 * q0' — NOTE(review): confirm against DeblockChromaEq4V_c); elsewhere the
 * original samples are kept via the pandn/or select.
 *
 * \param pPixCb   pointer to the Cb sample on the q0 row of the edge
 * \param pPixCr   pointer to the Cr sample on the q0 row of the edge
 * \param iStride  chroma plane stride in bytes
 * \param iAlpha   alpha threshold (compared against |p0 - q0|)
 * \param iBeta    beta threshold (compared against |p1 - p0|, |q1 - q0|)
 *
 * NOTE(review): hand-scheduled inline assembly; phase comments describe
 * apparent intent only.
 */
void DeblockChromaEq4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
int32_t iAlpha, int32_t iBeta) {
/* Aligned scratch area for spilling widened pixel rows (gssqc1/gslqc1). */
unsigned char tmp[128] __attribute__((aligned(32)));
BACKUP_REG;
__asm__ volatile (
".set arch=loongson3a \n\t"
/* Phase 1: row addresses (p1 = pPix - 2*stride, p0 = pPix - stride,
 * q0 = pPix, q1 = pPix + stride) and 8-byte loads for both planes.
 * $8/$9 keep the p0-row addresses for the stores at the end. */
"daddu $8, %[iStride], %[iStride] \n\t"
"dsubu $9, %[pPixCb], $8 \n\t"
"gsldxc1 $f16, 0x0(%[pPixCr], $0) \n\t"
"gsldxc1 $f20, 0x0(%[iStride], %[pPixCr]) \n\t"
"gsldxc1 $f4, 0x0($9, $0) \n\t"
"dsubu $9, %[pPixCr], $8 \n\t"
"gsldxc1 $f8, 0x0($9, $0) \n\t"
"mov.d $f6, $f8 \n\t"
"dsubu $8, %[pPixCb], %[iStride] \n\t"
"gsldxc1 $f8, 0x0($8, $0) \n\t"
"dsubu $9, %[pPixCr], %[iStride] \n\t"
"gsldxc1 $f12, 0x0($9, $0) \n\t"
"mov.d $f10, $f12 \n\t"
"gsldxc1 $f12, 0x0(%[pPixCb], $0) \n\t"
"mov.d $f14, $f16 \n\t"
"gsldxc1 $f16, 0x0(%[iStride], %[pPixCb]) \n\t"
"mov.d $f18, $f20 \n\t"
/* Phase 2: broadcast iAlpha into $f20/$f22 and iBeta into $f24/$f26
 * as 16-bit lanes for the threshold compares. */
"dmtc1 %[iAlpha], $f20 \n\t"
"xor $f0, $f0, $f0 \n\t"
"xor $f2, $f2, $f2 \n\t"
"punpcklhw $f24, $f20, $f20 \n\t"
"punpcklwd $f20, $f24, $f24 \n\t"
"mov.d $f22, $f20 \n\t"
"dmtc1 %[iBeta], $f24 \n\t"
"punpcklhw $f28, $f24, $f24 \n\t"
"punpcklwd $f24, $f28, $f28 \n\t"
"mov.d $f26, $f24 \n\t"
/* Phase 3: unpack the 8-bit rows against zero into 16-bit lanes and
 * spill them: tmp+0x60/0x40 = p1 (Cb/Cr halves), 0x10 = p0(Cb),
 * 0x50 = q0(Cb), 0x30/0x20 = q0/q1 (Cr side) — as written below. */
"mov.d $f28, $f4 \n\t"
"punpcklbh $f4, $f6, $f2 \n\t"
"punpckhbh $f6, $f6, $f2 \n\t"
"punpckhbh $f30, $f28, $f0 \n\t"
"punpcklbh $f28, $f28, $f0 \n\t"
"gssqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
"gssqc1 $f30, $f28, 0x60(%[tmp]) \n\t"
"punpckhbh $f30, $f8, $f0 \n\t"
"punpcklbh $f28, $f8, $f0 \n\t"
"gssqc1 $f30, $f28, 0x10(%[tmp]) \n\t"
"punpckhbh $f30, $f12, $f0 \n\t"
"punpcklbh $f28, $f12, $f0 \n\t"
"punpcklbh $f12, $f14, $f2 \n\t"
"punpckhbh $f14, $f14, $f2 \n\t"
"gssqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
"mov.d $f28, $f16 \n\t"
"punpcklbh $f16, $f18, $f2 \n\t"
"punpckhbh $f18, $f18, $f2 \n\t"
"punpcklbh $f8, $f10, $f2 \n\t"
"punpckhbh $f10, $f10, $f2 \n\t"
"punpckhbh $f30, $f28, $f0 \n\t"
"punpcklbh $f28, $f28, $f0 \n\t"
"gssqc1 $f14, $f12, 0x30(%[tmp]) \n\t"
"gslqc1 $f14, $f12, 0x10(%[tmp]) \n\t"
/* Phase 4: build the per-plane filter-enable masks:
 * |p0-q0| < alpha  AND  |p1-p0| < beta  AND  |q1-q0| < beta.
 * $f0/$f2 accumulate the Cb mask, $f20/$f22 the Cr mask. */
"gslqc1 $f2, $f0, 0x50(%[tmp]) \n\t"
"psubh $f4, $f12, $f0 \n\t"
"psubh $f6, $f14, $f2 \n\t"
WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
"gssqc1 $f18, $f16, 0x20(%[tmp]) \n\t"
"pcmpgth $f0, $f20, $f4 \n\t"
"pcmpgth $f2, $f22, $f6 \n\t"
"gslqc1 $f6, $f4, 0x60(%[tmp]) \n\t"
"psubh $f4, $f4, $f12 \n\t"
"psubh $f6, $f6, $f14 \n\t"
WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
"pcmpgth $f16, $f24, $f4 \n\t"
"pcmpgth $f18, $f26, $f6 \n\t"
"and $f0, $f0, $f16 \n\t"
"and $f2, $f2, $f18 \n\t"
"gslqc1 $f18, $f16, 0x50(%[tmp]) \n\t"
"psubh $f4, $f28, $f16 \n\t"
"psubh $f6, $f30, $f18 \n\t"
WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
"pcmpgth $f16, $f24, $f4 \n\t"
"pcmpgth $f18, $f26, $f6 \n\t"
"gslqc1 $f6, $f4, 0x30(%[tmp]) \n\t"
"psubh $f4, $f8, $f4 \n\t"
"psubh $f6, $f10, $f6 \n\t"
/* $f28/$f30 are needed as WELS_AbsH scratch; park their pixel values
 * in the integer registers meanwhile. */
"dmfc1 %[iAlpha], $f28 \n\t"
"dmfc1 %[iBeta], $f30 \n\t"
WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
"pcmpgth $f20, $f20, $f4 \n\t"
"pcmpgth $f22, $f22, $f6 \n\t"
"gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
"and $f0, $f0, $f16 \n\t"
"and $f2, $f2, $f18 \n\t"
"psubh $f4, $f4, $f8 \n\t"
"psubh $f6, $f6, $f10 \n\t"
WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
"pcmpgth $f16, $f24, $f4 \n\t"
"pcmpgth $f18, $f26, $f6 \n\t"
"gslqc1 $f6, $f4, 0x20(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t"
"psubh $f4, $f4, $f28 \n\t"
"psubh $f6, $f6, $f30 \n\t"
"and $f20, $f20, $f16 \n\t"
"and $f22, $f22, $f18 \n\t"
WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
"dmtc1 %[iAlpha], $f28 \n\t"
"dmtc1 %[iBeta], $f30 \n\t"
"pcmpgth $f24, $f24, $f4 \n\t"
"pcmpgth $f26, $f26, $f6 \n\t"
"and $f20, $f20, $f24 \n\t"
"and $f22, $f22, $f26 \n\t"
/* Phase 5: broadcast 2 into all lanes (tmp+0x10) — it serves both as
 * the +2 rounding term and as the >>2 shift count below. */
"dli %[iBeta], 0x2 \n\t"
"dmtc1 %[iBeta], $f4 \n\t"
"punpcklhw $f16, $f4, $f4 \n\t"
"punpcklwd $f4, $f16, $f16 \n\t"
"mov.d $f6, $f4 \n\t"
/* Phase 6 (Cb): strong filter for p0 — apparently
 * (2*p1 + p0 + q1 + 2) >> 2 — then select filtered vs. original p0
 * with the mask ($f0/$f2) via and/pandn/or. */
"gslqc1 $f18, $f16, 0x60(%[tmp]) \n\t"
"paddh $f24, $f16, $f16 \n\t"
"paddh $f26, $f18, $f18 \n\t"
"paddh $f24, $f24, $f12 \n\t"
"paddh $f26, $f26, $f14 \n\t"
"paddh $f24, $f24, $f28 \n\t"
"paddh $f26, $f26, $f30 \n\t"
"gssqc1 $f6, $f4, 0x10(%[tmp]) \n\t"
"gslqc1 $f18, $f16, 0x10(%[tmp]) \n\t"
"paddh $f24, $f24, $f16 \n\t"
"paddh $f26, $f26, $f18 \n\t"
"dmtc1 %[iBeta], $f16 \n\t"
"psrah $f24, $f24, $f16 \n\t"
"psrah $f26, $f26, $f16 \n\t"
"pandn $f16, $f0, $f12 \n\t"
"pandn $f18, $f2, $f14 \n\t"
"and $f4, $f0, $f24 \n\t"
"and $f6, $f2, $f26 \n\t"
"or $f4, $f4, $f16 \n\t"
"or $f6, $f6, $f18 \n\t"
/* Phase 7 (Cr): same strong filter for the second plane's p0, selected
 * with the Cr mask ($f20/$f22). */
"gslqc1 $f14, $f12, 0x40(%[tmp]) \n\t"
"paddh $f24, $f12, $f12 \n\t"
"paddh $f26, $f14, $f14 \n\t"
"gslqc1 $f14, $f12, 0x10(%[tmp]) \n\t"
"paddh $f24, $f24, $f8 \n\t"
"paddh $f26, $f26, $f10 \n\t"
"gslqc1 $f18, $f16, 0x20(%[tmp]) \n\t"
"paddh $f24, $f24, $f16 \n\t"
"paddh $f26, $f26, $f18 \n\t"
"dmtc1 %[iBeta], $f16 \n\t"
"paddh $f24, $f24, $f12 \n\t"
"paddh $f26, $f26, $f14 \n\t"
"psrah $f24, $f24, $f16 \n\t"
"psrah $f26, $f26, $f16 \n\t"
"and $f16, $f20, $f24 \n\t"
"and $f18, $f22, $f26 \n\t"
"pandn $f24, $f20, $f8 \n\t"
"pandn $f26, $f22, $f10 \n\t"
"or $f16, $f16, $f24 \n\t"
"or $f18, $f18, $f26 \n\t"
"packushb $f4, $f4, $f6 \n\t"
"packushb $f6, $f16, $f18 \n\t"
/* Phase 8: mirrored strong filter for q0 on both planes (2*q1 + q0 +
 * p1 + 2) >> 2 — apparent intent — masked the same way, then packed
 * back to bytes and stored to the p0/q0 rows of each plane. */
"gslqc1 $f18, $f16, 0x50(%[tmp]) \n\t"
"paddh $f24, $f28, $f28 \n\t"
"paddh $f26, $f30, $f30 \n\t"
"paddh $f24, $f24, $f16 \n\t"
"paddh $f26, $f26, $f18 \n\t"
"gslqc1 $f10, $f8, 0x60(%[tmp]) \n\t"
"paddh $f24, $f24, $f8 \n\t"
"paddh $f26, $f26, $f10 \n\t"
"dmtc1 %[iBeta], $f28 \n\t"
"paddh $f24, $f24, $f12 \n\t"
"paddh $f26, $f26, $f14 \n\t"
"psrah $f24, $f24, $f28 \n\t"
"psrah $f26, $f26, $f28 \n\t"
"and $f8, $f0, $f24 \n\t"
"and $f10, $f2, $f26 \n\t"
"pandn $f0, $f0, $f16 \n\t"
"pandn $f2, $f2, $f18 \n\t"
"or $f8, $f8, $f0 \n\t"
"or $f10, $f10, $f2 \n\t"
"gslqc1 $f2, $f0, 0x20(%[tmp]) \n\t"
"paddh $f24, $f0, $f0 \n\t"
"paddh $f26, $f2, $f2 \n\t"
"gslqc1 $f2, $f0, 0x30(%[tmp]) \n\t"
"paddh $f24, $f24, $f0 \n\t"
"paddh $f26, $f26, $f2 \n\t"
"gslqc1 $f18, $f16, 0x40(%[tmp]) \n\t"
"paddh $f24, $f24, $f16 \n\t"
"paddh $f26, $f26, $f18 \n\t"
"paddh $f24, $f24, $f12 \n\t"
"paddh $f26, $f26, $f14 \n\t"
/* Store filtered Cb p0 row ($8 = pPixCb - iStride, computed above). */
"gssdxc1 $f4, 0x0($8, $0) \n\t"
"psrah $f24, $f24, $f28 \n\t"
"psrah $f26, $f26, $f28 \n\t"
"and $f16, $f20, $f24 \n\t"
"and $f18, $f22, $f26 \n\t"
"pandn $f20, $f20, $f0 \n\t"
"pandn $f22, $f22, $f2 \n\t"
"or $f16, $f16, $f20 \n\t"
"or $f18, $f18, $f22 \n\t"
"packushb $f8, $f8, $f10 \n\t"
"packushb $f10, $f16, $f18 \n\t"
/* Remaining stores: Cb q0, Cr p0 ($9 = pPixCr - iStride), Cr q0. */
"gssdxc1 $f8, 0x0(%[pPixCb], $0) \n\t"
"gssdxc1 $f6, 0x0($9, $0) \n\t"
"gssdxc1 $f10, 0x0(%[pPixCr], $0) \n\t"
: [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
: [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
[iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
: "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
"$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
"$f28", "$f30"
);
RECOVER_REG;
}
/*!
 * \brief Strong-filter (bS==4) horizontal deblocking of one vertical chroma
 *        edge, processing the Cb and Cr planes together using Loongson-3A
 *        MMI SIMD instructions.
 *
 * \param pPixCb  pointer to the Cb sample immediately right of the edge
 * \param pPixCr  pointer to the Cr sample immediately right of the edge
 * \param iStride picture line stride in bytes
 * \param iAlpha  H.264 alpha threshold for the edge
 * \param iBeta   H.264 beta threshold for the edge
 *
 * Because the edge is vertical, the 4 samples straddling it are gathered
 * from 8 rows of each plane, byte-transposed into column registers so the
 * filter can run across 16-bit lanes, filtered, transposed back, and
 * scattered out again.  tmp[] is scratch: offsets 0x80..0xBF hold the
 * transposed pixel columns, lower offsets hold widened lanes and constants.
 * NOTE(review): lane-to-sample mapping below is inferred from instruction
 * order — confirm against the vertical (V) variant before relying on it.
 */
void DeblockChromaEq4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
int32_t iAlpha, int32_t iBeta) {
unsigned char tmp[256] __attribute__((aligned(32)));
BACKUP_REG;
__asm__ volatile (
".set arch=loongson3a \n\t"
/* Step back 2 columns so each 8-byte row load covers p1 p0 | q0 q1. */
"daddiu %[pPixCb], %[pPixCb], -0x2 \n\t"
"daddiu %[pPixCr], %[pPixCr], -0x2 \n\t"
/* $9/$10 = base of the upper 4 rows of Cb/Cr; pPixCb/pPixCr advance by
 * 4*iStride to the lower 4 rows.  $11 = tmp+0x80, the transpose scratch. */
"move $9, %[pPixCb] \n\t"
"move $10, %[pPixCr] \n\t"
"dsll $11, %[iStride], 0x2 \n\t"
"daddu %[pPixCb], %[pPixCb], $11 \n\t"
"daddu %[pPixCr], %[pPixCr], $11 \n\t"
"daddiu $11, %[tmp], 0x80 \n\t"
/* Load the upper 4 rows of both planes (unaligned-safe ldl/ldr pairs)
 * and interleave Cb/Cr 32-bit words into shared registers. */
"gsldlc1 $f0, 0x7($9) \n\t"
"gsldrc1 $f0, 0x0($9) \n\t"
"daddu $12, $9, %[iStride] \n\t"
"gsldlc1 $f4, 0x7($12) \n\t"
"gsldrc1 $f4, 0x0($12) \n\t"
"daddu $12, $12, %[iStride] \n\t"
"gsldlc1 $f8, 0x7($12) \n\t"
"gsldrc1 $f8, 0x0($12) \n\t"
"daddu $12, $12, %[iStride] \n\t"
"gsldlc1 $f12, 0x7($12) \n\t"
"gsldlc1 $f16, 0x7($10) \n\t"
"gsldrc1 $f12, 0x0($12) \n\t"
"gsldrc1 $f16, 0x0($10) \n\t"
"daddu $12, $10, %[iStride] \n\t"
"gsldlc1 $f20, 0x7($12) \n\t"
"gsldrc1 $f20, 0x0($12) \n\t"
"daddu $12, $12, %[iStride] \n\t"
"gsldlc1 $f24, 0x7($12) \n\t"
"gsldrc1 $f24, 0x0($12) \n\t"
"daddu $12, $12, %[iStride] \n\t"
"gsldlc1 $f28, 0x7($12) \n\t"
"gsldrc1 $f28, 0x0($12) \n\t"
"punpcklwd $f0, $f0, $f16 \n\t"
"punpcklwd $f4, $f4, $f20 \n\t"
"punpcklwd $f8, $f8, $f24 \n\t"
"punpcklwd $f12, $f12, $f28 \n\t"
/* Load the lower 4 rows of both planes the same way. */
"gsldlc1 $f16, 0x7(%[pPixCb]) \n\t"
"gsldlc1 $f20, 0x7(%[pPixCr]) \n\t"
"gsldrc1 $f16, 0x0(%[pPixCb]) \n\t"
"gsldrc1 $f20, 0x0(%[pPixCr]) \n\t"
"punpcklwd $f16, $f16, $f20 \n\t"
"mov.d $f2, $f16 \n\t"
"daddu $12, %[pPixCb], %[iStride] \n\t"
"daddu $13, %[pPixCr], %[iStride] \n\t"
"gsldlc1 $f16, 0x7($12) \n\t"
"gsldlc1 $f20, 0x7($13) \n\t"
"gsldrc1 $f16, 0x0($12) \n\t"
"gsldrc1 $f20, 0x0($13) \n\t"
"punpcklwd $f16, $f16, $f20 \n\t"
"mov.d $f6, $f16 \n\t"
"daddu $12, $12, %[iStride] \n\t"
"daddu $13, $13, %[iStride] \n\t"
"gsldlc1 $f16, 0x7($12) \n\t"
"gsldlc1 $f20, 0x7($13) \n\t"
"gsldrc1 $f16, 0x0($12) \n\t"
"gsldrc1 $f20, 0x0($13) \n\t"
"punpcklwd $f16, $f16, $f20 \n\t"
"mov.d $f10, $f16 \n\t"
"daddu $12, $12, %[iStride] \n\t"
"daddu $13, $13, %[iStride] \n\t"
"gsldlc1 $f16, 0x7($12) \n\t"
"gsldlc1 $f20, 0x7($13) \n\t"
"gsldrc1 $f16, 0x0($12) \n\t"
"gsldrc1 $f20, 0x0($13) \n\t"
"punpcklwd $f16, $f16, $f20 \n\t"
"mov.d $f14, $f16 \n\t"
/* 8x8 byte transpose (bytes -> halfwords -> words) so each register pair
 * now holds one pixel COLUMN across all 8 rows of both planes. */
"punpcklbh $f24, $f2, $f6 \n\t"
"punpckhbh $f26, $f2, $f6 \n\t"
"punpckhbh $f2, $f0, $f4 \n\t"
"punpcklbh $f0, $f0, $f4 \n\t"
"punpcklbh $f28, $f10, $f14 \n\t"
"punpckhbh $f30, $f10, $f14 \n\t"
"punpckhbh $f10, $f8, $f12 \n\t"
"punpcklbh $f8, $f8, $f12 \n\t"
"punpcklhw $f16, $f2, $f10 \n\t"
"punpckhhw $f18, $f2, $f10 \n\t"
"punpckhhw $f2, $f0, $f8 \n\t"
"punpcklhw $f0, $f0, $f8 \n\t"
"punpcklhw $f20, $f26, $f30 \n\t"
"punpckhhw $f22, $f26, $f30 \n\t"
"punpckhhw $f26, $f24, $f28 \n\t"
"punpcklhw $f24, $f24, $f28 \n\t"
"punpcklwd $f4, $f2, $f26 \n\t"
"punpckhwd $f6, $f2, $f26 \n\t"
"punpckhwd $f2, $f0, $f24 \n\t"
"punpcklwd $f0, $f0, $f24 \n\t"
"punpcklwd $f8, $f18, $f22 \n\t"
"punpckhwd $f10, $f18, $f22 \n\t"
"punpckhwd $f18, $f16, $f20 \n\t"
"punpcklwd $f16, $f16, $f20 \n\t"
"mov.d $f20, $f2 \n\t"
"mov.d $f22, $f18 \n\t"
"mov.d $f2, $f16 \n\t"
"mov.d $f24, $f6 \n\t"
"mov.d $f26, $f10 \n\t"
"mov.d $f6, $f8 \n\t"
/* Park the four transposed columns at tmp+0x80..0xBF. */
"gssqc1 $f2, $f0, 0x0($11) \n\t"
"gssqc1 $f22, $f20, 0x10($11) \n\t"
"gssqc1 $f6, $f4, 0x20($11) \n\t"
"gssqc1 $f26, $f24, 0x30($11) \n\t"
/* Reload the columns, broadcast iAlpha/iBeta into all 16-bit lanes, and
 * widen each column from bytes to signed halfwords (zero in $f0). */
"gslqc1 $f26, $f24, 0x80(%[tmp]) \n\t"
"gslqc1 $f18, $f16, 0x90(%[tmp]) \n\t"
"gslqc1 $f22, $f20, 0xa0(%[tmp]) \n\t"
"gslqc1 $f30, $f28, 0xb0(%[tmp]) \n\t"
"xor $f0, $f0, $f0 \n\t"
"dmtc1 %[iAlpha], $f4 \n\t"
"punpcklhw $f8, $f4, $f4 \n\t"
"punpcklwd $f4, $f8, $f8 \n\t"
"mov.d $f6, $f4 \n\t"
"dmtc1 %[iBeta], $f8 \n\t"
"punpcklhw $f12, $f8, $f8 \n\t"
"punpcklwd $f8, $f12, $f12 \n\t"
"mov.d $f10, $f8 \n\t"
"mov.d $f12, $f24 \n\t"
"punpcklbh $f24, $f26, $f0 \n\t"
"punpckhbh $f26, $f26, $f0 \n\t"
"gssqc1 $f26, $f24, 0x60(%[tmp]) \n\t"
"gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
"punpcklbh $f24, $f26, $f0 \n\t"
"punpckhbh $f26, $f26, $f0 \n\t"
"gssqc1 $f26, $f24, 0x30(%[tmp]) \n\t"
"gslqc1 $f26, $f24, 0xa0(%[tmp]) \n\t"
"punpcklbh $f24, $f26, $f0 \n\t"
"punpckhbh $f26, $f26, $f0 \n\t"
"gssqc1 $f26, $f24, 0x40(%[tmp]) \n\t"
"gslqc1 $f26, $f24, 0xb0(%[tmp]) \n\t"
"punpcklbh $f24, $f26, $f0 \n\t"
"punpckhbh $f26, $f26, $f0 \n\t"
"gssqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
"punpckhbh $f30, $f28, $f0 \n\t"
"punpcklbh $f28, $f28, $f0 \n\t"
"punpckhbh $f18, $f16, $f0 \n\t"
"punpcklbh $f16, $f16, $f0 \n\t"
"punpckhbh $f22, $f20, $f0 \n\t"
"punpcklbh $f20, $f20, $f0 \n\t"
"punpckhbh $f14, $f12, $f0 \n\t"
"punpcklbh $f12, $f12, $f0 \n\t"
"gssqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
/* Build the per-lane filter-enable masks from the absolute sample
 * differences vs. alpha and beta (|p0-q0|<alpha, |p1-p0|<beta, ...). */
"psubh $f24, $f16, $f20 \n\t"
"psubh $f26, $f18, $f22 \n\t"
WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
"pcmpgth $f0, $f4, $f24 \n\t"
"pcmpgth $f2, $f6, $f26 \n\t"
"psubh $f24, $f12, $f16 \n\t"
"psubh $f26, $f14, $f18 \n\t"
WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
"pcmpgth $f28, $f8, $f24 \n\t"
"pcmpgth $f30, $f10, $f26 \n\t"
"gslqc1 $f26, $f24, 0x50(%[tmp]) \n\t"
"psubh $f24, $f24, $f20 \n\t"
"psubh $f26, $f26, $f22 \n\t"
"and $f0, $f0, $f28 \n\t"
"and $f2, $f2, $f30 \n\t"
WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
/* Spill two column registers into the (now dead) iAlpha/iBeta GPRs to
 * free FP registers for the second half of the mask computation. */
"dmfc1 %[iAlpha], $f20 \n\t"
"dmfc1 %[iBeta], $f22 \n\t"
"pcmpgth $f28, $f8, $f24 \n\t"
"pcmpgth $f30, $f10, $f26 \n\t"
"gslqc1 $f26, $f24, 0x30(%[tmp]) \n\t"
"gslqc1 $f22, $f20, 0x40(%[tmp]) \n\t"
"psubh $f24, $f24, $f20 \n\t"
"psubh $f26, $f26, $f22 \n\t"
WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
"pcmpgth $f4, $f4, $f24 \n\t"
"pcmpgth $f6, $f6, $f26 \n\t"
"gslqc1 $f26, $f24, 0x60(%[tmp]) \n\t"
"gslqc1 $f22, $f20, 0x30(%[tmp]) \n\t"
"psubh $f24, $f24, $f20 \n\t"
"psubh $f26, $f26, $f22 \n\t"
WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
"and $f0, $f0, $f28 \n\t"
"and $f2, $f2, $f30 \n\t"
"pcmpgth $f28, $f8, $f24 \n\t"
"pcmpgth $f30, $f10, $f26 \n\t"
"gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
"gslqc1 $f22, $f20, 0x40(%[tmp]) \n\t"
"psubh $f24, $f24, $f20 \n\t"
"psubh $f26, $f26, $f22 \n\t"
WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
/* $8 = 2: used both as the rounding constant vector and as the >>2 shift. */
"dli $8, 0x2 \n\t"
"and $f4, $f4, $f28 \n\t"
"and $f6, $f6, $f30 \n\t"
"pcmpgth $f8, $f8, $f24 \n\t"
"pcmpgth $f10, $f10, $f26 \n\t"
"and $f4, $f4, $f8 \n\t"
"and $f6, $f6, $f10 \n\t"
"dmtc1 $8, $f8 \n\t"
"punpcklhw $f24, $f8, $f8 \n\t"
"punpcklwd $f8, $f24, $f24 \n\t"
"mov.d $f10, $f8 \n\t"
"gssqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
/* Strong chroma filter: new edge sample = (2*outer + inner + opposite
 * + 2) >> 2, merged with the original value under the enable mask. */
"paddh $f8, $f12, $f12 \n\t"
"paddh $f10, $f14, $f14 \n\t"
"paddh $f8, $f8, $f16 \n\t"
"paddh $f10, $f10, $f18 \n\t"
"gslqc1 $f22, $f20, 0x50(%[tmp]) \n\t"
"paddh $f8, $f8, $f20 \n\t"
"paddh $f10, $f10, $f22 \n\t"
"gslqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
"paddh $f8, $f8, $f24 \n\t"
"paddh $f10, $f10, $f26 \n\t"
"dmtc1 $8, $f20 \n\t"
"psrah $f8, $f8, $f20 \n\t"
"psrah $f10, $f10, $f20 \n\t"
"and $f24, $f0, $f8 \n\t"
"and $f26, $f2, $f10 \n\t"
"pandn $f8, $f0, $f16 \n\t"
"pandn $f10, $f2, $f18 \n\t"
"or $f24, $f24, $f8 \n\t"
"or $f26, $f26, $f10 \n\t"
"gslqc1 $f10, $f8, 0x60(%[tmp]) \n\t"
"paddh $f28, $f8, $f8 \n\t"
"paddh $f30, $f10, $f10 \n\t"
"gslqc1 $f22, $f20, 0x30(%[tmp]) \n\t"
"paddh $f28, $f28, $f20 \n\t"
"paddh $f30, $f30, $f22 \n\t"
"gslqc1 $f18, $f16, 0x70(%[tmp]) \n\t"
"paddh $f28, $f28, $f16 \n\t"
"paddh $f30, $f30, $f18 \n\t"
"gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
"paddh $f28, $f28, $f8 \n\t"
"paddh $f30, $f30, $f10 \n\t"
"pandn $f8, $f4, $f20 \n\t"
"pandn $f10, $f6, $f22 \n\t"
"dmtc1 $8, $f20 \n\t"
"psrah $f28, $f28, $f20 \n\t"
"psrah $f30, $f30, $f20 \n\t"
"and $f16, $f4, $f28 \n\t"
"and $f18, $f6, $f30 \n\t"
"or $f16, $f16, $f8 \n\t"
"or $f18, $f18, $f10 \n\t"
"gslqc1 $f10, $f8, 0x50(%[tmp]) \n\t"
/* Pack the first filtered column back to bytes and overwrite its slot
 * (tmp+0x90) so the re-transpose below picks up the filtered values. */
"packushb $f24, $f24, $f26 \n\t"
"packushb $f26, $f16, $f18 \n\t"
"gssqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
"paddh $f24, $f8, $f8 \n\t"
"paddh $f26, $f10, $f10 \n\t"
/* Restore the columns spilled to the iAlpha/iBeta GPRs earlier. */
"dmtc1 %[iAlpha], $f20 \n\t"
"dmtc1 %[iBeta], $f22 \n\t"
"gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
"paddh $f24, $f24, $f20 \n\t"
"paddh $f26, $f26, $f22 \n\t"
"paddh $f24, $f24, $f12 \n\t"
"paddh $f26, $f26, $f14 \n\t"
"mov.d $f16, $f0 \n\t"
"mov.d $f18, $f2 \n\t"
"pandn $f0, $f0, $f20 \n\t"
"pandn $f2, $f2, $f22 \n\t"
"dmtc1 $8, $f20 \n\t"
"paddh $f24, $f24, $f8 \n\t"
"paddh $f26, $f26, $f10 \n\t"
"psrah $f24, $f24, $f20 \n\t"
"psrah $f26, $f26, $f20 \n\t"
"and $f16, $f16, $f24 \n\t"
"and $f18, $f18, $f26 \n\t"
"or $f16, $f16, $f0 \n\t"
"or $f18, $f18, $f2 \n\t"
"gslqc1 $f2, $f0, 0x70(%[tmp]) \n\t"
"paddh $f20, $f0, $f0 \n\t"
"paddh $f22, $f2, $f2 \n\t"
"gslqc1 $f2, $f0, 0x40(%[tmp]) \n\t"
"paddh $f20, $f20, $f0 \n\t"
"paddh $f22, $f22, $f2 \n\t"
"gslqc1 $f14, $f12, 0x60(%[tmp]) \n\t"
"paddh $f20, $f20, $f12 \n\t"
"paddh $f22, $f22, $f14 \n\t"
"paddh $f20, $f20, $f8 \n\t"
"paddh $f22, $f22, $f10 \n\t"
"dmtc1 $8, $f8 \n\t"
"psrah $f20, $f20, $f8 \n\t"
"psrah $f22, $f22, $f8 \n\t"
"and $f12, $f4, $f20 \n\t"
"and $f14, $f6, $f22 \n\t"
"pandn $f4, $f4, $f0 \n\t"
"pandn $f6, $f6, $f2 \n\t"
"or $f12, $f12, $f4 \n\t"
"or $f14, $f14, $f6 \n\t"
/* Second filtered column back to bytes at its slot (tmp+0xA0). */
"packushb $f16, $f16, $f18 \n\t"
"packushb $f18, $f12, $f14 \n\t"
"gssqc1 $f18, $f16, 0xa0(%[tmp]) \n\t"
/* Re-transpose the 4 columns (middle two now filtered) back to row order. */
"gslqc1 $f2, $f0, 0x0($11) \n\t"
"gslqc1 $f6, $f4, 0x10($11) \n\t"
"gslqc1 $f10, $f8, 0x20($11) \n\t"
"gslqc1 $f14, $f12, 0x30($11) \n\t"
"mov.d $f26, $f2 \n\t"
"punpckhbh $f2, $f0, $f4 \n\t"
"punpcklbh $f0, $f0, $f4 \n\t"
"punpcklbh $f24, $f26, $f6 \n\t"
"punpckhbh $f26, $f26, $f6 \n\t"
"mov.d $f30, $f10 \n\t"
"punpckhbh $f10, $f8, $f12 \n\t"
"punpcklbh $f8, $f8, $f12 \n\t"
"punpcklbh $f28, $f30, $f14 \n\t"
"punpckhbh $f30, $f30, $f14 \n\t"
"punpcklhw $f16, $f2, $f10 \n\t"
"punpckhhw $f18, $f2, $f10 \n\t"
"punpcklhw $f20, $f26, $f30 \n\t"
"punpckhhw $f22, $f26, $f30 \n\t"
"punpckhhw $f2, $f0, $f8 \n\t"
"punpcklhw $f0, $f0, $f8 \n\t"
"punpckhhw $f26, $f24, $f28 \n\t"
"punpcklhw $f24, $f24, $f28 \n\t"
"punpcklwd $f4, $f2, $f26 \n\t"
"punpckhwd $f6, $f2, $f26 \n\t"
"punpcklwd $f8, $f18, $f22 \n\t"
"punpckhwd $f10, $f18, $f22 \n\t"
"punpckhwd $f2, $f0, $f24 \n\t"
"punpcklwd $f0, $f0, $f24 \n\t"
"punpckhwd $f18, $f16, $f20 \n\t"
"punpcklwd $f16, $f16, $f20 \n\t"
"mov.d $f20, $f2 \n\t"
"mov.d $f24, $f6 \n\t"
"mov.d $f2, $f16 \n\t"
"mov.d $f22, $f18 \n\t"
"mov.d $f6, $f8 \n\t"
"mov.d $f26, $f10 \n\t"
/* $f8 = 32: shift count that moves the Cr word of each 64-bit row down
 * so both planes can be stored from the same register. */
"dli %[iAlpha], 0x20 \n\t"
"dmtc1 %[iAlpha], $f8 \n\t"
/* Scatter 4 bytes per row: low word -> Cb rows, (row >> 32) -> Cr rows,
 * upper 4 rows via $9/$10, lower 4 rows via pPixCb/pPixCr. */
"gsswlc1 $f0, 0x3($9) \n\t"
"gsswrc1 $f0, 0x0($9) \n\t"
"daddu $12, $9, %[iStride] \n\t"
"gsswlc1 $f20, 0x3($12) \n\t"
"gsswrc1 $f20, 0x0($12) \n\t"
"daddu $12, $12, %[iStride] \n\t"
"gsswlc1 $f4, 0x3($12) \n\t"
"gsswrc1 $f4, 0x0($12) \n\t"
"daddu $12, $12, %[iStride] \n\t"
"gsswlc1 $f24, 0x3($12) \n\t"
"gsswrc1 $f24, 0x0($12) \n\t"
"dsrl $f0, $f0, $f8 \n\t"
"dsrl $f20, $f20, $f8 \n\t"
"dsrl $f4, $f4, $f8 \n\t"
"dsrl $f24, $f24, $f8 \n\t"
"gsswlc1 $f0, 0x3($10) \n\t"
"gsswrc1 $f0, 0x0($10) \n\t"
"daddu $13, $10, %[iStride] \n\t"
"daddu $8, $13, %[iStride] \n\t"
"gsswlc1 $f20, 0x3($13) \n\t"
"gsswrc1 $f20, 0x0($13) \n\t"
"daddu $13, $8, %[iStride] \n\t"
"gsswlc1 $f4, 0x3($8) \n\t"
"gsswrc1 $f4, 0x0($8) \n\t"
"gsswlc1 $f24, 0x3($13) \n\t"
"gsswrc1 $f24, 0x0($13) \n\t"
"gsswlc1 $f2, 0x3(%[pPixCb]) \n\t"
"gsswrc1 $f2, 0x0(%[pPixCb]) \n\t"
"daddu $12, %[pPixCb], %[iStride] \n\t"
"gsswlc1 $f22, 0x3($12) \n\t"
"gsswrc1 $f22, 0x0($12) \n\t"
"daddu $12, $12, %[iStride] \n\t"
"gsswlc1 $f6, 0x3($12) \n\t"
"gsswrc1 $f6, 0x0($12) \n\t"
"daddu $12, $12, %[iStride] \n\t"
"gsswlc1 $f26, 0x3($12) \n\t"
"gsswrc1 $f26, 0x0($12) \n\t"
"dsrl $f2, $f2, $f8 \n\t"
"dsrl $f22, $f22, $f8 \n\t"
"dsrl $f6, $f6, $f8 \n\t"
"dsrl $f26, $f26, $f8 \n\t"
"gsswlc1 $f2, 0x3(%[pPixCr]) \n\t"
"gsswrc1 $f2, 0x0(%[pPixCr]) \n\t"
"daddu $13, %[pPixCr], %[iStride] \n\t"
"daddu $8, $13, %[iStride] \n\t"
"gsswlc1 $f22, 0x3($13) \n\t"
"gsswrc1 $f22, 0x0($13) \n\t"
"daddu $13, $8, %[iStride] \n\t"
"gsswlc1 $f6, 0x3($8) \n\t"
"gsswrc1 $f6, 0x0($8) \n\t"
"gsswlc1 $f26, 0x3($13) \n\t"
"gsswrc1 $f26, 0x0($13) \n\t"
: [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
: [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
[iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
: "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
"$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
"$f24", "$f26", "$f28", "$f30"
);
RECOVER_REG;
}
/*!
 * \brief Normal-filter (bS<4) horizontal deblocking of one vertical chroma
 *        edge, with per-edge tc clipping, processing Cb and Cr together
 *        using Loongson-3A MMI SIMD instructions.
 *
 * \param pPixCb  pointer to the Cb sample immediately right of the edge
 * \param pPixCr  pointer to the Cr sample immediately right of the edge
 * \param iStride picture line stride in bytes
 * \param iAlpha  H.264 alpha threshold for the edge
 * \param iBeta   H.264 beta threshold for the edge
 * \param pTC     4 signed tc values, one per pair of rows across the edge
 *
 * Same gather/transpose/filter/transpose/scatter pipeline as the Eq4H
 * variant, but the correction delta is clamped to [-tc, tc] and lanes with
 * tc <= 0 are left untouched.  tmp[] scratch: transposed pixel columns at
 * 0x70..0xAF, widened lanes and constants elsewhere (tc at 0x20, -tc at
 * 0xD0, rounding 4s at 0x30, beta at 0x50, tc>0 mask at 0x60).
 * NOTE(review): slot-to-sample mapping inferred from instruction order —
 * confirm against the vertical (V) variant before relying on it.
 */
void DeblockChromaLt4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
unsigned char tmp[320] __attribute__((aligned(32)));
BACKUP_REG;
__asm__ volatile (
".set arch=loongson3a \n\t"
/* Step back 2 columns so each 8-byte row load covers p1 p0 | q0 q1. */
"daddiu %[pPixCb], %[pPixCb], -0x2 \n\t"
"daddiu %[pPixCr], %[pPixCr], -0x2 \n\t"
/* Gather 8 rows of each plane (unaligned-safe ldl/ldr), interleaving
 * Cb/Cr 32-bit words into shared registers. */
"daddu $8, %[pPixCb], %[iStride] \n\t"
"gsldlc1 $f0, 0x7(%[pPixCb]) \n\t"
"gsldlc1 $f4, 0x7($8) \n\t"
"gsldrc1 $f0, 0x0(%[pPixCb]) \n\t"
"gsldrc1 $f4, 0x0($8) \n\t"
"daddu $9, $8, %[iStride] \n\t"
"daddu $8, $9, %[iStride] \n\t"
"gsldlc1 $f8, 0x7($9) \n\t"
"gsldlc1 $f12, 0x7($8) \n\t"
"gsldrc1 $f8, 0x0($9) \n\t"
"gsldrc1 $f12, 0x0($8) \n\t"
"daddu $9, $8, %[iStride] \n\t"
"daddu $10, %[pPixCr], %[iStride] \n\t"
"gsldlc1 $f16, 0x7(%[pPixCr]) \n\t"
"gsldlc1 $f20, 0x7($10) \n\t"
"gsldrc1 $f16, 0x0(%[pPixCr]) \n\t"
"gsldrc1 $f20, 0x0($10) \n\t"
"daddu $11, $10, %[iStride] \n\t"
"daddu $10, $11, %[iStride] \n\t"
"gsldlc1 $f24, 0x7($11) \n\t"
"gsldlc1 $f28, 0x7($10) \n\t"
"gsldrc1 $f24, 0x0($11) \n\t"
"gsldrc1 $f28, 0x0($10) \n\t"
"daddu $11, $10, %[iStride] \n\t"
"punpcklwd $f0, $f0, $f16 \n\t"
"punpcklwd $f4, $f4, $f20 \n\t"
"punpcklwd $f8, $f8, $f24 \n\t"
"punpcklwd $f12, $f12, $f28 \n\t"
"gsldlc1 $f16, 0x7($9) \n\t"
"gsldlc1 $f20, 0x7($11) \n\t"
"gsldrc1 $f16, 0x0($9) \n\t"
"gsldrc1 $f20, 0x0($11) \n\t"
"punpcklwd $f16, $f16, $f20 \n\t"
"mov.d $f2, $f16 \n\t"
"daddu $8, $9, %[iStride] \n\t"
"daddu $10, $11, %[iStride] \n\t"
"gsldlc1 $f16, 0x7($8) \n\t"
"gsldlc1 $f20, 0x7($10) \n\t"
"gsldrc1 $f16, 0x0($8) \n\t"
"gsldrc1 $f20, 0x0($10) \n\t"
"punpcklwd $f16, $f16, $f20 \n\t"
"mov.d $f6, $f16 \n\t"
"daddu $9, $8, %[iStride] \n\t"
"daddu $11, $10, %[iStride] \n\t"
"gsldlc1 $f16, 0x7($9) \n\t"
"gsldlc1 $f20, 0x7($11) \n\t"
"gsldrc1 $f16, 0x0($9) \n\t"
"gsldrc1 $f20, 0x0($11) \n\t"
"punpcklwd $f16, $f16, $f20 \n\t"
"mov.d $f10, $f16 \n\t"
"daddu $8, $9, %[iStride] \n\t"
"daddu $10, $11, %[iStride] \n\t"
"gsldlc1 $f16, 0x7($8) \n\t"
"gsldlc1 $f20, 0x7($10) \n\t"
"gsldrc1 $f16, 0x0($8) \n\t"
"gsldrc1 $f20, 0x0($10) \n\t"
"punpcklwd $f16, $f16, $f20 \n\t"
"mov.d $f14, $f16 \n\t"
/* 8x8 byte transpose so each register pair holds one pixel column. */
"punpcklbh $f24, $f2, $f6 \n\t"
"punpckhbh $f26, $f2, $f6 \n\t"
"punpckhbh $f2, $f0, $f4 \n\t"
"punpcklbh $f0, $f0, $f4 \n\t"
"punpcklbh $f28, $f10, $f14 \n\t"
"punpckhbh $f30, $f10, $f14 \n\t"
"punpckhbh $f10, $f8, $f12 \n\t"
"punpcklbh $f8, $f8, $f12 \n\t"
"punpcklhw $f16, $f2, $f10 \n\t"
"punpckhhw $f18, $f2, $f10 \n\t"
"punpckhhw $f2, $f0, $f8 \n\t"
"punpcklhw $f0, $f0, $f8 \n\t"
"punpcklhw $f20, $f26, $f30 \n\t"
"punpckhhw $f22, $f26, $f30 \n\t"
"punpckhhw $f26, $f24, $f28 \n\t"
"punpcklhw $f24, $f24, $f28 \n\t"
"punpcklwd $f4, $f2, $f26 \n\t"
"punpckhwd $f6, $f2, $f26 \n\t"
"punpckhwd $f2, $f0, $f24 \n\t"
"punpcklwd $f0, $f0, $f24 \n\t"
"punpcklwd $f8, $f18, $f22 \n\t"
"punpckhwd $f10, $f18, $f22 \n\t"
"punpckhwd $f18, $f16, $f20 \n\t"
"punpcklwd $f16, $f16, $f20 \n\t"
"mov.d $f20, $f2 \n\t"
"mov.d $f22, $f18 \n\t"
"mov.d $f2, $f16 \n\t"
"mov.d $f24, $f6 \n\t"
"mov.d $f26, $f10 \n\t"
"mov.d $f6, $f8 \n\t"
/* Park the four transposed columns at tmp+0x70..0xAF. */
"daddiu $11, %[tmp], 0x70 \n\t"
"gssqc1 $f2, $f0, 0x0($11) \n\t"
"gssqc1 $f22, $f20, 0x10($11) \n\t"
"gssqc1 $f6, $f4, 0x20($11) \n\t"
"gssqc1 $f26, $f24, 0x30($11) \n\t"
/* Expand the 4 signed tc bytes into a vector of 16-bit lanes, one tc per
 * pair of rows; tmp+0xD0 gets the negated copy (-tc) for the lower clamp. */
"lb $8, 0x3(%[pTC]) \n\t"
"lb $9, 0x2(%[pTC]) \n\t"
"lb $10, 0x1(%[pTC]) \n\t"
"lb $11, 0x0(%[pTC]) \n\t"
"and $12, $8, 0xFFFF \n\t"
"dmtc1 $12, $f8 \n\t"
"and $9, $9, 0xFFFF \n\t"
"dmtc1 $9, $f12 \n\t"
"mov.d $f16, $f12 \n\t"
"and $9, $10, 0xFFFF \n\t"
"dmtc1 $9, $f20 \n\t"
"xor $f0, $f0, $f0 \n\t"
"mov.d $f24, $f20 \n\t"
"and $9, $11, 0xFFFF \n\t"
"punpcklhw $f24, $f24, $f8 \n\t"
"mov.d $f4, $f8 \n\t"
"dmtc1 $9, $f28 \n\t"
"mov.d $f0, $f28 \n\t"
"punpcklhw $f28, $f28, $f12 \n\t"
"punpcklhw $f20, $f20, $f4 \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"punpcklhw $f28, $f28, $f20 \n\t"
"gslqc1 $f22, $f20, 0xA0(%[tmp]) \n\t"
"punpcklhw $f0, $f0, $f16 \n\t"
"punpcklhw $f0, $f0, $f24 \n\t"
"gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
"punpckhhw $f2, $f0, $f28 \n\t"
"punpcklhw $f0, $f0, $f28 \n\t"
"gslqc1 $f30, $f28, 0x80(%[tmp]) \n\t"
"psubh $f8, $f4, $f0 \n\t"
"psubh $f10, $f6, $f2 \n\t"
"gssqc1 $f10, $f8, 0xD0(%[tmp]) \n\t"
/* Broadcast iAlpha/iBeta to all 16-bit lanes; beta copy kept at tmp+0x50. */
"dmtc1 %[iAlpha], $f8 \n\t"
"punpcklhw $f12, $f8, $f8 \n\t"
"punpcklwd $f16, $f12, $f12 \n\t"
"mov.d $f18, $f16 \n\t"
"dmtc1 %[iBeta], $f8 \n\t"
"punpcklhw $f12, $f8, $f8 \n\t"
"punpcklwd $f8, $f12, $f12 \n\t"
"mov.d $f10, $f8 \n\t"
"gslqc1 $f14, $f12, 0x90(%[tmp]) \n\t"
"gssqc1 $f10, $f8, 0x50(%[tmp]) \n\t"
/* Widen the pixel columns from bytes to halfwords into tmp slots. */
"punpckhbh $f10, $f24, $f4 \n\t"
"punpcklbh $f8, $f24, $f4 \n\t"
"punpcklbh $f24, $f26, $f6 \n\t"
"punpckhbh $f26, $f26, $f6 \n\t"
"gssqc1 $f10, $f8, 0x40(%[tmp]) \n\t"
"gssqc1 $f26, $f24, 0xB0(%[tmp]) \n\t"
"gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
"punpcklbh $f8, $f28, $f4 \n\t"
"punpckhbh $f10, $f28, $f4 \n\t"
"punpcklbh $f28, $f30, $f6 \n\t"
"punpckhbh $f30, $f30, $f6 \n\t"
"punpcklbh $f24, $f26, $f6 \n\t"
"punpckhbh $f26, $f26, $f6 \n\t"
"punpckhbh $f14, $f12, $f4 \n\t"
"punpcklbh $f12, $f12, $f4 \n\t"
"punpckhbh $f22, $f20, $f4 \n\t"
"punpcklbh $f20, $f20, $f4 \n\t"
"gssqc1 $f30, $f28, 0xF0(%[tmp]) \n\t"
"gssqc1 $f26, $f24, 0xC0(%[tmp]) \n\t"
"gslqc1 $f26, $f24, 0xA0(%[tmp]) \n\t"
"punpcklbh $f24, $f26, $f6 \n\t"
"punpckhbh $f26, $f26, $f6 \n\t"
/* Constants: rounding 4s at tmp+0x30; shift counts 2 (<<) and 3 (>>). */
"dli $13, 0x4 \n\t"
"gssqc1 $f26, $f24, 0xE0(%[tmp]) \n\t"
"dmtc1 $13, $f24 \n\t"
"punpcklhw $f28, $f24, $f24 \n\t"
"punpcklwd $f24, $f28, $f28 \n\t"
"mov.d $f26, $f24 \n\t"
"dli $12, 0x2 \n\t"
"dli $13, 0x3 \n\t"
"gssqc1 $f2, $f0, 0x20(%[tmp]) \n\t"
/* tc vector also spilled into the (now dead) iAlpha/iBeta GPRs. */
"dmfc1 %[iAlpha], $f0 \n\t"
"dmfc1 %[iBeta], $f2 \n\t"
"gssqc1 $f26, $f24, 0x30(%[tmp]) \n\t"
/* First half: delta = (((q0-p0)<<2) + (p1-q1) + 4) >> 3, clamped to
 * [-tc, tc]; tc>0 mask kept at tmp+0x60. */
"gslqc1 $f30, $f28, 0x40(%[tmp]) \n\t"
"psubh $f28, $f28, $f20 \n\t"
"psubh $f30, $f30, $f22 \n\t"
"pcmpgth $f24, $f0, $f4 \n\t"
"pcmpgth $f26, $f2, $f6 \n\t"
"dmtc1 $12, $f0 \n\t"
"dmtc1 $13, $f2 \n\t"
"gssqc1 $f26, $f24, 0x60(%[tmp]) \n\t"
"gslqc1 $f6, $f4, 0xD0(%[tmp]) \n\t"
"psubh $f24, $f12, $f8 \n\t"
"psubh $f26, $f14, $f10 \n\t"
"psllh $f24, $f24, $f0 \n\t"
"psllh $f26, $f26, $f0 \n\t"
"paddh $f24, $f24, $f28 \n\t"
"paddh $f26, $f26, $f30 \n\t"
"gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t"
"paddh $f24, $f24, $f28 \n\t"
"paddh $f26, $f26, $f30 \n\t"
"psrah $f24, $f24, $f2 \n\t"
"psrah $f26, $f26, $f2 \n\t"
"pmaxsh $f4, $f4, $f24 \n\t"
"pmaxsh $f6, $f6, $f26 \n\t"
"gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
"gslqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
"pminsh $f24, $f24, $f4 \n\t"
"pminsh $f26, $f26, $f4 \n\t"
"gssqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
/* First-half enable mask: |p0-q0|<alpha && |p1-p0|<beta && |q1-q0|<beta
 * && tc>0; masked delta stored to tmp+0x40. */
"psubh $f4, $f8, $f12 \n\t"
"psubh $f6, $f10, $f14 \n\t"
WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
"pcmpgth $f24, $f16, $f4 \n\t"
"pcmpgth $f26, $f18, $f6 \n\t"
"gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
"psubh $f4, $f4, $f8 \n\t"
"psubh $f6, $f6, $f10 \n\t"
WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
"pcmpgth $f28, $f28, $f4 \n\t"
"pcmpgth $f30, $f30, $f6 \n\t"
"gslqc1 $f6, $f4, 0x50(%[tmp]) \n\t"
"and $f24, $f24, $f28 \n\t"
"and $f26, $f26, $f30 \n\t"
"gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
"psubh $f20, $f20, $f12 \n\t"
"psubh $f22, $f22, $f14 \n\t"
WELS_AbsH($f20, $f22, $f20, $f22, $f0, $f2)
"pcmpgth $f4, $f4, $f20 \n\t"
"pcmpgth $f6, $f6, $f22 \n\t"
"gslqc1 $f22, $f20, 0xB0(%[tmp]) \n\t"
"gslqc1 $f2, $f0, 0xE0(%[tmp]) \n\t"
"psubh $f20, $f20, $f0 \n\t"
"psubh $f22, $f22, $f2 \n\t"
"and $f24, $f24, $f4 \n\t"
"and $f26, $f26, $f6 \n\t"
"gslqc1 $f2, $f0, 0x60(%[tmp]) \n\t"
"and $f24, $f24, $f0 \n\t"
"and $f26, $f26, $f2 \n\t"
"gslqc1 $f6, $f4, 0x20(%[tmp]) \n\t"
"and $f4, $f4, $f24 \n\t"
"and $f6, $f6, $f26 \n\t"
"gslqc1 $f26, $f24, 0xC0(%[tmp]) \n\t"
"gssqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
/* Second half: same clamped-delta computation for the remaining lanes. */
"gslqc1 $f6, $f4, 0xF0(%[tmp]) \n\t"
"dmtc1 $12, $f0 \n\t"
"psubh $f24, $f24, $f4 \n\t"
"psubh $f26, $f26, $f6 \n\t"
"psllh $f24, $f24, $f0 \n\t"
"psllh $f26, $f26, $f0 \n\t"
"paddh $f24, $f24, $f20 \n\t"
"paddh $f26, $f26, $f22 \n\t"
"gslqc1 $f2, $f0, 0x30(%[tmp]) \n\t"
"paddh $f24, $f24, $f0 \n\t"
"paddh $f26, $f26, $f2 \n\t"
"dmtc1 %[iBeta], $f2 \n\t"
"dmtc1 $13, $f0 \n\t"
"gslqc1 $f22, $f20, 0xD0(%[tmp]) \n\t"
"psrah $f24, $f24, $f0 \n\t"
"psrah $f26, $f26, $f0 \n\t"
"dmtc1 %[iAlpha], $f0 \n\t"
"pmaxsh $f20, $f20, $f24 \n\t"
"pmaxsh $f22, $f22, $f26 \n\t"
"pminsh $f0, $f0, $f20 \n\t"
"pminsh $f2, $f2, $f22 \n\t"
"dmfc1 %[iAlpha], $f0 \n\t"
"dmfc1 %[iBeta], $f2 \n\t"
/* Second-half enable mask, same conditions as above. */
"gslqc1 $f22, $f20, 0xC0(%[tmp]) \n\t"
"psubh $f24, $f4, $f20 \n\t"
"psubh $f26, $f6, $f22 \n\t"
WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
"pcmpgth $f16, $f16, $f24 \n\t"
"pcmpgth $f18, $f18, $f26 \n\t"
"gslqc1 $f26, $f24, 0xB0(%[tmp]) \n\t"
"psubh $f24, $f24, $f4 \n\t"
"psubh $f26, $f26, $f6 \n\t"
WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
"pcmpgth $f28, $f28, $f24 \n\t"
"pcmpgth $f30, $f30, $f26 \n\t"
"gslqc1 $f26, $f24, 0xE0(%[tmp]) \n\t"
"and $f16, $f16, $f28 \n\t"
"and $f18, $f18, $f30 \n\t"
"gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
"psubh $f24, $f24, $f20 \n\t"
"psubh $f26, $f26, $f22 \n\t"
WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
"pcmpgth $f28, $f28, $f24 \n\t"
"pcmpgth $f30, $f30, $f26 \n\t"
"and $f16, $f16, $f28 \n\t"
"and $f18, $f18, $f30 \n\t"
"gslqc1 $f30, $f28, 0x60(%[tmp]) \n\t"
"dmtc1 %[iAlpha], $f0 \n\t"
"dmtc1 %[iBeta], $f2 \n\t"
"and $f16, $f16, $f28 \n\t"
"and $f18, $f18, $f30 \n\t"
"and $f0, $f0, $f16 \n\t"
"and $f2, $f2, $f18 \n\t"
/* Apply: one edge column gets +delta, the opposite column gets -delta;
 * pack to bytes and overwrite slots tmp+0x80/0x90 for the re-transpose. */
"gslqc1 $f18, $f16, 0x40(%[tmp]) \n\t"
"paddh $f8, $f8, $f16 \n\t"
"paddh $f10, $f10, $f18 \n\t"
"paddh $f4, $f4, $f0 \n\t"
"paddh $f6, $f6, $f2 \n\t"
"psubh $f12, $f12, $f16 \n\t"
"psubh $f14, $f14, $f18 \n\t"
"psubh $f20, $f20, $f0 \n\t"
"psubh $f22, $f22, $f2 \n\t"
"packushb $f8, $f8, $f10 \n\t"
"packushb $f10, $f4, $f6 \n\t"
"packushb $f12, $f12, $f14 \n\t"
"packushb $f14, $f20, $f22 \n\t"
"gssqc1 $f10, $f8, 0x80(%[tmp]) \n\t"
"gssqc1 $f14, $f12, 0x90(%[tmp]) \n\t"
/* Re-transpose the 4 columns (middle two now filtered) back to row order. */
"daddiu $11, %[tmp], 0x70 \n\t"
"gslqc1 $f2, $f0, 0x0($11) \n\t"
"gslqc1 $f6, $f4, 0x10($11) \n\t"
"gslqc1 $f10, $f8, 0x20($11) \n\t"
"gslqc1 $f14, $f12, 0x30($11) \n\t"
"punpcklbh $f24, $f2, $f6 \n\t"
"punpckhbh $f26, $f2, $f6 \n\t"
"punpckhbh $f2, $f0, $f4 \n\t"
"punpcklbh $f0, $f0, $f4 \n\t"
"punpcklbh $f28, $f10, $f14 \n\t"
"punpckhbh $f30, $f10, $f14 \n\t"
"punpckhbh $f10, $f8, $f12 \n\t"
"punpcklbh $f8, $f8, $f12 \n\t"
"punpcklhw $f16, $f2, $f10 \n\t"
"punpckhhw $f18, $f2, $f10 \n\t"
"punpckhhw $f2, $f0, $f8 \n\t"
"punpcklhw $f0, $f0, $f8 \n\t"
"punpcklhw $f20, $f26, $f30 \n\t"
"punpckhhw $f22, $f26, $f30 \n\t"
"punpckhhw $f26, $f24, $f28 \n\t"
"punpcklhw $f24, $f24, $f28 \n\t"
"punpcklwd $f4, $f2, $f26 \n\t"
"punpckhwd $f6, $f2, $f26 \n\t"
"punpckhwd $f2, $f0, $f24 \n\t"
"punpcklwd $f0, $f0, $f24 \n\t"
"punpcklwd $f8, $f18, $f22 \n\t"
"punpckhwd $f10, $f18, $f22 \n\t"
"punpckhwd $f18, $f16, $f20 \n\t"
"punpcklwd $f16, $f16, $f20 \n\t"
"mov.d $f20, $f2 \n\t"
"mov.d $f22, $f18 \n\t"
"mov.d $f2, $f16 \n\t"
"mov.d $f24, $f6 \n\t"
"mov.d $f26, $f10 \n\t"
"mov.d $f6, $f8 \n\t"
/* $f8 = 32: shift to move each row's Cr word down; scatter 4 bytes per
 * row back to the Cb plane, then the shifted copies to the Cr plane. */
"dli %[iAlpha], 0x20 \n\t"
"daddu $8, %[pPixCb], %[iStride] \n\t"
"gsswlc1 $f0, 0x3(%[pPixCb]) \n\t"
"gsswlc1 $f20, 0x3($8) \n\t"
"gsswrc1 $f0, 0x0(%[pPixCb]) \n\t"
"gsswrc1 $f20, 0x0($8) \n\t"
"daddu $9, $8, %[iStride] \n\t"
"daddu $8, $9, %[iStride] \n\t"
"gsswlc1 $f4, 0x3($9) \n\t"
"gsswlc1 $f24, 0x3($8) \n\t"
"gsswrc1 $f4, 0x0($9) \n\t"
"gsswrc1 $f24, 0x0($8) \n\t"
"daddu $9, $8, %[iStride] \n\t"
"dmtc1 %[iAlpha], $f8 \n\t"
"dsrl $f0, $f0, $f8 \n\t"
"dsrl $f20, $f20, $f8 \n\t"
"dsrl $f4, $f4, $f8 \n\t"
"dsrl $f24, $f24, $f8 \n\t"
"daddu $10, %[pPixCr], %[iStride] \n\t"
"gsswlc1 $f0, 0x3(%[pPixCr]) \n\t"
"gsswlc1 $f20, 0x3($10) \n\t"
"gsswrc1 $f0, 0x0(%[pPixCr]) \n\t"
"gsswrc1 $f20, 0x0($10) \n\t"
"daddu $11, $10, %[iStride] \n\t"
"daddu $10, $11, %[iStride] \n\t"
"gsswlc1 $f4, 0x3($11) \n\t"
"gsswlc1 $f24, 0x3($10) \n\t"
"gsswrc1 $f4, 0x0($11) \n\t"
"gsswrc1 $f24, 0x0($10) \n\t"
"daddu $11, $10, %[iStride] \n\t"
"daddu $8, $9, %[iStride] \n\t"
"gsswlc1 $f2, 0x3($9) \n\t"
"gsswlc1 $f22, 0x3($8) \n\t"
"gsswrc1 $f2, 0x0($9) \n\t"
"gsswrc1 $f22, 0x0($8) \n\t"
"daddu $9, $8, %[iStride] \n\t"
"daddu $8, $9, %[iStride] \n\t"
"gsswlc1 $f6, 0x3($9) \n\t"
"gsswlc1 $f26, 0x3($8) \n\t"
"gsswrc1 $f6, 0x0($9) \n\t"
"gsswrc1 $f26, 0x0($8) \n\t"
"dsrl $f2, $f2, $f8 \n\t"
"dsrl $f22, $f22, $f8 \n\t"
"dsrl $f6, $f6, $f8 \n\t"
"dsrl $f26, $f26, $f8 \n\t"
"daddu $10, $11, %[iStride] \n\t"
"gsswlc1 $f2, 0x3($11) \n\t"
"gsswlc1 $f22, 0x3($10) \n\t"
"gsswrc1 $f2, 0x0($11) \n\t"
"gsswrc1 $f22, 0x0($10) \n\t"
"daddu $11, $10, %[iStride] \n\t"
"daddu $10, $11, %[iStride] \n\t"
"gsswlc1 $f6, 0x3($11) \n\t"
"gsswlc1 $f26, 0x3($10) \n\t"
"gsswrc1 $f6, 0x0($11) \n\t"
"gsswrc1 $f26, 0x0($10) \n\t"
: [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
: [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
[iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp), [pTC]"r"((char *)pTC)
: "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
"$f6", "$f8", "$f10", "$f12","$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
"$f26", "$f28", "$f30"
);
RECOVER_REG;
}
/*!
 * \brief Saturate the first 24 entries of a non-zero-count array to at
 *        most 1, i.e. pNonZeroCount[i] = min(pNonZeroCount[i], 1) for
 *        i in [0, 24), using one MMI pminub per 8-byte group.
 *
 * \param pNonZeroCount array of 24 per-block non-zero coefficient counts
 *                      (bytes, treated as unsigned), updated in place
 */
void WelsNonZeroCount_mmi(int8_t *pNonZeroCount) {
__asm__ volatile(
".set arch=loongson3a \n\t"
/* Load 24 bytes in three unaligned-safe 8-byte chunks. */
"gsldlc1 $f0, 0x7(%[pNonZeroCount]) \n\t"
"gsldlc1 $f2, 0xF(%[pNonZeroCount]) \n\t"
"gsldlc1 $f4, 0x17(%[pNonZeroCount]) \n\t"
"gsldrc1 $f4, 0x10(%[pNonZeroCount]) \n\t"
"gsldrc1 $f0, 0x0(%[pNonZeroCount]) \n\t"
"gsldrc1 $f2, 0x8(%[pNonZeroCount]) \n\t"
/* Build a vector of 0x01 bytes: all-ones halfwords >> 15 gives 0x0001
 * per lane, then packushb narrows that to 0x01 in every byte. */
"pcmpeqh $f8, $f8, $f8 \n\t"
"dli $8, 0xF \n\t"
"dmtc1 $8, $f6 \n\t"
"psrlh $f8, $f8, $f6 \n\t"
"packushb $f8, $f8, $f8 \n\t"
/* Unsigned byte-wise min against 1 clamps every count to 0 or 1. */
"pminub $f0, $f0, $f8 \n\t"
"pminub $f2, $f2, $f8 \n\t"
"pminub $f4, $f4, $f8 \n\t"
/* Store the clamped 24 bytes back. */
"gssdlc1 $f0, 0x7(%[pNonZeroCount]) \n\t"
"gssdlc1 $f2, 0xF(%[pNonZeroCount]) \n\t"
"gssdlc1 $f4, 0x17(%[pNonZeroCount]) \n\t"
"gssdrc1 $f0, 0x0(%[pNonZeroCount]) \n\t"
"gssdrc1 $f2, 0x8(%[pNonZeroCount]) \n\t"
"gssdrc1 $f4, 0x10(%[pNonZeroCount]) \n\t"
:
: [pNonZeroCount] "r"((unsigned char *)pNonZeroCount)
: "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8"
);
}