ref: 0fc550221585d252f2beb3b6720469875673910b
dir: /codec/encoder/core/mips/quant_mmi.c/
/*! * \copy * Copyright (c) 2009-2018, Cisco Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * * \file quant_mmi.c * * \brief Loongson optimization * * \date 20/07/2018 Created * ************************************************************************************* */ #include <stdint.h> #include "asmdefs_mmi.h" void WelsQuant4x4_mmi(int16_t *pDct, const int16_t* ff, const int16_t *mf) { __asm__ volatile ( ".set arch=loongson3a \n\t" "xor $f10, $f10, $f10 \n\t" "gslqc1 $f10, $f8, 0x0(%[ff]) \n\t" "gslqc1 $f14, $f12, 0x0(%[mf]) \n\t" "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f10 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f14 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t" "gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f10 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f14 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t" : : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf) : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14" ); } void WelsQuant4x4Dc_mmi(int16_t *pDct, const int16_t ff, int16_t mf) { __asm__ volatile ( ".set arch=loongson3a \n\t" "xor $f10, $f10, $f10 \n\t" "dmtc1 %[mf], $f12 \n\t" "pshufh $f12, $f12, $f10 \n\t" "dmtc1 %[ff], $f8 \n\t" "pshufh $f8, $f8, $f10 \n\t" "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f8 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f12 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t" "gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f8 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f12 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t" : : [pDct]"r"((short *)pDct), [ff]"r"((short)ff), [mf]"r"((short)mf) : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12" ); } void WelsQuantFour4x4_mmi(int16_t *pDct, const int16_t* ff, const int16_t *mf) { __asm__ volatile ( ".set arch=loongson3a \n\t" "gslqc1 $f10, $f8, 0x0(%[ff]) \n\t" "gslqc1 $f14, $f12, 0x0(%[mf]) \n\t" "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f10 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f14 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t" "gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f10 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f14 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t" "gslqc1 $f2, $f0, 0x20(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f10 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f14 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x20(%[pDct]) \n\t" "gslqc1 $f2, $f0, 0x30(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f10 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f14 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x30(%[pDct]) \n\t" "gslqc1 $f2, $f0, 0x40(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f10 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f14 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x40(%[pDct]) \n\t" "gslqc1 $f2, $f0, 0x50(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f10 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f14 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x50(%[pDct]) \n\t" "gslqc1 $f2, $f0, 0x60(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f10 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f14 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x60(%[pDct]) \n\t" "gslqc1 $f2, $f0, 0x70(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f10 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f14 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x70(%[pDct]) \n\t" : : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf) : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14" ); } void WelsQuantFour4x4Max_mmi(int16_t *pDct, const int16_t*ff, const int16_t *mf, int16_t *max) { BACKUP_REG; __asm__ volatile ( ".set arch=loongson3a \n\t" "gslqc1 $f10, $f8, 0x0(%[ff]) \n\t" "gslqc1 $f14, $f12, 0x0(%[mf]) \n\t" "xor $f16, $f16, $f16 \n\t" "xor $f18, $f18, $f18 \n\t" "xor $f20, $f20, $f20 \n\t" "xor $f22, $f22, $f22 \n\t" "xor $f24, $f24, $f24 \n\t" "xor $f26, $f26, $f26 \n\t" "xor $f28, $f28, $f28 \n\t" "xor $f30, $f30, $f30 \n\t" "gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f10 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f14 \n\t" "pmaxsh $f16, $f16, $f0 \n\t" "pmaxsh $f18, $f18, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t" "gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f10 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f14 \n\t" "pmaxsh $f16, $f16, $f0 \n\t" "pmaxsh $f18, $f18, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t" "gslqc1 $f2, $f0, 0x20(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f10 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f14 \n\t" "pmaxsh $f20, $f20, $f0 \n\t" "pmaxsh $f22, $f22, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x20(%[pDct]) \n\t" "gslqc1 $f2, $f0, 0x30(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f10 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f14 \n\t" "pmaxsh $f20, $f20, $f0 \n\t" "pmaxsh $f22, $f22, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x30(%[pDct]) \n\t" "gslqc1 $f2, $f0, 0x40(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f10 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f14 \n\t" "pmaxsh $f24, $f24, $f0 \n\t" "pmaxsh $f26, $f26, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x40(%[pDct]) \n\t" "gslqc1 $f2, $f0, 0x50(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f10 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f14 \n\t" "pmaxsh $f24, $f24, $f0 \n\t" "pmaxsh $f26, $f26, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x50(%[pDct]) \n\t" "gslqc1 $f2, $f0, 0x60(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f10 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f14 \n\t" "pmaxsh $f28, $f28, $f0 \n\t" "pmaxsh $f30, $f30, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x60(%[pDct]) \n\t" "gslqc1 $f2, $f0, 0x70(%[pDct]) \n\t" "xor $f4, $f4, $f4 \n\t" "xor $f6, $f6, $f6 \n\t" "pcmpgth $f4, $f4, $f0 \n\t" "pcmpgth $f6, $f6, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "paddush $f0, $f0, $f8 \n\t" "paddush $f2, $f2, $f10 \n\t" "pmulhuh $f0, $f0, $f12 \n\t" "pmulhuh $f2, $f2, $f14 \n\t" "pmaxsh $f28, $f28, $f0 \n\t" "pmaxsh $f30, $f30, $f2 \n\t" "xor $f0, $f0, $f4 \n\t" "xor $f2, $f2, $f6 \n\t" "psubh $f0, $f0, $f4 \n\t" "psubh $f2, $f2, $f6 \n\t" "gssqc1 $f2, $f0, 0x70(%[pDct]) \n\t" "mov.d $f0, $f18 \n\t" "punpckhhw $f18, $f16, $f20 \n\t" "punpcklhw $f16, $f16, $f20 \n\t" "punpckhhw $f2, $f0, $f22 \n\t" "punpcklhw $f0, $f0, $f22 \n\t" "mov.d $f20, $f26 \n\t" "punpckhhw $f26, $f24, $f28 \n\t" "punpcklhw $f24, $f24, $f28 \n\t" "punpckhhw $f22, $f20, $f30 \n\t" "punpcklhw $f20, $f20, $f30 \n\t" "mov.d $f28, $f18 \n\t" "punpckhwd $f18, $f16, $f24 \n\t" "punpcklwd $f16, $f16, $f24 \n\t" "punpckhwd $f30, $f28, $f26 \n\t" "punpcklwd $f28, $f28, $f26 \n\t" "mov.d $f24, $f2 \n\t" "punpckhwd $f2, $f0, $f20 \n\t" "punpcklwd $f0, $f0, $f20 \n\t" "punpckhwd $f26, $f24, $f22 \n\t" "punpcklwd $f24, $f24, $f22 \n\t" "mov.d $f20, $f18 \n\t" "mov.d $f18, $f0 \n\t" "mov.d $f22, $f2 \n\t" "mov.d $f0, $f30 \n\t" "mov.d $f30, $f24 \n\t" "mov.d $f2, $f26 \n\t" "pmaxsh $f0, $f0, $f16 \n\t" "pmaxsh $f2, $f2, $f18 \n\t" "pmaxsh $f0, $f0, $f20 \n\t" "pmaxsh $f2, $f2, $f22 \n\t" "pmaxsh $f0, $f0, $f28 \n\t" "pmaxsh $f2, $f2, $f30 \n\t" "mov.d $f4, $f0 \n\t" "mov.d $f6, $f2 \n\t" "mov.d $f0, $f2 \n\t" "mov.d $f2, $f6 \n\t" "pmaxsh $f0, $f0, $f4 \n\t" "pmaxsh $f2, $f2, $f6 \n\t" "gssdlc1 $f0, 0x7(%[max]) \n\t" "gssdrc1 $f0, 0x0(%[max]) \n\t" : : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf), [max]"r"((short *)max) : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30" ); RECOVER_REG; }