/*!
* \copy
* Copyright (c) 2009-2018, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*
* \file quant_mmi.c
*
* \brief Loongson optimization
*
* \date 20/07/2018 Created
*
*************************************************************************************
*/
#include <stdint.h>
#include "asmdefs_mmi.h"
void WelsQuant4x4_mmi(int16_t* pDct, const int16_t* ff, const int16_t* mf) {
__asm__ volatile (
".set arch=loongson3a \n\t"
"xor $f10, $f10, $f10 \n\t"
"gslqc1 $f10, $f8, 0x0(%[ff]) \n\t"
"gslqc1 $f14, $f12, 0x0(%[mf]) \n\t"
"gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f10 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f14 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
"gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f10 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f14 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
:
: [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf)
: "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
);
}
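
/* Quantize a 4x4 block of DC coefficients in place, using a single rounding
 * offset ff and multiplier mf broadcast to every lane. */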
void WelsQuant4x4Dc_mmi(int16_t* pDct, int16_t ff, int16_t mf) {
__asm__ volatile (
".set arch=loongson3a \n\t"
"xor $f10, $f10, $f10 \n\t"
"dmtc1 %[mf], $f12 \n\t"
"pshufh $f12, $f12, $f10 \n\t"
"dmtc1 %[ff], $f8 \n\t"
"pshufh $f8, $f8, $f10 \n\t"
"gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f8 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f12 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
"gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f8 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f12 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
:
: [pDct]"r"((short *)pDct), [ff]"r"((short)ff), [mf]"r"((short)mf)
: "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12"
);
}
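
/* Quantize four consecutive 4x4 blocks (64 coefficients) in place; ff and mf
 * are the same eight-lane constant vectors as in WelsQuant4x4_mmi. */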
void WelsQuantFour4x4_mmi(int16_t* pDct, const int16_t* ff, const int16_t* mf) {
__asm__ volatile (
".set arch=loongson3a \n\t"
"gslqc1 $f10, $f8, 0x0(%[ff]) \n\t"
"gslqc1 $f14, $f12, 0x0(%[mf]) \n\t"
"gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f10 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f14 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
"gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f10 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f14 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
"gslqc1 $f2, $f0, 0x20(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f10 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f14 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x20(%[pDct]) \n\t"
"gslqc1 $f2, $f0, 0x30(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f10 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f14 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x30(%[pDct]) \n\t"
"gslqc1 $f2, $f0, 0x40(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f10 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f14 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x40(%[pDct]) \n\t"
"gslqc1 $f2, $f0, 0x50(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f10 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f14 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x50(%[pDct]) \n\t"
"gslqc1 $f2, $f0, 0x60(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f10 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f14 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x60(%[pDct]) \n\t"
"gslqc1 $f2, $f0, 0x70(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f10 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f14 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x70(%[pDct]) \n\t"
:
: [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf)
: "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
);
}
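
/* Same as WelsQuantFour4x4_mmi, but additionally writes the largest quantized
 * magnitude of each of the four blocks to max[0..3]. BACKUP_REG/RECOVER_REG
 * (from asmdefs_mmi.h) preserve callee-saved FP registers around the asm
 * block, which clobbers $f16-$f30. */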
void WelsQuantFour4x4Max_mmi(int16_t* pDct, const int16_t* ff,
                             const int16_t* mf, int16_t* max) {
BACKUP_REG;
__asm__ volatile (
".set arch=loongson3a \n\t"
"gslqc1 $f10, $f8, 0x0(%[ff]) \n\t"
"gslqc1 $f14, $f12, 0x0(%[mf]) \n\t"
"xor $f16, $f16, $f16 \n\t"
"xor $f18, $f18, $f18 \n\t"
"xor $f20, $f20, $f20 \n\t"
"xor $f22, $f22, $f22 \n\t"
"xor $f24, $f24, $f24 \n\t"
"xor $f26, $f26, $f26 \n\t"
"xor $f28, $f28, $f28 \n\t"
"xor $f30, $f30, $f30 \n\t"
"gslqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f10 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f14 \n\t"
"pmaxsh $f16, $f16, $f0 \n\t"
"pmaxsh $f18, $f18, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x0(%[pDct]) \n\t"
"gslqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f10 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f14 \n\t"
"pmaxsh $f16, $f16, $f0 \n\t"
"pmaxsh $f18, $f18, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x10(%[pDct]) \n\t"
"gslqc1 $f2, $f0, 0x20(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f10 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f14 \n\t"
"pmaxsh $f20, $f20, $f0 \n\t"
"pmaxsh $f22, $f22, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x20(%[pDct]) \n\t"
"gslqc1 $f2, $f0, 0x30(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f10 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f14 \n\t"
"pmaxsh $f20, $f20, $f0 \n\t"
"pmaxsh $f22, $f22, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x30(%[pDct]) \n\t"
"gslqc1 $f2, $f0, 0x40(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f10 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f14 \n\t"
"pmaxsh $f24, $f24, $f0 \n\t"
"pmaxsh $f26, $f26, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x40(%[pDct]) \n\t"
"gslqc1 $f2, $f0, 0x50(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f10 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f14 \n\t"
"pmaxsh $f24, $f24, $f0 \n\t"
"pmaxsh $f26, $f26, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x50(%[pDct]) \n\t"
"gslqc1 $f2, $f0, 0x60(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f10 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f14 \n\t"
"pmaxsh $f28, $f28, $f0 \n\t"
"pmaxsh $f30, $f30, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x60(%[pDct]) \n\t"
"gslqc1 $f2, $f0, 0x70(%[pDct]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"xor $f6, $f6, $f6 \n\t"
"pcmpgth $f4, $f4, $f0 \n\t"
"pcmpgth $f6, $f6, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"paddush $f0, $f0, $f8 \n\t"
"paddush $f2, $f2, $f10 \n\t"
"pmulhuh $f0, $f0, $f12 \n\t"
"pmulhuh $f2, $f2, $f14 \n\t"
"pmaxsh $f28, $f28, $f0 \n\t"
"pmaxsh $f30, $f30, $f2 \n\t"
"xor $f0, $f0, $f4 \n\t"
"xor $f2, $f2, $f6 \n\t"
"psubh $f0, $f0, $f4 \n\t"
"psubh $f2, $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x70(%[pDct]) \n\t"
"mov.d $f0, $f18 \n\t"
"punpckhhw $f18, $f16, $f20 \n\t"
"punpcklhw $f16, $f16, $f20 \n\t"
"punpckhhw $f2, $f0, $f22 \n\t"
"punpcklhw $f0, $f0, $f22 \n\t"
"mov.d $f20, $f26 \n\t"
"punpckhhw $f26, $f24, $f28 \n\t"
"punpcklhw $f24, $f24, $f28 \n\t"
"punpckhhw $f22, $f20, $f30 \n\t"
"punpcklhw $f20, $f20, $f30 \n\t"
"mov.d $f28, $f18 \n\t"
"punpckhwd $f18, $f16, $f24 \n\t"
"punpcklwd $f16, $f16, $f24 \n\t"
"punpckhwd $f30, $f28, $f26 \n\t"
"punpcklwd $f28, $f28, $f26 \n\t"
"mov.d $f24, $f2 \n\t"
"punpckhwd $f2, $f0, $f20 \n\t"
"punpcklwd $f0, $f0, $f20 \n\t"
"punpckhwd $f26, $f24, $f22 \n\t"
"punpcklwd $f24, $f24, $f22 \n\t"
"mov.d $f20, $f18 \n\t"
"mov.d $f18, $f0 \n\t"
"mov.d $f22, $f2 \n\t"
"mov.d $f0, $f30 \n\t"
"mov.d $f30, $f24 \n\t"
"mov.d $f2, $f26 \n\t"
"pmaxsh $f0, $f0, $f16 \n\t"
"pmaxsh $f2, $f2, $f18 \n\t"
"pmaxsh $f0, $f0, $f20 \n\t"
"pmaxsh $f2, $f2, $f22 \n\t"
"pmaxsh $f0, $f0, $f28 \n\t"
"pmaxsh $f2, $f2, $f30 \n\t"
"mov.d $f4, $f0 \n\t"
"mov.d $f6, $f2 \n\t"
"mov.d $f0, $f2 \n\t"
"mov.d $f2, $f6 \n\t"
"pmaxsh $f0, $f0, $f4 \n\t"
"pmaxsh $f2, $f2, $f6 \n\t"
"gssdlc1 $f0, 0x7(%[max]) \n\t"
"gssdrc1 $f0, 0x0(%[max]) \n\t"
:
: [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf),
[max]"r"((short *)max)
: "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14",
"$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
);
RECOVER_REG;
}