/*!
* \copy
* Copyright (c) 2009-2018, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*
* \file vaa_mmi.c
*
* \brief Loongson optimization
*
* \date 23/07/2018 Created
*
*************************************************************************************
*/
#include <stdint.h>
#include "asmdefs_mmi.h"
// The registers passed as f4 and f6 must be preloaded with 0x1 (pshufh
// control word) and 0x8 (shift count); hardware $f4/$f6 are clobbered as
// scratch. Reduces the bytes of #f0 and #f2 to their horizontal maximum,
// left in the low byte of each.
#define WELS_MAX_REG_MMI(f0, f2, f4, f6) \
"punpckhwd $f4, "#f0", "#f0" \n\t" \
"punpckhwd $f6, "#f2", "#f2" \n\t" \
"pmaxub "#f0", "#f0", $f4 \n\t" \
"pmaxub "#f2", "#f2", $f6 \n\t" \
"pshufh $f4, "#f0", "#f4" \n\t" \
"pshufh $f6, "#f2", "#f4" \n\t" \
"pmaxub "#f0", "#f0", $f4 \n\t" \
"pmaxub "#f2", "#f2", $f6 \n\t" \
"dsrl $f4, "#f0", "#f6" \n\t" \
"dsrl $f6, "#f2", "#f6" \n\t" \
"pmaxub "#f0", "#f0", $f4 \n\t" \
"pmaxub "#f2", "#f2", $f6 \n\t"
#define WELS_SAD_SD_MAD_16x1_MMI(f0, f2, f4, f6, f8, f10, f12, f14, r0, r1, r2) \
"gslqc1 $f6, $f4, 0x0("#r0") \n\t" \
"gslqc1 $f10, $f8, 0x0("#r1") \n\t" \
"pasubub $f12, $f4, $f0 \n\t" \
"pasubub $f14, $f6, $f2 \n\t" \
"biadd $f12, $f12 \n\t" \
"biadd $f14, $f14 \n\t" \
"paddw "#f4", "#f4", $f12 \n\t" \
"paddw "#f6", "#f6", $f14 \n\t" \
"pasubub $f12, $f8, $f0 \n\t" \
"pasubub $f14, $f10, $f2 \n\t" \
"biadd $f12, $f12 \n\t" \
"biadd $f14, $f14 \n\t" \
"paddw "#f8", "#f8", $f12 \n\t" \
"paddw "#f10", "#f10", $f14 \n\t" \
"pasubub $f12, $f4, $f8 \n\t" \
"pasubub $f14, $f6, $f10 \n\t" \
"pmaxub "#f12", "#f12", $f12 \n\t" \
"pmaxub "#f14", "#f14", $f14 \n\t" \
"pasubub $f12, $f12, $f0 \n\t" \
"pasubub $f14, $f14, $f2 \n\t" \
"biadd $f12, $f12 \n\t" \
"biadd $f14, $f14 \n\t" \
"paddw "#f0", "#f0", $f12 \n\t" \
"paddw "#f2", "#f2", $f14 \n\t" \
PTR_ADDU ""#r0", "#r0", "#r2" \n\t" \
PTR_ADDU ""#r1", "#r1", "#r2" \n\t"
#define WELS_SAD_16x2_MMI(f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, r1, r2, r3) \
"gslqc1 "#f1", "#f2", 0x00("#r1") \n\t" \
"gslqc1 "#f3", "#f4", 0x00("#r2") \n\t" \
PTR_ADDU ""#r1", "#r1", "#r3" \n\t" \
"gslqc1 "#f5", "#f6", 0x00("#r1") \n\t" \
PTR_ADDU ""#r2", "#r2", "#r3" \n\t" \
"gslqc1 "#f7", "#f8", 0x00("#r2") \n\t" \
"pasubub "#f1", "#f1", "#f3" \n\t" \
"pasubub "#f2", "#f2", "#f4" \n\t" \
"biadd "#f1", "#f1" \n\t" \
"biadd "#f2", "#f2" \n\t" \
"pasubub "#f5", "#f5", "#f7" \n\t" \
"pasubub "#f6", "#f6", "#f8" \n\t" \
"biadd "#f5", "#f5" \n\t" \
"biadd "#f6", "#f6" \n\t" \
"paddw "#f9", "#f9", "#f1" \n\t" \
"paddw "#f9", "#f9", "#f5" \n\t" \
"paddw "#f10", "#f10", "#f2" \n\t" \
"paddw "#f10", "#f10", "#f6" \n\t" \
PTR_ADDU ""#r1", "#r1", "#r3" \n\t" \
PTR_ADDU ""#r2", "#r2", "#r3" \n\t"
#define WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI(r0, r1, r2) \
"gslqc1 $f6, $f4, 0x0("#r0") \n\t" \
"gslqc1 $f10, $f8, 0x0("#r1") \n\t" \
"pasubub $f12, $f4, $f8 \n\t" \
"pasubub $f14, $f6, $f10 \n\t" \
"biadd $f12, $f12 \n\t" \
"biadd $f14, $f14 \n\t" \
"paddw $f28, $f28, $f12 \n\t" \
"paddw $f30, $f30, $f14 \n\t" \
"pasubub $f12, $f4, $f8 \n\t" \
"pasubub $f14, $f6, $f10 \n\t" \
"pasubub $f8, $f4, $f0 \n\t" \
"pasubub $f10, $f6, $f2 \n\t" \
"biadd $f8, $f8 \n\t" \
"biadd $f10, $f10 \n\t" \
"paddw $f24, $f24, $f8 \n\t" \
"paddw $f26, $f26, $f10 \n\t" \
"punpcklbh $f8, $f6, $f2 \n\t" \
"punpckhbh $f10, $f6, $f2 \n\t" \
"punpckhbh $f6, $f4, $f0 \n\t" \
"punpcklbh $f4, $f4, $f0 \n\t" \
"pmaddhw $f4, $f4, $f4 \n\t" \
"pmaddhw $f6, $f6, $f6 \n\t" \
"pmaddhw $f8, $f8, $f8 \n\t" \
"pmaddhw $f10, $f10, $f10 \n\t" \
"paddw $f20, $f20, $f4 \n\t" \
"paddw $f22, $f22, $f6 \n\t" \
"paddw $f20, $f20, $f8 \n\t" \
"paddw $f22, $f22, $f10 \n\t" \
"punpcklbh $f4, $f12, $f0 \n\t" \
"punpckhbh $f6, $f12, $f0 \n\t" \
"punpcklbh $f12, $f14, $f2 \n\t" \
"punpckhbh $f14, $f14, $f2 \n\t" \
"pmaddhw $f4, $f4, $f4 \n\t" \
"pmaddhw $f6, $f6, $f6 \n\t" \
"pmaddhw $f12, $f12, $f12 \n\t" \
"pmaddhw $f14, $f14, $f14 \n\t" \
"paddw $f16, $f16, $f4 \n\t" \
"paddw $f18, $f18, $f6 \n\t" \
"paddw $f16, $f16, $f12 \n\t" \
"paddw $f18, $f18, $f14 \n\t" \
PTR_ADDU ""#r0", "#r0", "#r2" \n\t" \
PTR_ADDU ""#r1", "#r1", "#r2" \n\t"
#define WELS_SAD_BGD_SQDIFF_16x1_MMI(f0, f2, f4, f6, f8, f10, f12, f14, r0, r1, r2) \
"gslqc1 $f6, $f4, 0x0("#r0") \n\t" \
"punpcklbh $f8, $f4, $f0 \n\t" \
"punpckhbh $f10, $f4, $f0 \n\t" \
"punpcklbh $f12, $f6, $f2 \n\t" \
"punpckhbh $f14, $f6, $f2 \n\t" \
"pmaddhw $f8, $f8, $f8 \n\t" \
"pmaddhw $f10, $f10, $f10 \n\t" \
"pmaddhw $f12, $f12, $f12 \n\t" \
"pmaddhw $f14, $f14, $f14 \n\t" \
"paddw $f8, $f8, $f12 \n\t" \
"paddw $f10, $f10, $f14 \n\t" \
"punpckhwd $f12, $f0, $f8 \n\t" \
"punpckhwd $f14, $f0, $f10 \n\t" \
"punpcklwd $f8, $f0, $f8 \n\t" \
"punpcklwd $f10, $f0, $f10 \n\t" \
"paddw $f8, $f8, $f12 \n\t" \
"paddw $f10, $f10, $f14 \n\t" \
"paddw "#f0", "#f0", $f8 \n\t" \
"paddw "#f2", "#f2", $f10 \n\t" \
"gslqc1 $f10, $f8, 0x0("#r1") \n\t" \
"pasubub $f12, $f4, $f0 \n\t" \
"pasubub $f14, $f6, $f2 \n\t" \
"biadd $f12, $f12 \n\t" \
"biadd $f14, $f14 \n\t" \
"paddw "#f4", "#f4", $f12 \n\t" \
"paddw "#f6", "#f6", $f14 \n\t" \
"pasubub $f12, $f8, $f0 \n\t" \
"pasubub $f14, $f10, $f2 \n\t" \
"biadd $f12, $f12 \n\t" \
"biadd $f14, $f14 \n\t" \
"punpcklwd $f14, $f14, $f14 \n\t" \
"punpckhwd $f14, $f12, $f14 \n\t" \
"punpcklwd $f12, $f0, $f12 \n\t" \
"paddw "#f4", "#f4", $f12 \n\t" \
"paddw "#f6", "#f6", $f14 \n\t" \
"pasubub $f12, $f4, $f8 \n\t" \
"pasubub $f14, $f6, $f10 \n\t" \
"pmaxub "#f8", "#f8", $f12 \n\t" \
"pmaxub "#f10", "#f10", $f14 \n\t" \
"paddw $f4, $f0, $f12 \n\t" \
"paddw $f6, $f0, $f14 \n\t" \
"pasubub $f12, $f12, $f0 \n\t" \
"pasubub $f14, $f14, $f2 \n\t" \
"biadd $f12, $f12 \n\t" \
"biadd $f14, $f14 \n\t" \
"paddw "#f0", "#f0", $f12 \n\t" \
"paddw "#f2", "#f2", $f14 \n\t" \
"paddw $f12, $f0, $f4 \n\t" \
"paddw $f14, $f0, $f6 \n\t" \
"punpcklbh $f4, $f12, $f0 \n\t" \
"punpckhbh $f6, $f12, $f0 \n\t" \
"punpcklbh $f12, $f14, $f2 \n\t" \
"punpckhbh $f14, $f14, $f2 \n\t" \
"pmaddhw $f4, $f4, $f4 \n\t" \
"pmaddhw $f6, $f6, $f6 \n\t" \
"pmaddhw $f12, $f12, $f12 \n\t" \
"pmaddhw $f14, $f14, $f14 \n\t" \
"paddw "#f12", "#f12", $f4 \n\t" \
"paddw "#f14", "#f14", $f6 \n\t" \
"paddw "#f12", "#f12", $f12 \n\t" \
"paddw "#f14", "#f14", $f14 \n\t" \
PTR_ADDU ""#r0", "#r0", "#r2" \n\t" \
PTR_ADDU ""#r1", "#r1", "#r2" \n\t"
#define WELS_SAD_SUM_SQSUM_16x1_MMI(r0, r1, r2) \
"gslqc1 $f6, $f4, 0x0("#r0") \n\t" \
"gslqc1 $f10, $f8, 0x0("#r1") \n\t" \
"pasubub $f12, $f4, $f8 \n\t" \
"pasubub $f14, $f6, $f10 \n\t" \
"biadd $f12, $f12 \n\t" \
"biadd $f14, $f14 \n\t" \
"paddw $f24, $f24, $f12 \n\t" \
"paddw $f26, $f26, $f14 \n\t" \
"pasubub $f12, $f4, $f0 \n\t" \
"pasubub $f14, $f6, $f2 \n\t" \
"biadd $f12, $f12 \n\t" \
"biadd $f14, $f14 \n\t" \
"paddw $f20, $f20, $f12 \n\t" \
"paddw $f22, $f22, $f14 \n\t" \
"punpcklbh $f8, $f6, $f2 \n\t" \
"punpckhbh $f10, $f6, $f2 \n\t" \
"punpckhbh $f6, $f4, $f0 \n\t" \
"punpcklbh $f4, $f4, $f0 \n\t" \
"pmaddhw $f4, $f4, $f4 \n\t" \
"pmaddhw $f6, $f6, $f6 \n\t" \
"pmaddhw $f8, $f8, $f8 \n\t" \
"pmaddhw $f10, $f10, $f10 \n\t" \
"paddw $f16, $f16, $f4 \n\t" \
"paddw $f18, $f18, $f6 \n\t" \
"paddw $f16, $f16, $f8 \n\t" \
"paddw $f18, $f18, $f10 \n\t" \
PTR_ADDU ""#r0", "#r0", "#r2" \n\t" \
PTR_ADDU ""#r1", "#r1", "#r2" \n\t"
void VAACalcSad_mmi(const uint8_t* pCurData, const uint8_t* pRefData,
int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
int32_t* pFrameSad, int32_t* pSad8x8) {
double ftmp[13];
uint64_t tmp[2];
mips_reg addr[3];
__asm__ volatile (
".set arch=loongson3a \n\t"
PTR_SRL "%[iPicWidth], %[iPicWidth], 0x04 \n\t"
PTR_SRL "%[iPicHeight], %[iPicHeight], 0x04 \n\t"
"move %[addr2], %[iPicStride] \n\t"
PTR_SLL "%[iPicStride], %[iPicStride], 0x04 \n\t"
"xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
"xor %[ftmp11], %[ftmp11], %[ftmp11] \n\t"
"xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t"
"1: \n\t"
"move %[addr0], %[pCurData] \n\t"
"move %[addr1], %[pRefData] \n\t"
"move %[tmp0], %[iPicWidth] \n\t"
"2: \n\t"
"xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
"xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
%[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
%[addr0], %[addr1], %[addr2])
WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
%[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
%[addr0], %[addr1], %[addr2])
WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
%[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
%[addr0], %[addr1], %[addr2])
WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
%[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
%[addr0], %[addr1], %[addr2])
"paddw %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
"paddw %[ftmp12], %[ftmp12], %[ftmp10] \n\t"
"swc1 %[ftmp10], 0x00(%[pSad8x8]) \n\t"
"swc1 %[ftmp9], 0x04(%[pSad8x8]) \n\t"
"xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t"
"xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t"
WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
%[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
%[addr0], %[addr1], %[addr2])
WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
%[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
%[addr0], %[addr1], %[addr2])
WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
%[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
%[addr0], %[addr1], %[addr2])
WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
%[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
%[addr0], %[addr1], %[addr2])
"paddw %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
"paddw %[ftmp12], %[ftmp12], %[ftmp10] \n\t"
"swc1 %[ftmp10], 0x08(%[pSad8x8]) \n\t"
"swc1 %[ftmp9], 0x0c(%[pSad8x8]) \n\t"
PTR_ADDU "%[pSad8x8], %[pSad8x8], 0x10 \n\t"
PTR_SUBU "%[addr0], %[addr0], %[iPicStride] \n\t"
PTR_SUBU "%[addr1], %[addr1], %[iPicStride] \n\t"
PTR_ADDI "%[tmp0], %[tmp0], -0x01 \n\t"
PTR_ADDU "%[addr0], %[addr0], 0x10 \n\t"
PTR_ADDU "%[addr1], %[addr1], 0x10 \n\t"
"bnez %[tmp0], 2b \n\t"
PTR_ADDI "%[iPicHeight], %[iPicHeight], -0x01 \n\t"
PTR_ADDU "%[pCurData], %[pCurData], %[iPicStride] \n\t"
PTR_ADDU "%[pRefData], %[pRefData], %[iPicStride] \n\t"
"bnez %[iPicHeight], 1b \n\t"
"paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
"swc1 %[ftmp11], 0x00(%[pFrameSad]) \n\t"
: [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
[ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
[ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
[ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
[ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
[ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]),
[addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
[pCurData]"+&r"(pCurData), [pRefData]"+&r"(pRefData),
[iPicHeight]"+&r"(iPicHeight), [iPicWidth]"+&r"(iPicWidth),
[pSad8x8]"+&r"(pSad8x8), [iPicStride]"+&r"(iPicStride),
[addr2]"=&r"(addr[2])
: [pFrameSad]"r"(pFrameSad)
: "memory"
);
}
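// As VAACalcSad_mmi, but also emits per-8x8-block background-detection data:
// p_sd8x8 holds sum(cur) - sum(ref) for each block and p_mad8x8 the maximum
// absolute pixel difference (MAD). psadframe receives the whole-frame SAD.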
void VAACalcSadBgd_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8,
uint8_t *p_mad8x8) {
BACKUP_REG;
__asm__ volatile (
".set arch=loongson3a \n\t"
"move $15, %[cur_data] \n\t"
"dsrl %[iPicWidth], %[iPicWidth], 0x4 \n\t"
"dsrl %[iPicHeight], %[iPicHeight], 0x4 \n\t"
"dsll $13, %[iPicStride], 0x4 \n\t"
"xor $f0, $f0, $f0 \n\t"
"xor $f2, $f2, $f2 \n\t"
"xor $14, $14, $14 \n\t"
"1: \n\t"
"move $9, %[iPicWidth] \n\t"
"move $10, $15 \n\t"
"move $11, %[ref_data] \n\t"
"2: \n\t"
"xor $f28, $f28, $f28 \n\t"
"xor $f30, $f30, $f30 \n\t"
"xor $f24, $f24, $f24 \n\t"
"xor $f26, $f26, $f26 \n\t"
"xor $f20, $f20, $f20 \n\t"
"xor $f22, $f22, $f22 \n\t"
"xor $f16, $f16, $f16 \n\t"
"xor $f18, $f18, $f18 \n\t"
WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
$15, %[ref_data], %[iPicStride])
WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
$15, %[ref_data], %[iPicStride])
WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
$15, %[ref_data], %[iPicStride])
WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
$15, %[ref_data], %[iPicStride])
WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
$15, %[ref_data], %[iPicStride])
WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
$15, %[ref_data], %[iPicStride])
WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
$15, %[ref_data], %[iPicStride])
WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
$15, %[ref_data], %[iPicStride])
"dli $8, 0x1 \n\t"
"dmtc1 $8, $f8 \n\t"
"dli $8, 0x8 \n\t"
"dmtc1 $8, $f10 \n\t"
WELS_MAX_REG_MMI($f16, $f18, $f8, $f10)
"dmfc1 $8, $f16 \n\t"
"sb $8, 0x0(%[p_mad8x8]) \n\t"
"dmfc1 $8, $f18 \n\t"
"sb $8, 0x1(%[p_mad8x8]) \n\t"
PTR_ADDIU "%[p_mad8x8], %[p_mad8x8], 0x2 \n\t"
"xor $f16, $f16, $f16 \n\t"
"xor $f18, $f18, $f18 \n\t"
"punpcklwd $f30, $f30, $f30 \n\t"
"punpcklwd $f26, $f26, $f26 \n\t"
"punpcklwd $f22, $f22, $f22 \n\t"
"punpckhwd $f30, $f28, $f30 \n\t"
"punpckhwd $f26, $f24, $f26 \n\t"
"punpckhwd $f22, $f20, $f22 \n\t"
"punpcklwd $f28, $f16, $f28 \n\t"
"punpcklwd $f24, $f16, $f24 \n\t"
"punpcklwd $f20, $f16, $f20 \n\t"
WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
$15, %[ref_data], %[iPicStride])
WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
$15, %[ref_data], %[iPicStride])
WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
$15, %[ref_data], %[iPicStride])
WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
$15, %[ref_data], %[iPicStride])
WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
$15, %[ref_data], %[iPicStride])
WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
$15, %[ref_data], %[iPicStride])
WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
$15, %[ref_data], %[iPicStride])
WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
$15, %[ref_data], %[iPicStride])
"dli $8, 0x1 \n\t"
"dmtc1 $8, $f8 \n\t"
"dli $8, 0x8 \n\t"
"dmtc1 $8, $f10 \n\t"
WELS_MAX_REG_MMI($f16, $f18, $f8, $f10)
"dmfc1 $8, $f16 \n\t"
"sb $8, 0x0(%[p_mad8x8]) \n\t"
"dmfc1 $8, $f18 \n\t"
"sb $8, 0x1(%[p_mad8x8]) \n\t"
"punpckhwd $f4, $f28, $f30 \n\t"
PTR_ADDIU "%[p_mad8x8], %[p_mad8x8], 0x2 \n\t"
"punpcklwd $f6, $f28, $f30 \n\t"
"gssqc1 $f6, $f4, 0x0(%[psad8x8]) \n\t"
PTR_ADDIU "%[psad8x8], %[psad8x8], 0x10 \n\t"
"paddw $f6, $f6, $f30 \n\t"
"paddw $f4, $f4, $f28 \n\t"
"punpckhwd $f8, $f6, $f6 \n\t"
"paddw $f4, $f4, $f8 \n\t"
"dmtc1 $14, $f6 \n\t"
"paddw $f6, $f6, $f4 \n\t"
"dmfc1 $14, $f6 \n\t"
"psubw $f24, $f24, $f20 \n\t"
"psubw $f26, $f26, $f22 \n\t"
"punpckhwd $f4, $f24, $f26 \n\t"
"punpcklwd $f6, $f24, $f26 \n\t"
"gssqc1 $f6, $f4, 0x0(%[p_sd8x8]) \n\t"
PTR_ADDIU "%[p_sd8x8], %[p_sd8x8], 0x10 \n\t"
PTR_SUBU "$15, $15, $13 \n\t"
PTR_SUBU "%[ref_data], %[ref_data], $13 \n\t"
PTR_ADDIU "$15, $15, 0x10 \n\t"
PTR_ADDIU "%[ref_data], %[ref_data], 0x10 \n\t"
PTR_ADDIU "%[iPicWidth], %[iPicWidth], -0x1 \n\t"
"bnez %[iPicWidth], 2b \n\t"
"move %[iPicWidth], $9 \n\t"
"move $15, $10 \n\t"
"move %[ref_data], $11 \n\t"
PTR_ADDU "$15, $15, $13 \n\t"
PTR_ADDU "%[ref_data], %[ref_data], $13 \n\t"
PTR_ADDIU "%[iPicHeight], %[iPicHeight], -0x1 \n\t"
"bnez %[iPicHeight], 1b \n\t"
"swl $14, 0x3(%[psadframe]) \n\t"
"swr $14, 0x0(%[psadframe]) \n\t"
: [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
[iPicHeight]"+&r"((int)iPicHeight), [psad8x8]"+&r"((int *)psad8x8),
[p_sd8x8]"+&r"((int *)p_sd8x8), [p_mad8x8]"+&r"((unsigned char *)p_mad8x8)
: [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
[psadframe]"r"((int *)psadframe)
: "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
"$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
"$f24", "$f26", "$f28", "$f30"
);
RECOVER_REG;
}
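// As VAACalcSad_mmi, but also emits per-16x16-block statistics: psum16x16 is
// the sum of cur pixels, psqsum16x16 the sum of squared cur pixels, and
// psqdiff16x16 the sum of squared cur-ref differences.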
void VAACalcSadSsd_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
int32_t *psqsum16x16, int32_t *psqdiff16x16) {
BACKUP_REG;
__asm__ volatile (
".set arch=loongson3a \n\t"
"move $15, %[cur_data] \n\t"
"dsrl %[iPicWidth], %[iPicWidth], 0x4 \n\t"
"dsrl %[iPicHeight], %[iPicHeight], 0x4 \n\t"
"dsll $13, %[iPicStride], 0x4 \n\t"
"xor $f0, $f0, $f0 \n\t"
"xor $f2, $f2, $f2 \n\t"
"xor $12, $12, $12 \n\t"
"xor $14, $14, $14 \n\t"
"1: \n\t"
"move $9, %[iPicWidth] \n\t"
"move $10, $15 \n\t"
"move $11, %[ref_data] \n\t"
"2: \n\t"
"xor $f28, $f28, $f28 \n\t"
"xor $f30, $f30, $f30 \n\t"
"xor $f24, $f24, $f24 \n\t"
"xor $f26, $f26, $f26 \n\t"
"xor $f20, $f20, $f20 \n\t"
"xor $f22, $f22, $f22 \n\t"
"xor $f16, $f16, $f16 \n\t"
"xor $f18, $f18, $f18 \n\t"
WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
"dmfc1 $8, $f28 \n\t"
"sw $8, 0x0(%[psad8x8]) \n\t"
"dmfc1 $8, $f30 \n\t"
"sw $8, 0x4(%[psad8x8]) \n\t"
"paddw $f4, $f28, $f30 \n\t"
"dmfc1 $12, $f4 \n\t"
PTR_ADDU "$14, $14, $12 \n\t"
"xor $f28, $f28, $f28 \n\t"
"xor $f30, $f30, $f30 \n\t"
WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
"dmfc1 $8, $f28 \n\t"
"sw $8, 0x8(%[psad8x8]) \n\t"
"dmfc1 $8, $f30 \n\t"
"paddw $f4, $f28, $f30 \n\t"
"sw $8, 0xc(%[psad8x8]) \n\t"
"dmfc1 $12, $f4 \n\t"
PTR_ADDU "$14, $14, $12 \n\t"
PTR_ADDIU "%[psad8x8], %[psad8x8], 0x10 \n\t"
"paddw $f24, $f24, $f26 \n\t"
"dmfc1 $8, $f24 \n\t"
"sw $8, 0x0(%[psum16x16]) \n\t"
PTR_ADDIU "%[psum16x16], %[psum16x16], 0x4 \n\t"
"paddw $f24, $f20, $f22 \n\t"
"punpcklwd $f20, $f24, $f24 \n\t"
"punpckhwd $f22, $f24, $f24 \n\t"
"paddw $f20, $f20, $f22 \n\t"
"dmfc1 $8, $f20 \n\t"
"sw $8, 0x0(%[psqsum16x16]) \n\t"
PTR_ADDIU "%[psqsum16x16], %[psqsum16x16], 0x4 \n\t"
"paddw $f20, $f16, $f18 \n\t"
"punpcklwd $f16, $f20, $f20 \n\t"
"punpckhwd $f18, $f20, $f20 \n\t"
"paddw $f16, $f16, $f18 \n\t"
"dmfc1 $8, $f16 \n\t"
"sw $8, 0x0(%[psqdiff16x16]) \n\t"
PTR_ADDIU "%[psqdiff16x16], %[psqdiff16x16], 0x4 \n\t"
PTR_SUBU "$15, $15, $13 \n\t"
PTR_SUBU "%[ref_data], %[ref_data], $13 \n\t"
PTR_ADDIU "$15, $15, 0x10 \n\t"
PTR_ADDIU "%[ref_data], %[ref_data], 0x10 \n\t"
PTR_ADDIU "%[iPicWidth], %[iPicWidth], -0x1 \n\t"
"bnez %[iPicWidth], 2b \n\t"
"nop \n\t"
"move %[iPicWidth], $9 \n\t"
"move $15, $10 \n\t"
"move %[ref_data], $11 \n\t"
PTR_ADDU "$15, $15, $13 \n\t"
PTR_ADDU "%[ref_data], %[ref_data], $13 \n\t"
PTR_ADDIU "%[iPicHeight], %[iPicHeight], -0x1 \n\t"
"bnez %[iPicHeight], 1b \n\t"
"nop \n\t"
"sw $14, 0x0(%[psadframe]) \n\t"
: [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
[iPicHeight]"+&r"((int)iPicHeight), [psum16x16]"+&r"((int *)psum16x16),
[psqsum16x16]"+&r"((int *)psqsum16x16), [psqdiff16x16]"+&r"((int *)psqdiff16x16)
: [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
[psadframe]"r"((int *)psadframe), [psad8x8]"r"((int *)psad8x8)
: "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
"$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
"$f24", "$f26", "$f28", "$f30"
);
RECOVER_REG;
}
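// Combined pass emitting everything at once: per-8x8 SAD, SD and MAD
// (psad8x8, p_sd8x8, p_mad8x8), per-16x16 sum, squared sum and squared
// difference (psum16x16, psqsum16x16, psqdiff16x16), and the whole-frame
// SAD in psadframe.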
void VAACalcSadSsdBgd_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
int32_t *psqsum16x16, int32_t *psqdiff16x16, int32_t *p_sd8x8,
uint8_t *p_mad8x8) {
BACKUP_REG;
__asm__ volatile (
".set arch=loongson3a \n\t"
"move $15, %[cur_data] \n\t"
"dsrl %[iPicWidth], %[iPicWidth], 0x4 \n\t"
"dsrl %[iPicHeight], %[iPicHeight], 0x4 \n\t"
"dsll $13, %[iPicStride], 0x4 \n\t"
"xor $f0, $f0, $f0 \n\t"
"xor $f2, $f2, $f2 \n\t"
"xor $12, $12, $12 \n\t"
"xor $14, $14, $14 \n\t"
"1: \n\t"
"move $9, %[iPicWidth] \n\t"
"move $10, $15 \n\t"
"move $11, %[ref_data] \n\t"
"2: \n\t"
"xor $f28, $f28, $f28 \n\t"
"xor $f30, $f30, $f30 \n\t"
"xor $f24, $f24, $f24 \n\t"
"xor $f26, $f26, $f26 \n\t"
"xor $f20, $f20, $f20 \n\t"
"xor $f22, $f22, $f22 \n\t"
"xor $f16, $f16, $f16 \n\t"
"xor $f18, $f18, $f18 \n\t"
WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
$f18, $15, %[ref_data], %[iPicStride])
WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
$f18, $15, %[ref_data], %[iPicStride])
WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
$f18, $15, %[ref_data], %[iPicStride])
WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
$f18, $15, %[ref_data], %[iPicStride])
WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
$f18, $15, %[ref_data], %[iPicStride])
WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
$f18, $15, %[ref_data], %[iPicStride])
WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
$f18, $15, %[ref_data], %[iPicStride])
WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
$f18, $15, %[ref_data], %[iPicStride])
"dmfc1 $8, $f28 \n\t"
"sw $8, 0x0(%[psad8x8]) \n\t"
"dmfc1 $8, $f30 \n\t"
"sw $8, 0x4(%[psad8x8]) \n\t"
PTR_ADDIU "%[psad8x8], %[psad8x8], 0x8 \n\t"
"paddw $f4, $f28, $f30 \n\t"
"dmfc1 $12, $f4 \n\t"
PTR_ADDU "$14, $14, $12 \n\t"
"paddw $f4, $f24, $f26 \n\t"
"dmfc1 $8, $f4 \n\t"
"sw $8, 0x0(%[psum16x16]) \n\t"
"punpckhwd $f4, $f24, $f26 \n\t"
"punpcklwd $f6, $f24, $f26 \n\t"
"psubw $f6, $f6, $f4 \n\t"
"dmfc1 $8, $f6 \n\t"
PTR_S "$8, 0x0(%[p_sd8x8]) \n\t"
PTR_ADDIU "%[p_sd8x8], %[p_sd8x8], 0x8 \n\t"
"dli $8, 0x1 \n\t"
"dmtc1 $8, $f8 \n\t"
"dli $8, 0x8 \n\t"
"dmtc1 $8, $f10 \n\t"
WELS_MAX_REG_MMI($f20, $f22, $f8, $f10)
"dmfc1 $8, $f20 \n\t"
"sb $8, 0x0(%[p_mad8x8]) \n\t"
"dmfc1 $8, $f22 \n\t"
"sb $8, 0x1(%[p_mad8x8]) \n\t"
PTR_ADDIU "%[p_mad8x8], %[p_mad8x8], 0x2 \n\t"
"xor $f20, $f20, $f20 \n\t"
"xor $f22, $f22, $f22 \n\t"
"punpckhwd $f28, $f20, $f28 \n\t"
"xor $f24, $f24, $f24 \n\t"
"xor $f26, $f26, $f26 \n\t"
"punpckhwd $f30, $f20, $f30 \n\t"
WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
$f18, $15, %[ref_data], %[iPicStride])
WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
$f18, $15, %[ref_data], %[iPicStride])
WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
$f18, $15, %[ref_data], %[iPicStride])
WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
$f18, $15, %[ref_data], %[iPicStride])
WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
$f18, $15, %[ref_data], %[iPicStride])
WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
$f18, $15, %[ref_data], %[iPicStride])
WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
$f18, $15, %[ref_data], %[iPicStride])
WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
$f18, $15, %[ref_data], %[iPicStride])
"dmfc1 $8, $f28 \n\t"
"sw $8, 0x0(%[psad8x8]) \n\t"
"dmfc1 $8, $f30 \n\t"
"sw $8, 0x4(%[psad8x8]) \n\t"
PTR_ADDIU "%[psad8x8], %[psad8x8], 0x8 \n\t"
"paddw $f4, $f28, $f30 \n\t"
"dmfc1 $12, $f4 \n\t"
PTR_ADDU "$14, $14, $12 \n\t"
"paddw $f4, $f24, $f26 \n\t"
"dmfc1 $8, $f4 \n\t"
"lw $12, 0x0(%[psum16x16]) \n\t"
PTR_ADDU "$8, $8, $12 \n\t"
"sw $8, 0x0(%[psum16x16]) \n\t"
"xor $f8, $f8, $f8 \n\t"
PTR_ADDIU "%[psum16x16], %[psum16x16], 0x4 \n\t"
"punpckhwd $f30, $f30, $f8 \n\t"
"punpckhwd $f28, $f28, $f8 \n\t"
"paddw $f8, $f28, $f30 \n\t"
"dmfc1 $8, $f8 \n\t"
"sw $8, 0x0(%[psqsum16x16]) \n\t"
PTR_ADDIU "%[psqsum16x16], %[psqsum16x16], 0x4 \n\t"
"punpckhwd $f4, $f24, $f26 \n\t"
"punpcklwd $f6, $f24, $f26 \n\t"
"psubw $f6, $f6, $f4 \n\t"
"dmfc1 $8, $f6 \n\t"
PTR_S "$8, 0x0(%[p_sd8x8]) \n\t"
PTR_ADDIU "%[p_sd8x8], %[p_sd8x8], 0x8 \n\t"
"dli $8, 0x1 \n\t"
"dmtc1 $8, $f8 \n\t"
"dli $8, 0x8 \n\t"
"dmtc1 $8, $f10 \n\t"
WELS_MAX_REG_MMI($f20, $f22, $f8, $f10)
"dmfc1 $8, $f20 \n\t"
"sb $8, 0x0(%[p_mad8x8]) \n\t"
"dmfc1 $8, $f22 \n\t"
"sb $8, 0x1(%[p_mad8x8]) \n\t"
PTR_ADDIU "%[p_mad8x8], %[p_mad8x8], 0x2 \n\t"
"paddw $f20, $f16, $f18 \n\t"
"punpcklwd $f16, $f20, $f20 \n\t"
"punpckhwd $f18, $f20, $f20 \n\t"
"paddw $f16, $f16, $f18 \n\t"
"dmfc1 $8, $f16 \n\t"
"sw $8, 0x0(%[psqdiff16x16]) \n\t"
PTR_ADDIU "%[psqdiff16x16], %[psqdiff16x16], 0x4 \n\t"
PTR_SUBU "$15, $15, $13 \n\t"
PTR_SUBU "%[ref_data], %[ref_data], $13 \n\t"
PTR_ADDIU "$15, $15, 0x10 \n\t"
PTR_ADDIU "%[ref_data], %[ref_data], 0x10 \n\t"
PTR_ADDIU "%[iPicWidth], %[iPicWidth], -0x1 \n\t"
"bnez %[iPicWidth], 2b \n\t"
"nop \n\t"
"move %[iPicWidth], $9 \n\t"
"move $15, $10 \n\t"
"move %[ref_data], $11 \n\t"
PTR_ADDU "$15, $15, $13 \n\t"
PTR_ADDU "%[ref_data], %[ref_data], $13 \n\t"
PTR_ADDIU "%[iPicHeight], %[iPicHeight], -0x1 \n\t"
"bnez %[iPicHeight], 1b \n\t"
"nop \n\t"
"sw $14, 0x0(%[psadframe]) \n\t"
: [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
[iPicHeight]"+&r"((int)iPicHeight), [psad8x8]"+&r"((int *)psad8x8),
[psum16x16]"+&r"((int *)psum16x16), [psqsum16x16]"+&r"((int *)psqsum16x16),
[psqdiff16x16]"+&r"((int *)psqdiff16x16), [p_sd8x8]"+&r"((int *)p_sd8x8),
[p_mad8x8]"+&r"((unsigned char *)p_mad8x8)
: [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
[psadframe]"r"((int *)psadframe)
: "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
"$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
"$f24", "$f26", "$f28", "$f30"
);
RECOVER_REG;
}
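// As VAACalcSad_mmi, but also emits per-16x16-block psum16x16 and
// psqsum16x16, from which the caller can derive each block's variance.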
void VAACalcSadVar_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
int32_t *psqsum16x16) {
BACKUP_REG;
__asm__ volatile (
".set arch=loongson3a \n\t"
"move $15, %[cur_data] \n\t"
"dsrl %[iPicWidth], %[iPicWidth], 0x4 \n\t"
"dsrl %[iPicHeight], %[iPicHeight], 0x4 \n\t"
"dsll $13, %[iPicStride], 0x4 \n\t"
"xor $f0, $f0, $f0 \n\t"
"xor $f2, $f2, $f2 \n\t"
"xor $f28, $f28, $f28 \n\t"
"xor $f30, $f30, $f30 \n\t"
"xor $14, $14, $14 \n\t"
"1: \n\t"
"move $9, %[iPicWidth] \n\t"
"move $10, $15 \n\t"
"move $11, %[ref_data] \n\t"
"2: \n\t"
"xor $f24, $f24, $f24 \n\t"
"xor $f26, $f26, $f26 \n\t"
"xor $f20, $f20, $f20 \n\t"
"xor $f22, $f22, $f22 \n\t"
"xor $f16, $f16, $f16 \n\t"
"xor $f18, $f18, $f18 \n\t"
WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
"paddw $f28, $f24, $f28 \n\t"
"paddw $f30, $f26, $f30 \n\t"
"dmfc1 $8, $f24 \n\t"
"sw $8, 0x0(%[psad8x8]) \n\t"
"dmfc1 $8, $f26 \n\t"
"sw $8, 0x4(%[psad8x8]) \n\t"
"xor $f24, $f24, $f24 \n\t"
"xor $f26, $f26, $f26 \n\t"
WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
"paddw $f28, $f24, $f28 \n\t"
"paddw $f30, $f26, $f30 \n\t"
"dmfc1 $8, $f24 \n\t"
"sw $8, 0x8(%[psad8x8]) \n\t"
"dmfc1 $8, $f26 \n\t"
"sw $8, 0xc(%[psad8x8]) \n\t"
PTR_ADDIU "%[psad8x8], %[psad8x8], 0x10 \n\t"
"paddw $f20, $f20, $f22 \n\t"
"dmfc1 $8, $f20 \n\t"
"sw $8, 0x0(%[psum16x16]) \n\t"
PTR_ADDIU "%[psum16x16], %[psum16x16], 0x4 \n\t"
"paddw $f20, $f16, $f18 \n\t"
"punpcklwd $f16, $f20, $f20 \n\t"
"punpckhwd $f18, $f20, $f20 \n\t"
"paddw $f16, $f16, $f18 \n\t"
"dmfc1 $8, $f16 \n\t"
"sw $8, 0x0(%[psqsum16x16]) \n\t"
PTR_ADDIU "%[psqsum16x16], %[psqsum16x16], 0x4 \n\t"
PTR_SUBU "$15, $15, $13 \n\t"
PTR_SUBU "%[ref_data], %[ref_data], $13 \n\t"
PTR_ADDIU "$15, $15, 0x10 \n\t"
PTR_ADDIU "%[ref_data], %[ref_data], 0x10 \n\t"
PTR_ADDIU "%[iPicWidth], %[iPicWidth], -0x1 \n\t"
"bnez %[iPicWidth], 2b \n\t"
"nop \n\t"
"move %[iPicWidth], $9 \n\t"
"move $15, $10 \n\t"
"move %[ref_data], $11 \n\t"
PTR_ADDU "$15, $15, $13 \n\t"
PTR_ADDU "%[ref_data], %[ref_data], $13 \n\t"
PTR_ADDIU "%[iPicHeight], %[iPicHeight], -0x1 \n\t"
"bnez %[iPicHeight], 1b \n\t"
"nop \n\t"
"paddw $f28, $f28, $f30 \n\t"
"dmfc1 $8, $f28 \n\t"
"sw $8, 0x0(%[psadframe]) \n\t"
: [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
[iPicHeight]"+&r"((int)iPicHeight), [psum16x16]"+&r"((int *)psum16x16),
[psqsum16x16]"+&r"((int *)psqsum16x16)
: [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
[psadframe]"r"((int *)psadframe), [psad8x8]"r"((int *)psad8x8)
: "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
"$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
"$f24", "$f26", "$f28", "$f30"
);
RECOVER_REG;
}