/*!
* \copy
* Copyright (c) 2009-2018, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*
* \file dct_mmi.c
*
 * \brief   IDCT residual reconstruction and intra prediction routines optimized with Loongson MMI instructions
*
* \date 17/07/2018 Created
*
*************************************************************************************
*/
#include <stdint.h>
#include "asmdefs_mmi.h"
#define LOAD_2_LEFT_AND_ADD \
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t" \
"lbu $9, -0x1(%[pPred]) \n\t" \
PTR_ADDU "$8, $8, $9 \n\t" \
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t" \
"lbu $9, -0x1(%[pPred]) \n\t" \
PTR_ADDU "$8, $8, $9 \n\t"
unsigned char mmi_dc_0x80[16] __attribute__((aligned(16))) = {
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
short mmi_wd_0x02[8] __attribute__((aligned(16))) = {2, 2, 2, 2, 2, 2, 2, 2};
short mmi_plane_inc_minus[8] __attribute__((aligned(16))) = {-7, -6, -5, -4, -3, -2, -1, 0};
short mmi_plane_inc[8] __attribute__((aligned(16))) = {1, 2, 3, 4, 5, 6, 7, 8};
short mmi_plane_dec[8] __attribute__((aligned(16))) = {8, 7, 6, 5, 4, 3, 2, 1};
short mmi_plane_inc_c[4] __attribute__((aligned(16))) = {1, 2, 3, 4};
short mmi_plane_dec_c[4] __attribute__((aligned(16))) = {4, 3, 2, 1};
short mmi_plane_mul_b_c[8] __attribute__((aligned(16))) = {-3, -2, -1, 0, 1, 2, 3, 4};
unsigned char mmi_01bytes[16] __attribute__((aligned(16))) = {1, 1, 1, 1, 1, 1, 1, 1,
                                                              1, 1, 1, 1, 1, 1, 1, 1};
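/*
 * IdctResAddPred_mmi reconstructs one 4x4 block: the residual in pRs goes
 * through the H.264 inverse core transform (transpose, 1-D IDCT, transpose,
 * 1-D IDCT), is rounded with (+32) >> 6, added to the prediction at pPred and
 * clipped to [0, 255].  A scalar sketch of the same arithmetic follows the
 * function.
 */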
void IdctResAddPred_mmi(uint8_t *pPred, const int32_t kiStride, int16_t *pRs) {
__asm__ volatile (
".set arch=loongson3a \n\t"
"dli $8, 0x1 \n\t"
"gsldxc1 $f0, 0x0(%[pRs], $0) \n\t"
"gsldxc1 $f2, 0x8(%[pRs], $0) \n\t"
"gsldxc1 $f4, 0x10(%[pRs], $0) \n\t"
"gsldxc1 $f6, 0x18(%[pRs], $0) \n\t"
"dmtc1 $8, $f14 \n\t"
MMI_Trans4x4H_SINGLE($f0, $f2, $f4, $f6, $f8)
MMI_IDCT_SINGLE($f2, $f4, $f6, $f8, $f0, $f12, $f14)
MMI_Trans4x4H_SINGLE($f2, $f6, $f0, $f8, $f4)
MMI_IDCT_SINGLE($f6, $f0, $f8, $f4, $f2, $f12, $f14)
"dli $8, 0x20 \n\t"
"xor $f14, $f14, $f14 \n\t"
"dmtc1 $8, $f12 \n\t"
"pshufh $f12, $f12, $f14 \n\t"
"dli $8, 0x6 \n\t"
"dmtc1 $8, $f16 \n\t"
MMI_StoreDiff4P_SINGLE($f6, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
MMI_StoreDiff4P_SINGLE($f8, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
MMI_StoreDiff4P_SINGLE($f2, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
MMI_StoreDiff4P_SINGLE($f4, $f0, $f12, $f14, %[pPred], %[pPred], $f16)
: [pPred]"+&r"((unsigned char *)pPred)
: [pRs]"r"((unsigned char *)pRs), [kiStride]"r"((int)kiStride)
: "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
"$f14", "$f16"
);
}
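/*
 * Scalar sketch of the arithmetic performed by IdctResAddPred_mmi above,
 * i.e. the standard H.264 4x4 inverse core transform plus residual add.
 * The _ref name is illustrative; this helper is kept for documentation only
 * and is not called by the decoder.
 */
static void __attribute__((unused))
IdctResAddPred_ref (uint8_t *pPred, const int32_t kiStride, int16_t *pRs) {
  int16_t iTmp[16];
  int32_t i;
  /* horizontal (row) pass */
  for (i = 0; i < 4; i++) {
    const int32_t kiY  = i << 2;
    const int32_t kiT0 = pRs[kiY] + pRs[kiY + 2];
    const int32_t kiT1 = pRs[kiY] - pRs[kiY + 2];
    const int32_t kiT2 = (pRs[kiY + 1] >> 1) - pRs[kiY + 3];
    const int32_t kiT3 = pRs[kiY + 1] + (pRs[kiY + 3] >> 1);
    iTmp[kiY]     = (int16_t) (kiT0 + kiT3);
    iTmp[kiY + 1] = (int16_t) (kiT1 + kiT2);
    iTmp[kiY + 2] = (int16_t) (kiT1 - kiT2);
    iTmp[kiY + 3] = (int16_t) (kiT0 - kiT3);
  }
  /* vertical (column) pass, rounding, residual add and clip */
  for (i = 0; i < 4; i++) {
    const int32_t kiT0 = iTmp[i] + iTmp[i + 8];
    const int32_t kiT1 = iTmp[i] - iTmp[i + 8];
    const int32_t kiT2 = (iTmp[i + 4] >> 1) - iTmp[i + 12];
    const int32_t kiT3 = iTmp[i + 4] + (iTmp[i + 12] >> 1);
    const int32_t kiRes[4] = {kiT0 + kiT3, kiT1 + kiT2, kiT1 - kiT2, kiT0 - kiT3};
    int32_t j;
    for (j = 0; j < 4; j++) {
      const int32_t kiVal = pPred[j * kiStride + i] + ((kiRes[j] + 32) >> 6);
      pPred[j * kiStride + i] = (uint8_t) (kiVal < 0 ? 0 : (kiVal > 255 ? 255 : kiVal));
    }
  }
}
/*
 * WelsDecoderI16x16LumaPredDc_mmi: 16x16 luma DC prediction.  The DC value is
 * (sum of the 16 samples above + sum of the 16 samples to the left + 16) >> 5,
 * broadcast over the whole block.
 */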
void WelsDecoderI16x16LumaPredDc_mmi(uint8_t *pPred, const int32_t kiStride) {
__asm__ volatile(
".set arch=loongson3a \n\t"
"dli $8, 0x5 \n\t"
"gsldxc1 $f10, 0x0(%[mmi_01bytes], $0) \n\t"
"dmtc1 $8, $f8 \n\t"
"move $10, %[pPred] \n\t"
PTR_SUBU "%[pPred], %[pPred], %[kiStride] \n\t"
"gslqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
"xor $f4, $f4, $f4 \n\t"
"pasubub $f0, $f0, $f4 \n\t"
"pasubub $f2, $f2, $f4 \n\t"
"biadd $f0, $f0 \n\t"
"biadd $f2, $f2 \n\t"
"paddh $f0, $f0, $f2 \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"lbu $8, -0x1(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"lbu $9, -0x1(%[pPred]) \n\t"
PTR_ADDU "$8, $8, $9 \n\t"
LOAD_2_LEFT_AND_ADD
LOAD_2_LEFT_AND_ADD
LOAD_2_LEFT_AND_ADD
LOAD_2_LEFT_AND_ADD
LOAD_2_LEFT_AND_ADD
LOAD_2_LEFT_AND_ADD
LOAD_2_LEFT_AND_ADD
PTR_ADDIU "$8, $8, 0x10 \n\t"
"dmtc1 $8, $f4 \n\t"
"paddh $f0, $f0, $f4 \n\t"
"psrlw $f0, $f0, $f8 \n\t"
"pmuluw $f0, $f0, $f10 \n\t"
"punpcklwd $f0, $f0, $f0 \n\t"
"mov.d $f2, $f0 \n\t"
"gssqc1 $f2, $f0, 0x0($10) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0($10) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0($10) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0($10) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0($10) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0($10) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0($10) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0($10) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0($10) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0($10) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0($10) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0($10) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0($10) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0($10) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0($10) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0($10) \n\t"
: [pPred] "+&r"((unsigned char *)pPred)
: [kiStride] "r"((int)kiStride),
[mmi_01bytes] "r"((unsigned char *)mmi_01bytes)
: "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10"
);
}
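/*
 * WelsDecoderI16x16LumaPredPlane_mmi: 16x16 luma plane prediction.  With pTop
 * the row above and pLeft the column to the left of the block,
 *   H = sum_{x=1..8} x * (pTop[7 + x]  - pTop[7 - x])
 *   V = sum_{y=1..8} y * (pLeft[7 + y] - pLeft[7 - y])
 *   a = 16 * (pTop[15] + pLeft[15]),  b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
 *   pred[y][x] = clip((a + b*(x - 7) + c*(y - 7) + 16) >> 5)
 * The loop at label "1:" emits one 16-pixel row per iteration; a scalar sketch
 * of the same formula follows the function.
 */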
void WelsDecoderI16x16LumaPredPlane_mmi(uint8_t *pPred, const int32_t kiStride) {
BACKUP_REG;
__asm__ volatile (
".set arch=loongson3a \n\t"
"move $10, %[pPred] \n\t"
PTR_ADDIU "%[pPred], %[pPred], -0x1 \n\t"
PTR_SUBU "%[pPred], %[pPred], %[kiStride] \n\t"
"gsldlc1 $f0, 0x7(%[pPred]) \n\t"
"xor $f28, $f28, $f28 \n\t"
"gsldrc1 $f0, 0x0(%[pPred]) \n\t"
"gslqc1 $f22, $f20, 0x0(%[mmi_plane_dec]) \n\t"
"punpckhbh $f2, $f0, $f28 \n\t"
"punpcklbh $f0, $f0, $f28 \n\t"
"pmullh $f0, $f0, $f20 \n\t"
"gsldlc1 $f4, 0x10(%[pPred]) \n\t"
"pmullh $f2, $f2, $f22 \n\t"
"gsldrc1 $f4, 0x9(%[pPred]) \n\t"
"gslqc1 $f26, $f24, 0x0(%[mmi_plane_inc]) \n\t"
"punpckhbh $f6, $f4, $f28 \n\t"
"punpcklbh $f4, $f4, $f28 \n\t"
"pmullh $f4, $f4, $f24 \n\t"
"pmullh $f6, $f6, $f26 \n\t"
"psubh $f4, $f4, $f0 \n\t"
"psubh $f6, $f6, $f2 \n\t"
SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
"dmfc1 $8, $f4 \n\t"
"seh $8, $8 \n\t"
"mul $8, $8, 0x5 \n\t"
PTR_ADDIU "$8, $8, 0x20 \n\t"
"sra $8, $8, 0x6 \n\t"
MMI_Copy8Times($f4, $f6, $f28, $8)
"lbu $9, 0x10(%[pPred]) \n\t"
PTR_ADDIU "%[pPred], %[pPred], -0x3 \n\t"
LOAD_COLUMN($f0, $f2, $f8, $f10, $f12, $f14, $f16, $f18, %[pPred],
%[kiStride], $11)
PTR_ADDIU "%[pPred], %[pPred], 0x3 \n\t"
"dsll $11, %[kiStride], 0x3 \n\t"
PTR_ADDU "$11, $11, %[pPred] \n\t"
"lbu $8, 0x0($11) \n\t"
PTR_ADDU "$9, $9, $8 \n\t"
"dsll $9, $9, 0x4 \n\t"
PTR_ADDIU "%[pPred], %[pPred], -0x3 \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
LOAD_COLUMN($f28, $f30, $f8, $f10, $f12, $f14, $f16, $f18, %[pPred],
%[kiStride], $11)
"xor $f16, $f16, $f16 \n\t"
"punpcklbh $f0, $f2, $f16 \n\t"
"punpckhbh $f2, $f2, $f16 \n\t"
"pmullh $f0, $f0, $f20 \n\t"
"pmullh $f2, $f2, $f22 \n\t"
"punpcklbh $f28, $f30, $f16 \n\t"
"punpckhbh $f30, $f30, $f16 \n\t"
"pmullh $f28, $f28, $f24 \n\t"
"pmullh $f30, $f30, $f26 \n\t"
"psubh $f28, $f28, $f0 \n\t"
"psubh $f30, $f30, $f2 \n\t"
"xor $f8, $f8, $f8 \n\t"
SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
"dmfc1 $8, $f28 \n\t"
"seh $8, $8 \n\t"
"mul $8, $8, 0x5 \n\t"
PTR_ADDIU "$8, $8, 0x20 \n\t"
"sra $8, $8, 0x6 \n\t"
MMI_Copy8Times($f16, $f18, $f8, $8)
"move %[pPred], $10 \n\t"
PTR_ADDIU "$9, $9, 0x10 \n\t"
"mul $8, $8, -0x7 \n\t"
PTR_ADDU "$9, $9, $8 \n\t"
MMI_Copy8Times($f0, $f2, $f8, $9)
"xor $8, $8, $8 \n\t"
"gslqc1 $f22, $f20, 0x0(%[mmi_plane_inc_minus]) \n\t"
"dli $11, 0x5 \n\t"
"dmtc1 $11, $f30 \n\t"
"1: \n\t"
"pmullh $f8, $f4, $f20 \n\t"
"pmullh $f10, $f6, $f22 \n\t"
"paddh $f8, $f8, $f0 \n\t"
"paddh $f10, $f10, $f2 \n\t"
"psrah $f8, $f8, $f30 \n\t"
"psrah $f10, $f10, $f30 \n\t"
"pmullh $f12, $f4, $f24 \n\t"
"pmullh $f14, $f6, $f26 \n\t"
"paddh $f12, $f12, $f0 \n\t"
"paddh $f14, $f14, $f2 \n\t"
"psrah $f12, $f12, $f30 \n\t"
"psrah $f14, $f14, $f30 \n\t"
"packushb $f8, $f8, $f10 \n\t"
"packushb $f10, $f12, $f14 \n\t"
"gssqc1 $f10, $f8, 0x0(%[pPred]) \n\t"
"paddh $f0, $f0, $f16 \n\t"
"paddh $f2, $f2, $f18 \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
PTR_ADDIU "$8, $8, 0x1 \n\t"
PTR_ADDIU "$11, $8, -0x10 \n\t"
"bnez $11, 1b \n\t"
"nop \n\t"
: [pPred]"+&r"((unsigned char *)pPred)
: [kiStride]"r"((int)kiStride), [mmi_plane_inc_minus]"r"(mmi_plane_inc_minus),
[mmi_plane_inc]"r"(mmi_plane_inc), [mmi_plane_dec]"r"(mmi_plane_dec)
: "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
"$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
"$f28", "$f30"
);
RECOVER_REG;
}
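/*
 * Scalar sketch of the plane-mode arithmetic evaluated by
 * WelsDecoderI16x16LumaPredPlane_mmi above.  The _ref name is illustrative;
 * this helper is kept for documentation only and is not called anywhere.
 */
static void __attribute__((unused))
WelsI16x16LumaPredPlane_ref (uint8_t *pPred, const int32_t kiStride) {
  const uint8_t *pTop  = pPred - kiStride; /* row above the block      */
  const uint8_t *pLeft = pPred - 1;        /* column left of the block */
  int32_t iH = 0, iV = 0, x, y;
  for (x = 0; x < 8; x++) {
    iH += (x + 1) * (pTop[8 + x] - pTop[6 - x]);
    iV += (x + 1) * (pLeft[(8 + x) * kiStride] - pLeft[(6 - x) * kiStride]);
  }
  {
    const int32_t iA = (pLeft[15 * kiStride] + pTop[15]) << 4;
    const int32_t iB = (5 * iH + 32) >> 6;
    const int32_t iC = (5 * iV + 32) >> 6;
    for (y = 0; y < 16; y++) {
      for (x = 0; x < 16; x++) {
        const int32_t iVal = (iA + iB * (x - 7) + iC * (y - 7) + 16) >> 5;
        pPred[y * kiStride + x] = (uint8_t) (iVal < 0 ? 0 : (iVal > 255 ? 255 : iVal));
      }
    }
  }
}
/*
 * COPY_16_TIMES reads the 16 bytes ending at (r0 - 1), shifts them right by
 * the 56-bit count preloaded into f4 to isolate the byte at (r0 - 1), i.e.
 * the left neighbour of the current row, and multiplies it by 0x01010101...
 * to replicate it across a full 16-byte row.  MMI_PRED_H_16X16_TWO_LINE_DEC
 * writes two such rows, advancing one stride before each store.
 */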
#define COPY_16_TIMES(r0, f0, f2, f4, f6, f8) \
"gslqc1 "#f2", "#f0", -0x10("#r0") \n\t" \
"dsrl "#f0", "#f2", "#f4" \n\t" \
"pmuluw "#f0", "#f0", "#f6" \n\t" \
"punpcklwd "#f0", "#f0", "#f0" \n\t" \
"mov.d "#f2", "#f0" \n\t"
#define MMI_PRED_H_16X16_TWO_LINE_DEC \
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t" \
COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8) \
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t" \
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t" \
COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8) \
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
void WelsDecoderI16x16LumaPredH_mmi(uint8_t *pPred, const int32_t kiStride) {
__asm__ volatile (
".set arch=loongson3a \n\t"
"dli $8, 56 \n\t"
"dmtc1 $8, $f4 \n\t"
"gsldxc1 $f6, 0x0(%[mmi_01bytes], $0) \n\t"
"xor $f8, $f8, $f8 \n\t"
COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8)
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
COPY_16_TIMES(%[pPred], $f0, $f2, $f4, $f6, $f8)
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
MMI_PRED_H_16X16_TWO_LINE_DEC
MMI_PRED_H_16X16_TWO_LINE_DEC
MMI_PRED_H_16X16_TWO_LINE_DEC
MMI_PRED_H_16X16_TWO_LINE_DEC
MMI_PRED_H_16X16_TWO_LINE_DEC
MMI_PRED_H_16X16_TWO_LINE_DEC
MMI_PRED_H_16X16_TWO_LINE_DEC
: [pPred]"+&r"((unsigned char *)pPred)
: [kiStride]"r"((int)kiStride),
[mmi_01bytes]"r"((unsigned char *)mmi_01bytes)
: "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8"
);
}
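/*
 * WelsDecoderI16x16LumaPredV_mmi: 16x16 vertical prediction.  The 16 bytes
 * directly above the block are copied into each of the 16 rows.
 */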
void WelsDecoderI16x16LumaPredV_mmi(uint8_t *pPred, const int32_t kiStride) {
__asm__ volatile(
".set arch=loongson3a \n\t"
PTR_SUBU "%[pPred], %[pPred], %[kiStride] \n\t"
"gslqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
: [pPred] "+&r"((unsigned char *)pPred)
: [kiStride] "r"((int)kiStride)
: "memory", "$f0", "$f2"
);
}
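/*
 * WelsDecoderI16x16LumaPredDcTop_mmi: 16x16 DC prediction when only the top
 * neighbours are available.  DC = (sum of the 16 samples above + 8) >> 4,
 * broadcast over the block.
 */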
void WelsDecoderI16x16LumaPredDcTop_mmi(uint8_t *pPred, const int32_t kiStride) {
BACKUP_REG;
__asm__ volatile (
".set arch=loongson3a \n\t"
PTR_SUBU "$8, %[pPred], %[kiStride] \n\t"
"gslqc1 $f2, $f0, 0x0($8) \n\t"
"xor $f28, $f28, $f28 \n\t"
"pasubub $f0, $f0, $f28 \n\t"
"pasubub $f2, $f2, $f28 \n\t"
"biadd $f0, $f0 \n\t"
"biadd $f2, $f2 \n\t"
"paddh $f0, $f0, $f2 \n\t"
"dmfc1 $8, $f0 \n\t"
PTR_ADDIU "$8, $8, 0x8 \n\t"
"dsra $8, $8, 0x4 \n\t"
MMI_Copy16Times($f4, $f6, $f28, $8)
"mov.d $f0, $f4 \n\t"
"mov.d $f2, $f6 \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
: [pPred]"+&r"((unsigned char *)pPred)
: [kiStride]"r"((int)kiStride)
: "memory", "$8", "$f0", "$f2", "$f4", "$f6"
);
RECOVER_REG;
}
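/*
 * WelsDecoderI16x16LumaPredDcNA_mmi: 16x16 DC prediction with no neighbours
 * available.  The block is filled with the mid-grey value 0x80.
 */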
void WelsDecoderI16x16LumaPredDcNA_mmi(uint8_t *pPred, const int32_t kiStride) {
__asm__ volatile(
".set arch=loongson3a \n\t"
"gslqc1 $f2, $f0, 0x0(%[mmi_dc_0x80]) \n\t"
"mov.d $f4, $f0 \n\t"
"mov.d $f6, $f2 \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssqc1 $f6, $f4, 0x0(%[pPred]) \n\t"
: [pPred] "+&r"((unsigned char *)pPred)
: [kiStride] "r"((int)kiStride), [mmi_dc_0x80] "r"(mmi_dc_0x80)
: "memory", "$8", "$f0", "$f2", "$f4", "$f6"
);
}
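/*
 * WelsDecoderIChromaPredPlane_mmi: 8x8 chroma plane prediction.  Same form as
 * the luma case but with 4-tap sums:
 *   a = 16 * (pTop[7] + pLeft[7]),  b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5
 *   pred[y][x] = clip((a + b*(x - 3) + c*(y - 3) + 16) >> 5)
 */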
void WelsDecoderIChromaPredPlane_mmi(uint8_t *pPred, const int32_t kiStride) {
BACKUP_REG;
__asm__ volatile (
".set arch=loongson3a \n\t"
"move $10, %[pPred] \n\t"
PTR_ADDIU "%[pPred], %[pPred], -0x1 \n\t"
PTR_SUBU "%[pPred], %[pPred], %[kiStride] \n\t"
"gsldlc1 $f0, 0x7(%[pPred]) \n\t"
"xor $f28, $f28, $f28 \n\t"
"gsldrc1 $f0, 0x0(%[pPred]) \n\t"
"gsldxc1 $f20, 0x0(%[mmi_plane_dec_c], $0) \n\t"
"punpcklbh $f0, $f0, $f28 \n\t"
"gsldlc1 $f4, 0xc(%[pPred]) \n\t"
"pmullh $f0, $f0, $f20 \n\t"
"gsldrc1 $f4, 0x5(%[pPred]) \n\t"
"gsldxc1 $f24, 0x0(%[mmi_plane_inc_c], $0) \n\t"
"punpcklbh $f4, $f4, $f28 \n\t"
"pmullh $f4, $f4, $f24 \n\t"
"psubh $f4, $f4, $f0 \n\t"
"xor $f6, $f6, $f6 \n\t"
"xor $f8, $f8, $f8 \n\t"
SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
"dmfc1 $8, $f4 \n\t"
"seh $8, $8 \n\t"
"mul $8, $8, 0x11 \n\t"
PTR_ADDIU "$8, $8, 0x10 \n\t"
"sra $8, $8, 0x5 \n\t"
MMI_Copy8Times($f4, $f6, $f8, $8)
"lbu $9, 0x8(%[pPred]) \n\t"
PTR_ADDIU "%[pPred], %[pPred], -0x3 \n\t"
LOAD_COLUMN_C($f0, $f8, $f12, $f16, %[pPred], %[kiStride], $11)
PTR_ADDIU "%[pPred], %[pPred], 0x3 \n\t"
"dsll $11, %[kiStride], 0x2 \n\t"
PTR_ADDU "$11, $11, %[pPred] \n\t"
"lbu $8, 0x0($11) \n\t"
PTR_ADDU "$9, $9, $8 \n\t"
"dsll $9, $9, 0x4 \n\t"
PTR_ADDIU "%[pPred], %[pPred], -0x3 \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
LOAD_COLUMN_C($f28, $f8, $f12, $f16, %[pPred], %[kiStride], $11)
"xor $f16, $f16, $f16 \n\t"
"punpckhbh $f0, $f0, $f16 \n\t"
"pmullh $f0, $f0, $f20 \n\t"
"punpckhbh $f28, $f28, $f16 \n\t"
"pmullh $f28, $f28, $f24 \n\t"
"psubh $f28, $f28, $f0 \n\t"
"xor $f30, $f30, $f30 \n\t"
"xor $f8, $f8, $f8 \n\t"
SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
"dmfc1 $8, $f28 \n\t"
"seh $8, $8 \n\t"
"mul $8, $8, 0x11 \n\t"
PTR_ADDIU "$8, $8, 0x10 \n\t"
"sra $8, $8, 0x5 \n\t"
MMI_Copy8Times($f16, $f18, $f8, $8)
"move %[pPred], $10 \n\t"
PTR_ADDIU "$9, $9, 0x10 \n\t"
"mul $8, $8, -0x3 \n\t"
PTR_ADDU "$9, $9, $8 \n\t"
MMI_Copy8Times($f0, $f2, $f8, $9)
"xor $8, $8, $8 \n\t"
"gslqc1 $f22, $f20, 0x0(%[mmi_plane_mul_b_c]) \n\t"
"dli $11, 0x5 \n\t"
"dmtc1 $11, $f30 \n\t"
"1: \n\t"
"pmullh $f8, $f4, $f20 \n\t"
"pmullh $f10, $f6, $f22 \n\t"
"paddh $f8, $f8, $f0 \n\t"
"paddh $f10, $f10, $f2 \n\t"
"psrah $f8, $f8, $f30 \n\t"
"psrah $f10, $f10, $f30 \n\t"
"packushb $f8, $f8, $f10 \n\t"
"gssdxc1 $f8, 0x0(%[pPred], $0) \n\t"
"paddh $f0, $f0, $f16 \n\t"
"paddh $f2, $f2, $f18 \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
PTR_ADDIU "$8, $8, 0x1 \n\t"
PTR_ADDIU "$11, $8, -0x8 \n\t"
"bnez $11, 1b \n\t"
"nop \n\t"
: [pPred]"+&r"((unsigned char *)pPred)
: [kiStride]"r"((int)kiStride), [mmi_plane_mul_b_c]"r"(mmi_plane_mul_b_c),
[mmi_plane_inc_c]"r"(mmi_plane_inc_c), [mmi_plane_dec_c]"r"(mmi_plane_dec_c)
: "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
"$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
);
RECOVER_REG;
}
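/*
 * WelsDecoderIChromaPredDc_mmi: 8x8 chroma DC prediction.  The block is split
 * into four 4x4 quadrants with the means
 *   top-left:     (top[0..3] + left[0..3] + 4) >> 3
 *   top-right:    (top[4..7] + 2) >> 2
 *   bottom-left:  (left[4..7] + 2) >> 2
 *   bottom-right: (top[4..7] + left[4..7] + 4) >> 3
 * each broadcast over its quadrant ($f0 carries the two top quadrants, $f2
 * the two bottom ones).
 */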
void WelsDecoderIChromaPredDc_mmi(uint8_t *pPred, const int32_t kiStride) {
__asm__ volatile(
".set arch=loongson3a \n\t"
"move $10, %[pPred] \n\t"
PTR_SUBU "%[pPred], %[pPred], %[kiStride] \n\t"
"gsldxc1 $f0, 0x0(%[pPred], $0) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"lbu $8, -0x1(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"lbu $9, -0x1(%[pPred]) \n\t"
PTR_ADDU "$8, $8, $9 \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"lbu $9, -0x1(%[pPred]) \n\t"
PTR_ADDU "$8, $8, $9 \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"lbu $9, -0x1(%[pPred]) \n\t"
PTR_ADDU "$8, $8, $9 \n\t"
"dmtc1 $8, $f2 \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"lbu $8, -0x1(%[pPred]) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"lbu $9, -0x1(%[pPred]) \n\t"
PTR_ADDU "$8, $8, $9 \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"lbu $9, -0x1(%[pPred]) \n\t"
PTR_ADDU "$8, $8, $9 \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"lbu $9, -0x1(%[pPred]) \n\t"
PTR_ADDU "$8, $8, $9 \n\t"
"dmtc1 $8, $f4 \n\t"
"xor $f8, $f8, $f8 \n\t"
"punpcklwd $f6, $f0, $f8 \n\t"
"punpckhwd $f0, $f0, $f8 \n\t"
"pasubub $f0, $f0, $f8 \n\t"
"pasubub $f6, $f6, $f8 \n\t"
"biadd $f0, $f0 \n\t"
"biadd $f6, $f6 \n\t"
"paddd $f6, $f6, $f2 \n\t"
"paddd $f2, $f4, $f0 \n\t"
"dli $8, 0x2 \n\t"
"dmtc1 $8, $f8 \n\t"
"gsldxc1 $f12, 0x0(%[mmi_01bytes], $0) \n\t"
"dli $8, 0x3 \n\t"
"dmtc1 $8, $f10 \n\t"
"paddd $f0, $f0, $f8 \n\t"
"dsrl $f0, $f0, $f8 \n\t"
"paddd $f4, $f4, $f8 \n\t"
"dsrl $f4, $f4, $f8 \n\t"
"paddd $f6, $f6, $f8 \n\t"
"paddd $f6, $f6, $f8 \n\t"
"dsrl $f6, $f6, $f10 \n\t"
"paddd $f2, $f2, $f8 \n\t"
"paddd $f2, $f2, $f8 \n\t"
"dsrl $f2, $f2, $f10 \n\t"
"dli $8, 0x20 \n\t"
"dmtc1 $8, $f8 \n\t"
"pmuluw $f0, $f0, $f12 \n\t"
"pmuluw $f6, $f6, $f12 \n\t"
"dsll $f0, $f0, $f8 \n\t"
"xor $f0, $f0, $f6 \n\t"
"pmuluw $f4, $f4, $f12 \n\t"
"pmuluw $f2, $f2, $f12 \n\t"
"dsll $f2, $f2, $f8 \n\t"
"xor $f2, $f2, $f4 \n\t"
"gssdxc1 $f0, 0x0($10, $0) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssdxc1 $f0, 0x0($10, $0) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssdxc1 $f0, 0x0($10, $0) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssdxc1 $f0, 0x0($10, $0) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssdxc1 $f2, 0x0($10, $0) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssdxc1 $f2, 0x0($10, $0) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssdxc1 $f2, 0x0($10, $0) \n\t"
PTR_ADDU "$10, $10, %[kiStride] \n\t"
"gssdxc1 $f2, 0x0($10, $0) \n\t"
: [pPred] "+&r"((unsigned char *)pPred)
: [kiStride] "r"((int)kiStride),
[mmi_01bytes] "r"((unsigned char *)mmi_01bytes)
: "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
"$f12"
);
}
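/*
 * WelsDecoderIChromaPredDcTop_mmi: 8x8 chroma DC prediction with only the top
 * row available.  The left and right 4-sample halves of the top row are each
 * averaged as (sum + 2) >> 2 and broadcast down their half of the block.
 */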
void WelsDecoderIChromaPredDcTop_mmi(uint8_t *pPred, const int32_t kiStride) {
BACKUP_REG;
__asm__ volatile (
".set arch=loongson3a \n\t"
"dli $8, 0x4e \n\t"
"dmtc1 $8, $f16 \n\t"
"dli $8, 0xb1 \n\t"
"dmtc1 $8, $f18 \n\t"
"dli $8, 0x2 \n\t"
"dmtc1 $8, $f20 \n\t"
PTR_SUBU "$8, %[pPred], %[kiStride] \n\t"
"gsldxc1 $f0, 0x0($8, $0) \n\t"
"xor $f28, $f28, $f28 \n\t"
"punpckhbh $f2, $f0, $f28 \n\t"
"punpcklbh $f0, $f0, $f28 \n\t"
"pshufh $f4, $f0, $f16 \n\t"
"pshufh $f6, $f2, $f16 \n\t"
"paddh $f0, $f0, $f4 \n\t"
"paddh $f2, $f2, $f6 \n\t"
"pshufh $f8, $f0, $f18 \n\t"
"pshufh $f14, $f2, $f18 \n\t"
"paddh $f2, $f2, $f14 \n\t"
"paddh $f0, $f0, $f8 \n\t"
"gslqc1 $f26, $f24, 0x0(%[mmi_wd_0x02]) \n\t"
"paddh $f0, $f0, $f24 \n\t"
"paddh $f2, $f2, $f26 \n\t"
"psrah $f0, $f0, $f20 \n\t"
"psrah $f2, $f2, $f20 \n\t"
"packushb $f0, $f0, $f2 \n\t"
"gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
PTR_ADDU "%[pPred], %[pPred], %[kiStride] \n\t"
"gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
: [pPred] "+&r"((unsigned char *)pPred)
: [kiStride] "r"((int)kiStride), [mmi_wd_0x02] "r"((short *)mmi_wd_0x02)
: "memory", "$8", "$f0", "$f2", "$f4", "$f6"
);
RECOVER_REG;
}
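/*
 * WelsDecoderI4x4LumaPredH_mmi: 4x4 luma horizontal prediction.  Each of the
 * four rows is filled with four copies of its left neighbour.
 */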
void WelsDecoderI4x4LumaPredH_mmi(uint8_t *pPred, const int32_t kiStride) {
__asm__ volatile(
".set arch=loongson3a \n\t"
"gsldxc1 $f8, 0x0(%[mmi_01bytes], $0) \n\t"
"lbu $8, -0x1(%[pPred]) \n\t"
"dmtc1 $8, $f0 \n\t"
"pmuluw $f0, $f0, $f8 \n\t"
PTR_ADDU "$9, %[pPred], %[kiStride] \n\t"
"lbu $8, -0x1($9) \n\t"
"dmtc1 $8, $f2 \n\t"
"pmuluw $f2, $f2, $f8 \n\t"
PTR_ADDU "$10, $9, %[kiStride] \n\t"
"lbu $8, -0x1($10) \n\t"
"dmtc1 $8, $f4 \n\t"
"pmuluw $f4, $f4, $f8 \n\t"
PTR_ADDU "$11, $10, %[kiStride] \n\t"
"lbu $8, -0x1($11) \n\t"
"dmtc1 $8, $f6 \n\t"
"pmuluw $f6, $f6, $f8 \n\t"
"gsswxc1 $f0, 0x0(%[pPred], $0) \n\t"
"gsswxc1 $f2, 0x0($9, $0) \n\t"
"gsswxc1 $f4, 0x0($10, $0) \n\t"
"gsswxc1 $f6, 0x0($11, $0) \n\t"
: [pPred] "+&r"((unsigned char *)pPred)
: [kiStride] "r"((int)kiStride),
[mmi_01bytes] "r"((unsigned char *)mmi_01bytes)
: "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8"
);
}