shithub: openh264

ref: 1cf8b7ada9e2ac9506d6e8d953f1d389a05b1d2e
dir: /codec/processing/src/mips/vaa_mmi.c/

View raw version
/*!
 * \copy
 *     Copyright (c)  2009-2018, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * \file    vaa_mmi.c
 *
 * \brief   Loongson optimization
 *
 * \date    23/07/2018 Created
 *
 *************************************************************************************
 */
#include <stdint.h>
#include "asmdefs_mmi.h"

// Byte-wise maximum reduction of the two registers passed as f0 and f2, done
// in three pmaxub folding steps: against the upper word (punpckhwd), against
// a pshufh-shuffled copy, and against a copy shifted right with dsrl. The
// results are written back into f0 and f2.
//   f0, f2 - registers reduced in place
//   f4     - register holding the pshufh selector; f4 is 0x1
//   f6     - register holding the dsrl shift amount; f6 is 0x8
// The "$f4"/"$f6" spelled inside the string literals are the physical
// registers, not the macro parameters (parameters are only substituted where
// they are stringified with #). They are overwritten as scratch, so the f4/f6
// arguments must name other registers (callers in this file pass $f8/$f10).
// NOTE(review): presumably the maximum ends up in the lowest byte of f0/f2,
// which callers then store with "sb" - confirm against the Loongson MMI manual.
#define WELS_MAX_REG_MMI(f0, f2, f4, f6) \
  "punpckhwd  $f4, "#f0", "#f0"    \n\t" \
  "punpckhwd  $f6, "#f2", "#f2"    \n\t" \
  "pmaxub     "#f0", "#f0", $f4    \n\t" \
  "pmaxub     "#f2", "#f2", $f6    \n\t" \
  "pshufh     $f4, "#f0", "#f4"    \n\t" \
  "pshufh     $f6, "#f2", "#f4"    \n\t" \
  "pmaxub     "#f0", "#f0", $f4    \n\t" \
  "pmaxub     "#f2", "#f2", $f6    \n\t" \
  "dsrl       $f4, "#f0", "#f6"    \n\t" \
  "dsrl       $f6, "#f2", "#f6"    \n\t" \
  "pmaxub     "#f0", "#f0", $f4    \n\t" \
  "pmaxub     "#f2", "#f2", $f6    \n\t"

// One 16-byte row of the SAD / sum / max-abs-diff kernel.
// Loads 16 bytes at r0 into $f4,$f6 and 16 bytes at r1 into $f8,$f10, updates
// the accumulator registers passed as arguments, then advances r0 and r1 by
// r2. Each accumulator pair covers the two 8-byte halves of the row:
//   f4,  f6  += biadd(pasubub(row at r0, $f0/$f2))
//   f8,  f10 += biadd(pasubub(row at r1, $f0/$f2))
//   f12, f14  = pmaxub(f12/f14, pasubub(row at r0, row at r1))
//   f0,  f2  += biadd(pasubub(that difference, $f0/$f2))
// The physical registers $f0/$f2 are read as constants (callers in this file
// zero them first, so the pasubub-against-constant steps leave the bytes
// unchanged) and $f4..$f14 are clobbered as scratch. "$fN" inside a string
// literal is a physical register; only the stringified #fN are the macro
// arguments, so the arguments must not be any of $f0..$f14.
// NOTE(review): pasubub = per-byte absolute difference, biadd = horizontal
// byte sum, pmaxub = per-byte unsigned max, judging by the mnemonics - confirm
// against the Loongson MMI manual.
#define WELS_SAD_SD_MAD_16x1_MMI(f0, f2, f4, f6, f8, f10, f12, f14, r0, r1, r2) \
  "gslqc1     $f6, $f4, 0x0("#r0")                \n\t" \
  "gslqc1     $f10, $f8, 0x0("#r1")               \n\t" \
  "pasubub    $f12, $f4, $f0                      \n\t" \
  "pasubub    $f14, $f6, $f2                      \n\t" \
  "biadd      $f12, $f12                          \n\t" \
  "biadd      $f14, $f14                          \n\t" \
  "paddw      "#f4", "#f4", $f12                  \n\t" \
  "paddw      "#f6", "#f6", $f14                  \n\t" \
  "pasubub    $f12, $f8, $f0                      \n\t" \
  "pasubub    $f14, $f10, $f2                     \n\t" \
  "biadd      $f12, $f12                          \n\t" \
  "biadd      $f14, $f14                          \n\t" \
  "paddw      "#f8", "#f8", $f12                  \n\t" \
  "paddw      "#f10", "#f10", $f14                \n\t" \
  "pasubub    $f12, $f4, $f8                      \n\t" \
  "pasubub    $f14, $f6, $f10                     \n\t" \
  "pmaxub     "#f12", "#f12", $f12                \n\t" \
  "pmaxub     "#f14", "#f14", $f14                \n\t" \
  "pasubub    $f12, $f12, $f0                     \n\t" \
  "pasubub    $f14, $f14, $f2                     \n\t" \
  "biadd      $f12, $f12                          \n\t" \
  "biadd      $f14, $f14                          \n\t" \
  "paddw      "#f0", "#f0", $f12                  \n\t" \
  "paddw      "#f2", "#f2", $f14                  \n\t" \
  PTR_ADDU   ""#r0", "#r0", "#r2"                 \n\t" \
  PTR_ADDU   ""#r1", "#r1", "#r2"                 \n\t"

// Two 16-byte rows of a plain SAD kernel. Unlike the other row macros in this
// file, every register used here is a macro argument.
//   r1, r2   - the two source pointers; each is advanced by r3 twice in
//              total (once between the two rows, once at the end)
//   r3       - byte distance between consecutive rows
//   f1..f8   - scratch: (f1,f2)/(f3,f4) receive the first row from r1/r2,
//              (f5,f6)/(f7,f8) receive the second row from r1/r2
//   f9       += biadd(pasubub(f1, f3)) + biadd(pasubub(f5, f7))
//   f10      += biadd(pasubub(f2, f4)) + biadd(pasubub(f6, f8))
// So f9 and f10 each accumulate one 8-byte half of the 16-byte rows.
// NOTE(review): pasubub = per-byte absolute difference and biadd = horizontal
// byte sum, judging by the mnemonics; which gslqc1 operand receives the lower
// 8 bytes is not visible here - confirm against the Loongson MMI manual.
#define WELS_SAD_16x2_MMI(f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, r1, r2, r3) \
  "gslqc1     "#f1",  "#f2",  0x00("#r1")         \n\t" \
  "gslqc1     "#f3",  "#f4",  0x00("#r2")         \n\t" \
  PTR_ADDU    ""#r1", "#r1",  "#r3"               \n\t" \
  "gslqc1     "#f5",  "#f6",  0x00("#r1")         \n\t" \
  PTR_ADDU    ""#r2", "#r2",  "#r3"               \n\t" \
  "gslqc1     "#f7",  "#f8",  0x00("#r2")         \n\t" \
  "pasubub    "#f1",  "#f1",  "#f3"               \n\t" \
  "pasubub    "#f2",  "#f2",  "#f4"               \n\t" \
  "biadd      "#f1",  "#f1"                       \n\t" \
  "biadd      "#f2",  "#f2"                       \n\t" \
  "pasubub    "#f5",  "#f5",  "#f7"               \n\t" \
  "pasubub    "#f6",  "#f6",  "#f8"               \n\t" \
  "biadd      "#f5",  "#f5"                       \n\t" \
  "biadd      "#f6",  "#f6"                       \n\t" \
  "paddw      "#f9",  "#f9",  "#f1"               \n\t" \
  "paddw      "#f9",  "#f9",  "#f5"               \n\t" \
  "paddw      "#f10", "#f10", "#f2"               \n\t" \
  "paddw      "#f10", "#f10", "#f6"               \n\t" \
  PTR_ADDU    ""#r1", "#r1",  "#r3"               \n\t" \
  PTR_ADDU    ""#r2", "#r2",  "#r3"               \n\t"

// One 16-byte row of the SAD / sum / square-sum / squared-difference kernel.
// Only the pointers are parameters; every floating-point register is fixed:
//   $f0,  $f2  - read as constants (the caller in this file zeroes them)
//   $f4..$f14  - scratch; $f4,$f6 receive the row at r0, $f8,$f10 the row at r1
//   $f28, $f30 += biadd(pasubub(row at r0, row at r1))
//   $f24, $f26 += biadd(pasubub(row at r0, $f0/$f2))
//   $f20, $f22 += pmaddhw squares of the r0 bytes after widening them with
//                 punpcklbh/punpckhbh against $f0/$f2
//   $f16, $f18 += pmaddhw squares of the pasubub(row at r0, row at r1) bytes,
//                 widened the same way
// r0 and r1 are advanced by r2 at the end.
// NOTE(review): with $f0/$f2 zero this presumably yields SAD, pixel sum,
// sum of squares and sum of squared differences respectively - the
// instruction semantics are inferred from the mnemonics; confirm against the
// Loongson MMI manual.
#define WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI(r0, r1, r2) \
  "gslqc1     $f6, $f4, 0x0("#r0")                \n\t" \
  "gslqc1     $f10, $f8, 0x0("#r1")               \n\t" \
  "pasubub    $f12, $f4, $f8                      \n\t" \
  "pasubub    $f14, $f6, $f10                     \n\t" \
  "biadd      $f12, $f12                          \n\t" \
  "biadd      $f14, $f14                          \n\t" \
  "paddw      $f28, $f28, $f12                    \n\t" \
  "paddw      $f30, $f30, $f14                    \n\t" \
  "pasubub    $f12, $f4, $f8                      \n\t" \
  "pasubub    $f14, $f6, $f10                     \n\t" \
  "pasubub    $f8, $f4, $f0                       \n\t" \
  "pasubub    $f10, $f6, $f2                      \n\t" \
  "biadd      $f8, $f8                            \n\t" \
  "biadd      $f10, $f10                          \n\t" \
  "paddw      $f24, $f24, $f8                     \n\t" \
  "paddw      $f26, $f26, $f10                    \n\t" \
  "punpcklbh  $f8, $f6, $f2                       \n\t" \
  "punpckhbh  $f10, $f6, $f2                      \n\t" \
  "punpckhbh  $f6, $f4, $f0                       \n\t" \
  "punpcklbh  $f4, $f4, $f0                       \n\t" \
  "pmaddhw    $f4, $f4, $f4                       \n\t" \
  "pmaddhw    $f6, $f6, $f6                       \n\t" \
  "pmaddhw    $f8, $f8, $f8                       \n\t" \
  "pmaddhw    $f10, $f10, $f10                    \n\t" \
  "paddw      $f20, $f20, $f4                     \n\t" \
  "paddw      $f22, $f22, $f6                     \n\t" \
  "paddw      $f20, $f20, $f8                     \n\t" \
  "paddw      $f22, $f22, $f10                    \n\t" \
  "punpcklbh  $f4, $f12, $f0                      \n\t" \
  "punpckhbh  $f6, $f12, $f0                      \n\t" \
	"punpcklbh  $f12, $f14, $f2                     \n\t" \
	"punpckhbh  $f14, $f14, $f2                     \n\t" \
  "pmaddhw    $f4, $f4, $f4                       \n\t" \
  "pmaddhw    $f6, $f6, $f6                       \n\t" \
  "pmaddhw    $f12, $f12, $f12                    \n\t" \
  "pmaddhw    $f14, $f14, $f14                    \n\t" \
  "paddw      $f16, $f16, $f4                     \n\t" \
  "paddw      $f18, $f18, $f6                     \n\t" \
  "paddw      $f16, $f16, $f12                    \n\t" \
  "paddw      $f18, $f18, $f14                    \n\t" \
  PTR_ADDU   ""#r0", "#r0", "#r2"                 \n\t" \
  PTR_ADDU   ""#r1", "#r1", "#r2"                 \n\t"

// One 16-byte row of the combined SAD / sums / max-abs-diff / square-sum /
// squared-difference kernel. Loads the row at r0 into $f4,$f6 and the row at
// r1 into $f8,$f10, updates the accumulator arguments, then advances r0 and
// r1 by r2.
// The physical registers $f0/$f2 are read as constants (the caller in this
// file zeroes them) and $f4..$f14 are scratch. "$fN" inside a string literal
// is a physical register; only the stringified #fN are the macro arguments,
// so the arguments must not be any of $f0..$f14.
// Accumulator arguments, in the order the code updates them:
//   f0,  f2  += pmaddhw squares of the r0 bytes, first folded into a single
//               word and moved to the upper word by the punpckhwd/punpcklwd
//               pairs against $f0
//   f4,  f6  += biadd(pasubub(row at r0, $f0/$f2))
//   f4,  f6  += biadd(pasubub(row at r1, $f0/$f2)), moved to the upper word
//               by the punpcklwd/punpckhwd sequence before the add
//   f8,  f10  = pmaxub(f8/f10, pasubub(row at r0, row at r1))
//   f0,  f2  += biadd(pasubub(that difference, $f0/$f2))
//   f12, f14 += pmaddhw squares of that difference
// The "paddw $fX, $f0, $fY" lines add the constant register $f0 to $fY, i.e.
// they act as register copies while $f0 is zero.
// NOTE(review): the net effect appears to be two values packed per
// accumulator (f0/f2: SAD in the lower word, square sum in the upper word;
// f4/f6: r0 sum in the lower word, r1 sum in the upper word). This assumes
// biadd leaves its sum in the lower word with the upper word clear - confirm
// against the Loongson MMI manual and the caller's unpacking code.
#define WELS_SAD_BGD_SQDIFF_16x1_MMI(f0, f2, f4, f6, f8, f10, f12, f14, r0, r1, r2) \
  "gslqc1     $f6, $f4, 0x0("#r0")                \n\t" \
  "punpcklbh  $f8, $f4, $f0                       \n\t" \
  "punpckhbh  $f10, $f4, $f0                      \n\t" \
  "punpcklbh  $f12, $f6, $f2                      \n\t" \
  "punpckhbh  $f14, $f6, $f2                      \n\t" \
  "pmaddhw    $f8, $f8, $f8                       \n\t" \
  "pmaddhw    $f10, $f10, $f10                    \n\t" \
  "pmaddhw    $f12, $f12, $f12                    \n\t" \
  "pmaddhw    $f14, $f14, $f14                    \n\t" \
  "paddw      $f8, $f8, $f12                      \n\t" \
  "paddw      $f10, $f10, $f14                    \n\t" \
  "punpckhwd  $f12, $f0, $f8                      \n\t" \
  "punpckhwd  $f14, $f0, $f10                     \n\t" \
  "punpcklwd  $f8, $f0, $f8                       \n\t" \
  "punpcklwd  $f10, $f0, $f10                     \n\t" \
  "paddw      $f8, $f8, $f12                      \n\t" \
  "paddw      $f10, $f10, $f14                    \n\t" \
  "paddw      "#f0", "#f0", $f8                   \n\t" \
  "paddw      "#f2", "#f2", $f10                  \n\t" \
  "gslqc1     $f10, $f8, 0x0("#r1")               \n\t" \
  "pasubub    $f12, $f4, $f0                      \n\t" \
  "pasubub    $f14, $f6, $f2                      \n\t" \
  "biadd      $f12, $f12                          \n\t" \
  "biadd      $f14, $f14                          \n\t" \
  "paddw      "#f4", "#f4", $f12                  \n\t" \
  "paddw      "#f6", "#f6", $f14                  \n\t" \
  "pasubub    $f12, $f8, $f0                      \n\t" \
  "pasubub    $f14, $f10, $f2                     \n\t" \
  "biadd      $f12, $f12                          \n\t" \
  "biadd      $f14, $f14                          \n\t" \
  "punpcklwd  $f14, $f14, $f14                    \n\t" \
  "punpckhwd  $f14, $f12, $f14                    \n\t" \
  "punpcklwd  $f12, $f0, $f12                     \n\t" \
  "paddw      "#f4", "#f4", $f12                  \n\t" \
  "paddw      "#f6", "#f6", $f14                  \n\t" \
  "pasubub    $f12, $f4, $f8                      \n\t" \
  "pasubub    $f14, $f6, $f10                     \n\t" \
  "pmaxub     "#f8", "#f8", $f12                  \n\t" \
  "pmaxub     "#f10", "#f10", $f14                \n\t" \
  "paddw      $f4, $f0, $f12                      \n\t" \
  "paddw      $f6, $f0, $f14                      \n\t" \
  "pasubub    $f12, $f12, $f0                     \n\t" \
  "pasubub    $f14, $f14, $f2                     \n\t" \
  "biadd      $f12, $f12                          \n\t" \
  "biadd      $f14, $f14                          \n\t" \
  "paddw      "#f0", "#f0", $f12                  \n\t" \
  "paddw      "#f2", "#f2", $f14                  \n\t" \
  "paddw      $f12, $f0, $f4                      \n\t" \
  "paddw      $f14, $f0, $f6                      \n\t" \
  "punpcklbh  $f4, $f12, $f0                      \n\t" \
  "punpckhbh  $f6, $f12, $f0                      \n\t" \
  "punpcklbh  $f12, $f14, $f2                     \n\t" \
  "punpckhbh  $f14, $f14, $f2                     \n\t" \
  "pmaddhw    $f4, $f4, $f4                       \n\t" \
  "pmaddhw    $f6, $f6, $f6                       \n\t" \
  "pmaddhw    $f12, $f12, $f12                    \n\t" \
  "pmaddhw    $f14, $f14, $f14                    \n\t" \
  "paddw      "#f12", "#f12", $f4                 \n\t" \
  "paddw      "#f14", "#f14", $f6                 \n\t" \
  "paddw      "#f12", "#f12", $f12                \n\t" \
  "paddw      "#f14", "#f14", $f14                \n\t" \
  PTR_ADDU   ""#r0", "#r0", "#r2"                 \n\t" \
  PTR_ADDU   ""#r1", "#r1", "#r2"                 \n\t"

// One 16-byte row of the SAD / sum / square-sum kernel.
// Only the pointers are parameters; every floating-point register is fixed:
//   $f0,  $f2  - read as constants (expected to be zeroed by the caller)
//   $f4..$f14  - scratch; $f4,$f6 receive the row at r0, $f8,$f10 the row at r1
//   $f24, $f26 += biadd(pasubub(row at r0, row at r1))
//   $f20, $f22 += biadd(pasubub(row at r0, $f0/$f2))
//   $f16, $f18 += pmaddhw squares of the r0 bytes after widening them with
//                 punpcklbh/punpckhbh against $f0/$f2
// r0 and r1 are advanced by r2 at the end.
// NOTE(review): with $f0/$f2 zero this presumably yields SAD, pixel sum and
// sum of squares respectively - the instruction semantics are inferred from
// the mnemonics; confirm against the Loongson MMI manual.
#define WELS_SAD_SUM_SQSUM_16x1_MMI(r0, r1, r2) \
  "gslqc1     $f6, $f4, 0x0("#r0")                \n\t" \
  "gslqc1     $f10, $f8, 0x0("#r1")               \n\t" \
  "pasubub    $f12, $f4, $f8                      \n\t" \
  "pasubub    $f14, $f6, $f10                     \n\t" \
  "biadd      $f12, $f12                          \n\t" \
  "biadd      $f14, $f14                          \n\t" \
  "paddw      $f24, $f24, $f12                    \n\t" \
  "paddw      $f26, $f26, $f14                    \n\t" \
  "pasubub    $f12, $f4, $f0                      \n\t" \
  "pasubub    $f14, $f6, $f2                      \n\t" \
  "biadd      $f12, $f12                          \n\t" \
  "biadd      $f14, $f14                          \n\t" \
  "paddw      $f20, $f20, $f12                    \n\t" \
  "paddw      $f22, $f22, $f14                    \n\t" \
  "punpcklbh  $f8, $f6, $f2                       \n\t" \
  "punpckhbh  $f10, $f6, $f2                      \n\t" \
  "punpckhbh  $f6, $f4, $f0                       \n\t" \
  "punpcklbh  $f4, $f4, $f0                       \n\t" \
  "pmaddhw    $f4, $f4, $f4                       \n\t" \
  "pmaddhw    $f6, $f6, $f6                       \n\t" \
  "pmaddhw    $f8, $f8, $f8                       \n\t" \
  "pmaddhw    $f10, $f10, $f10                    \n\t" \
  "paddw      $f16, $f16, $f4                     \n\t" \
  "paddw      $f18, $f18, $f6                     \n\t" \
  "paddw      $f16, $f16, $f8                     \n\t" \
  "paddw      $f18, $f18, $f10                    \n\t" \
  PTR_ADDU   ""#r0", "#r0", "#r2"                 \n\t" \
  PTR_ADDU   ""#r1", "#r1", "#r2"                 \n\t"

/*!
 * \brief   Sum of absolute differences between two pictures, walked in tiles
 *          of 16 bytes by 16 rows.
 *
 * \param   pCurData    first picture; rows are iPicStride bytes apart
 * \param   pRefData    second picture, same layout
 * \param   iPicWidth   picture width; shifted right by 4 to get tiles per row
 * \param   iPicHeight  picture height; shifted right by 4 to get tile rows
 * \param   iPicStride  byte distance between consecutive rows
 * \param   pFrameSad   receives one 32-bit total accumulated over all tiles
 * \param   pSad8x8     receives four 32-bit values per tile: two after the
 *                      first 8 rows and two after the last 8 rows (one per
 *                      8-byte half of the rows)
 *
 * NOTE(review): both loops are bottom-tested (decrement, then bnez), so a
 * width or height below 16 gives a zero tile count that is decremented past
 * zero before the test - callers presumably guarantee at least 16x16.
 * NOTE(review): gslqc1 is a 16-byte load and presumably needs 16-byte aligned
 * addresses - confirm against the Loongson manual.
 */
void VAACalcSad_mmi(const uint8_t* pCurData, const uint8_t* pRefData,
                    int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
                    int32_t* pFrameSad, int32_t* pSad8x8) {
  double ftmp[13];
  uint64_t tmp[2];
  mips_reg addr[3];

  __asm__ volatile (
    ".set       arch=loongson3a                                     \n\t"
    // Convert the dimensions to tile counts; keep the row stride in addr2
    // and turn iPicStride into the 16-row stride.
    PTR_SRL    "%[iPicWidth],   %[iPicWidth],   0x04                \n\t"
    PTR_SRL    "%[iPicHeight],  %[iPicHeight],  0x04                \n\t"
    "move       %[addr2],       %[iPicStride]                       \n\t"
    PTR_SLL    "%[iPicStride],  %[iPicStride],  0x04                \n\t"
    // ftmp11/ftmp12 accumulate the whole-picture total. ftmp0 is cleared
    // here but not referenced again in this function.
    "xor        %[ftmp0],       %[ftmp0],       %[ftmp0]            \n\t"
    "xor        %[ftmp11],      %[ftmp11],      %[ftmp11]           \n\t"
    "xor        %[ftmp12],      %[ftmp12],      %[ftmp12]           \n\t"
    // Outer loop: one iteration per row of tiles.
    "1:                                                             \n\t"
    "move       %[addr0],       %[pCurData]                         \n\t"
    "move       %[addr1],       %[pRefData]                         \n\t"
    "move       %[tmp0],        %[iPicWidth]                        \n\t"
    // Inner loop: one iteration per tile.
    "2:                                                             \n\t"
    // First 8 rows of the tile (4 x two rows), accumulated in ftmp9/ftmp10.
    "xor        %[ftmp9],       %[ftmp9],       %[ftmp9]            \n\t"
    "xor        %[ftmp10],      %[ftmp10],      %[ftmp10]           \n\t"
    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
                      %[addr0], %[addr1], %[addr2])
    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
                      %[addr0], %[addr1], %[addr2])
    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
                      %[addr0], %[addr1], %[addr2])
    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
                      %[addr0], %[addr1], %[addr2])
    // Fold into the picture total and store the two partial results.
    "paddw      %[ftmp11],      %[ftmp11],      %[ftmp9]            \n\t"
    "paddw      %[ftmp12],      %[ftmp12],      %[ftmp10]           \n\t"
    "swc1       %[ftmp10],      0x00(%[pSad8x8])                    \n\t"
    "swc1       %[ftmp9],       0x04(%[pSad8x8])                    \n\t"

    // Last 8 rows of the tile, same procedure, stored at offsets 8 and 12.
    "xor        %[ftmp9],       %[ftmp9],       %[ftmp9]            \n\t"
    "xor        %[ftmp10],      %[ftmp10],      %[ftmp10]           \n\t"
    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
                      %[addr0], %[addr1], %[addr2])
    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
                      %[addr0], %[addr1], %[addr2])
    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
                      %[addr0], %[addr1], %[addr2])
    WELS_SAD_16x2_MMI(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],
                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],
                      %[addr0], %[addr1], %[addr2])
    "paddw      %[ftmp11],      %[ftmp11],      %[ftmp9]            \n\t"
    "paddw      %[ftmp12],      %[ftmp12],      %[ftmp10]           \n\t"
    "swc1       %[ftmp10],      0x08(%[pSad8x8])                    \n\t"
    "swc1       %[ftmp9],       0x0c(%[pSad8x8])                    \n\t"

    // Next output slot; rewind the 16 rows just consumed and step 16 bytes
    // to the right for the next tile.
    PTR_ADDU   "%[pSad8x8],     %[pSad8x8],     0x10                \n\t"
    PTR_SUBU   "%[addr0],       %[addr0],       %[iPicStride]       \n\t"
    PTR_SUBU   "%[addr1],       %[addr1],       %[iPicStride]       \n\t"
    PTR_ADDI   "%[tmp0],        %[tmp0],        -0x01               \n\t"
    PTR_ADDU   "%[addr0],       %[addr0],       0x10                \n\t"
    PTR_ADDU   "%[addr1],       %[addr1],       0x10                \n\t"
    "bnez       %[tmp0],        2b                                  \n\t"

    // Move both base pointers down by 16 rows for the next row of tiles.
    PTR_ADDI   "%[iPicHeight],  %[iPicHeight],  -0x01               \n\t"
    PTR_ADDU   "%[pCurData],    %[pCurData],    %[iPicStride]       \n\t"
    PTR_ADDU   "%[pRefData],    %[pRefData],    %[iPicStride]       \n\t"
    "bnez       %[iPicHeight],  1b                                  \n\t"

    // Combine the two running totals and store the picture total.
    "paddw      %[ftmp11],      %[ftmp11],      %[ftmp12]           \n\t"
    "swc1       %[ftmp11],      0x00(%[pFrameSad])                  \n\t"
    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
      [ftmp12]"=&f"(ftmp[12]),          [tmp0]"=&r"(tmp[0]),
      [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
      [pCurData]"+&r"(pCurData),        [pRefData]"+&r"(pRefData),
      [iPicHeight]"+&r"(iPicHeight),    [iPicWidth]"+&r"(iPicWidth),
      [pSad8x8]"+&r"(pSad8x8),          [iPicStride]"+&r"(iPicStride),
      [addr2]"=&r"(addr[2])
    : [pFrameSad]"r"(pFrameSad)
    : "memory"
  );
}

/*!
 * \brief   Per-tile SAD, sum difference and maximum absolute difference
 *          between two pictures, walked in tiles of 16 bytes by 16 rows.
 *
 * \param   cur_data    first picture; rows are iPicStride bytes apart
 * \param   ref_data    second picture, same layout
 * \param   iPicWidth   picture width; shifted right by 4 to get tiles per row
 * \param   iPicHeight  picture height; shifted right by 4 to get tile rows
 * \param   iPicStride  byte distance between consecutive rows
 * \param   psadframe   receives one 32-bit total accumulated over all tiles
 * \param   psad8x8     receives 16 bytes (four 32-bit values) per tile
 * \param   p_sd8x8     receives 16 bytes per tile: the cur_data accumulators
 *                      minus the ref_data accumulators (psubw)
 * \param   p_mad8x8    receives four bytes per tile: two after the first 8
 *                      rows and two after the last 8 rows
 *
 * Fixed general-purpose registers used by the asm:
 *   $8  scratch            $9  saved tile count per row
 *   $10 cur_data row start $11 ref_data row start
 *   $13 iPicStride * 16    $14 running picture total
 *   $15 moving cur_data pointer
 *
 * NOTE(review): both loops are bottom-tested (decrement, then bnez), so a
 * width or height below 16 gives a zero count that is decremented past zero
 * before the test - callers presumably guarantee at least 16x16.
 * NOTE(review): BACKUP_REG / RECOVER_REG come from asmdefs_mmi.h; presumably
 * they save and restore registers the asm clobbers - confirm there.
 */
void VAACalcSadBgd_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
                       int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
                       int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8,
                       uint8_t *p_mad8x8) {
  BACKUP_REG;
  __asm__ volatile (
    ".set       arch=loongson3a                           \n\t"
    "move       $15, %[cur_data]                          \n\t"
    "dsrl       %[iPicWidth], %[iPicWidth], 0x4           \n\t"
    "dsrl       %[iPicHeight], %[iPicHeight], 0x4         \n\t"
    "dsll       $13, %[iPicStride], 0x4                   \n\t"
    // $f0/$f2 stay zero for the whole function; the row macro reads them as
    // constants.
    "xor        $f0, $f0, $f0                             \n\t"
    "xor        $f2, $f2, $f2                             \n\t"
    "xor        $14, $14, $14                             \n\t"
    // Outer loop: one iteration per row of tiles. Remember the tile count
    // and both row-start pointers so they can be restored afterwards.
    "1:                                                   \n\t"
    "move       $9, %[iPicWidth]                          \n\t"
    "move       $10, $15                                  \n\t"
    "move       $11, %[ref_data]                          \n\t"
    // Inner loop: one iteration per tile. Clear the per-tile accumulators.
    "2:                                                   \n\t"
    "xor        $f28, $f28, $f28                          \n\t"
    "xor        $f30, $f30, $f30                          \n\t"
    "xor        $f24, $f24, $f24                          \n\t"
    "xor        $f26, $f26, $f26                          \n\t"
    "xor        $f20, $f20, $f20                          \n\t"
    "xor        $f22, $f22, $f22                          \n\t"
    "xor        $f16, $f16, $f16                          \n\t"
    "xor        $f18, $f18, $f18                          \n\t"
    // First 8 rows of the tile.
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])

    // Reduce the max-abs-diff accumulators; $f8 = 0x1 and $f10 = 0x8 are the
    // shuffle selector and shift amount the reduction macro expects.
    "dli        $8, 0x1                                   \n\t"
    "dmtc1      $8, $f8                                   \n\t"
    "dli        $8, 0x8                                   \n\t"
    "dmtc1      $8, $f10                                  \n\t"
    WELS_MAX_REG_MMI($f16, $f18, $f8, $f10)

    // Store one byte from each reduced register.
    "dmfc1      $8, $f16                                  \n\t"
    "sb         $8, 0x0(%[p_mad8x8])                      \n\t"
    "dmfc1      $8, $f18                                  \n\t"
    "sb         $8, 0x1(%[p_mad8x8])                      \n\t"
    PTR_ADDIU  "%[p_mad8x8], %[p_mad8x8], 0x2             \n\t"

    // Restart the max accumulators and rearrange the word lanes of the other
    // accumulators before the second half.
    // NOTE(review): this appears to park the first-half results in the upper
    // words so the lower words can accumulate the last 8 rows - confirm.
    "xor        $f16, $f16, $f16                          \n\t"
    "xor        $f18, $f18, $f18                          \n\t"
    "punpcklwd  $f30, $f30, $f30                          \n\t"
    "punpcklwd  $f26, $f26, $f26                          \n\t"
    "punpcklwd  $f22, $f22, $f22                          \n\t"

    "punpckhwd  $f30, $f28, $f30                          \n\t"
    "punpckhwd  $f26, $f24, $f26                          \n\t"
    "punpckhwd  $f22, $f20, $f22                          \n\t"

    "punpcklwd  $f28, $f16, $f28                          \n\t"
    "punpcklwd  $f24, $f16, $f24                          \n\t"
    "punpcklwd  $f20, $f16, $f20                          \n\t"

    // Last 8 rows of the tile.
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])
    WELS_SAD_SD_MAD_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16, $f18,
                             $15, %[ref_data], %[iPicStride])

    // Second max-abs-diff reduction and store, as above.
    "dli        $8, 0x1                                   \n\t"
    "dmtc1      $8, $f8                                   \n\t"
    "dli        $8, 0x8                                   \n\t"
    "dmtc1      $8, $f10                                  \n\t"
    WELS_MAX_REG_MMI($f16, $f18, $f8, $f10)

    "dmfc1      $8, $f16                                  \n\t"
    "sb         $8, 0x0(%[p_mad8x8])                      \n\t"
    "dmfc1      $8, $f18                                  \n\t"
    "sb         $8, 0x1(%[p_mad8x8])                      \n\t"
    "punpckhwd  $f4, $f28, $f30                           \n\t"
    PTR_ADDIU  "%[p_mad8x8], %[p_mad8x8], 0x2             \n\t"

    // Interleave the upper and lower words of $f28/$f30 and store the 16
    // bytes for this tile.
    "punpcklwd  $f6, $f28, $f30                           \n\t"
    "gssqc1     $f6, $f4, 0x0(%[psad8x8])                 \n\t"
    PTR_ADDIU  "%[psad8x8], %[psad8x8], 0x10              \n\t"

    // Fold the tile's values together and add them to the running picture
    // total kept in $14 (round trip through $f6).
    "paddw      $f6, $f6, $f30                            \n\t"
    "paddw      $f4, $f4, $f28                            \n\t"
    "punpckhwd  $f8, $f6, $f6                             \n\t"
    "paddw      $f4, $f4, $f8                             \n\t"
    "dmtc1      $14, $f6                                  \n\t"
    "paddw      $f6, $f6, $f4                             \n\t"
    "dmfc1      $14, $f6                                  \n\t"

    // cur_data accumulators minus ref_data accumulators, interleaved and
    // stored the same way.
    "psubw      $f24, $f24, $f20                          \n\t"
    "psubw      $f26, $f26, $f22                          \n\t"
    "punpckhwd  $f4, $f24, $f26                           \n\t"
    "punpcklwd  $f6, $f24, $f26                           \n\t"
    "gssqc1     $f6, $f4, 0x0(%[p_sd8x8])                 \n\t"
    PTR_ADDIU  "%[p_sd8x8], %[p_sd8x8], 0x10              \n\t"

    // Rewind the 16 rows just consumed and step 16 bytes to the right.
    PTR_SUBU   "$15, $15, $13                             \n\t"
    PTR_SUBU   "%[ref_data], %[ref_data], $13             \n\t"
    PTR_ADDIU  "$15, $15, 0x10                            \n\t"
    PTR_ADDIU  "%[ref_data], %[ref_data], 0x10            \n\t"

    PTR_ADDIU  "%[iPicWidth], %[iPicWidth], -0x1          \n\t"
    "bnez       %[iPicWidth], 2b                          \n\t"
    // Restore the tile count and row starts, then move down 16 rows.
    "move       %[iPicWidth], $9                          \n\t"
    "move       $15, $10                                  \n\t"
    "move       %[ref_data], $11                          \n\t"
    PTR_ADDU   "$15, $15, $13                             \n\t"
    PTR_ADDU   "%[ref_data], %[ref_data], $13             \n\t"

    PTR_ADDIU  "%[iPicHeight], %[iPicHeight], -0x1        \n\t"
    "bnez       %[iPicHeight], 1b                         \n\t"

    // Store the 32-bit picture total with the swl/swr pair.
    "swl        $14, 0x3(%[psadframe])                    \n\t"
    "swr        $14, 0x0(%[psadframe])                    \n\t"
    : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
      [iPicHeight]"+&r"((int)iPicHeight), [psad8x8]"+&r"((int *)psad8x8),
      [p_sd8x8]"+&r"((int *)p_sd8x8), [p_mad8x8]"+&r"((unsigned char *)p_mad8x8)
    : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
      [psadframe]"r"((int *)psadframe)
    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
      "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}

/*!
 * \brief   Per-tile SAD, sum, square sum and squared difference between two
 *          pictures, walked in tiles of 16 bytes by 16 rows.
 *
 * \param   cur_data      first picture; rows are iPicStride bytes apart
 * \param   ref_data      second picture, same layout
 * \param   iPicWidth     picture width; shifted right by 4 to get tiles per row
 * \param   iPicHeight    picture height; shifted right by 4 to get tile rows
 * \param   iPicStride    byte distance between consecutive rows
 * \param   psadframe     receives one 32-bit total accumulated over all tiles
 * \param   psad8x8       receives four 32-bit values per tile: two after the
 *                        first 8 rows and two after the last 8 rows
 * \param   psum16x16     receives one 32-bit value per tile ($f24 + $f26)
 * \param   psqsum16x16   receives one 32-bit value per tile ($f20/$f22 folded)
 * \param   psqdiff16x16  receives one 32-bit value per tile ($f16/$f18 folded)
 *
 * Fixed general-purpose registers used by the asm:
 *   $8  scratch            $9  saved tile count per row
 *   $10 cur_data row start $11 ref_data row start
 *   $12 per-half SAD       $13 iPicStride * 16
 *   $14 running picture total
 *   $15 moving cur_data pointer
 *
 * %[psad8x8] is advanced inside the asm, so it is declared as a read-write
 * early-clobber operand ("+&r"). GCC assumes an input-only operand still holds
 * its original value when the asm statement ends.
 *
 * NOTE(review): both loops are bottom-tested (decrement, then bnez), so a
 * width or height below 16 gives a zero count that is decremented past zero
 * before the test - callers presumably guarantee at least 16x16.
 * NOTE(review): BACKUP_REG / RECOVER_REG come from asmdefs_mmi.h; presumably
 * they save and restore registers the asm clobbers - confirm there.
 */
void VAACalcSadSsd_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
                       int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
                       int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
                       int32_t *psqsum16x16, int32_t *psqdiff16x16) {
  BACKUP_REG;
  __asm__ volatile (
    ".set       arch=loongson3a                           \n\t"
    "move       $15, %[cur_data]                          \n\t"
    "dsrl       %[iPicWidth], %[iPicWidth], 0x4           \n\t"
    "dsrl       %[iPicHeight], %[iPicHeight], 0x4         \n\t"
    "dsll       $13, %[iPicStride], 0x4                   \n\t"
    // $f0/$f2 stay zero for the whole function; the row macro reads them as
    // constants.
    "xor        $f0, $f0, $f0                             \n\t"
    "xor        $f2, $f2, $f2                             \n\t"
    "xor        $12, $12, $12                             \n\t"
    "xor        $14, $14, $14                             \n\t"
    // Outer loop: one iteration per row of tiles. Remember the tile count
    // and both row-start pointers so they can be restored afterwards.
    "1:                                                   \n\t"
    "move       $9, %[iPicWidth]                          \n\t"
    "move       $10, $15                                  \n\t"
    "move       $11, %[ref_data]                          \n\t"
    // Inner loop: one iteration per tile. Clear the per-tile accumulators.
    "2:                                                   \n\t"
    "xor        $f28, $f28, $f28                          \n\t"
    "xor        $f30, $f30, $f30                          \n\t"
    "xor        $f24, $f24, $f24                          \n\t"
    "xor        $f26, $f26, $f26                          \n\t"
    "xor        $f20, $f20, $f20                          \n\t"
    "xor        $f22, $f22, $f22                          \n\t"
    "xor        $f16, $f16, $f16                          \n\t"
    "xor        $f18, $f18, $f18                          \n\t"
    // First 8 rows of the tile.
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    // Store the two half-row SADs and add their sum to the picture total.
    "dmfc1      $8, $f28                                  \n\t"
    "sw         $8, 0x0(%[psad8x8])                       \n\t"
    "dmfc1      $8, $f30                                  \n\t"
    "sw         $8, 0x4(%[psad8x8])                       \n\t"
    "paddw      $f4, $f28, $f30                           \n\t"
    "dmfc1      $12, $f4                                  \n\t"
    PTR_ADDU   "$14, $14, $12                             \n\t"

    // Last 8 rows: only the SAD accumulators restart; the sum, square-sum
    // and squared-difference accumulators keep running over all 16 rows.
    "xor        $f28, $f28, $f28                          \n\t"
    "xor        $f30, $f30, $f30                          \n\t"
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_MMI($15, %[ref_data], %[iPicStride])
    "dmfc1      $8, $f28                                  \n\t"
    "sw         $8, 0x8(%[psad8x8])                       \n\t"
    "dmfc1      $8, $f30                                  \n\t"
    "paddw      $f4, $f28, $f30                           \n\t"
    "sw         $8, 0xc(%[psad8x8])                       \n\t"
    "dmfc1      $12, $f4                                  \n\t"
    PTR_ADDU   "$14, $14, $12                             \n\t"
    // This write to %[psad8x8] is why the operand must be read-write.
    PTR_ADDIU  "%[psad8x8],   %[psad8x8],   0x10          \n\t"

    // Tile sum: add the two halves and store one word.
    "paddw      $f24, $f24, $f26                          \n\t"
    "dmfc1      $8, $f24                                  \n\t"
    "sw         $8, 0x0(%[psum16x16])                     \n\t"
    PTR_ADDIU  "%[psum16x16], %[psum16x16], 0x4           \n\t"
    // Tile square sum: add the two registers, then fold the upper word onto
    // the lower word and store one word.
    "paddw      $f24, $f20, $f22                          \n\t"
    "punpcklwd  $f20, $f24, $f24                          \n\t"
    "punpckhwd  $f22, $f24, $f24                          \n\t"
    "paddw      $f20, $f20, $f22                          \n\t"
    "dmfc1      $8, $f20                                  \n\t"
    "sw         $8, 0x0(%[psqsum16x16])                   \n\t"
    PTR_ADDIU  "%[psqsum16x16], %[psqsum16x16], 0x4       \n\t"

    // Tile squared difference: same folding as the square sum.
    "paddw      $f20, $f16, $f18                          \n\t"
    "punpcklwd  $f16, $f20, $f20                          \n\t"
    "punpckhwd  $f18, $f20, $f20                          \n\t"
    "paddw      $f16, $f16, $f18                          \n\t"
    "dmfc1      $8, $f16                                  \n\t"
    "sw         $8, 0x0(%[psqdiff16x16])                  \n\t"
    PTR_ADDIU  "%[psqdiff16x16], %[psqdiff16x16], 0x4     \n\t"

    // Rewind the 16 rows just consumed and step 16 bytes to the right.
    PTR_SUBU   "$15, $15, $13                             \n\t"
    PTR_SUBU   "%[ref_data], %[ref_data], $13             \n\t"
    PTR_ADDIU  "$15, $15, 0x10                            \n\t"
    PTR_ADDIU  "%[ref_data], %[ref_data], 0x10            \n\t"

    PTR_ADDIU  "%[iPicWidth], %[iPicWidth], -0x1          \n\t"
    "bnez       %[iPicWidth], 2b                          \n\t"
    "nop                                                  \n\t"
    // Restore the tile count and row starts, then move down 16 rows.
    "move       %[iPicWidth], $9                          \n\t"
    "move       $15, $10                                  \n\t"
    "move       %[ref_data], $11                          \n\t"
    PTR_ADDU   "$15, $15, $13                             \n\t"
    PTR_ADDU   "%[ref_data], %[ref_data], $13             \n\t"

    PTR_ADDIU  "%[iPicHeight], %[iPicHeight], -0x1        \n\t"
    "bnez       %[iPicHeight], 1b                         \n\t"
    "nop                                                  \n\t"

    // Store the 32-bit picture total.
    "sw         $14, 0x0(%[psadframe])                    \n\t"
    : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
      [iPicHeight]"+&r"((int)iPicHeight), [psad8x8]"+&r"((int *)psad8x8),
      [psum16x16]"+&r"((int *)psum16x16), [psqsum16x16]"+&r"((int *)psqsum16x16),
      [psqdiff16x16]"+&r"((int *)psqdiff16x16)
    : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
      [psadframe]"r"((int *)psadframe)
    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
      "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}

/*!
 * \brief   Loongson MMI routine that walks a picture in 16x16 blocks and
 *          writes per-block statistics to the caller's output arrays.
 *
 * For every 16x16 block (processed as two halves of 8 rows each) the code
 * stores:
 *   - psad8x8      : 4 x 32-bit words (2 per half)
 *   - psum16x16    : 1 x 32-bit word  (first half stored, second half added)
 *   - psqsum16x16  : 1 x 32-bit word
 *   - psqdiff16x16 : 1 x 32-bit word
 *   - p_sd8x8      : advanced by 16 bytes (8 per half, written with PTR_S)
 *   - p_mad8x8     : 4 bytes (2 per half)
 * and finally one 32-bit total to *psadframe.
 *
 * \param cur_data      current picture, read through the working pointer $15
 * \param ref_data      reference picture, read and stepped in place
 * \param iPicWidth     picture width; shifted right by 4 to get the column count
 * \param iPicHeight    picture height; shifted right by 4 to get the row count
 * \param iPicStride    line stride in bytes of both pictures
 * \param psadframe     receives the accumulated total held in $14
 * \param psad8x8, psum16x16, psqsum16x16, psqdiff16x16, p_sd8x8, p_mad8x8
 *                      output arrays, advanced as described above
 *
 * NOTE(review): both loops decrement their counter before testing it, so
 * (iPicWidth >> 4) and (iPicHeight >> 4) must be non-zero on entry.
 * NOTE(review): the exact layout of the accumulators is defined by
 * WELS_SAD_BGD_SQDIFF_16x1_MMI and WELS_MAX_REG_MMI, which are outside this
 * block; comments about "low/high word" contents below describe only what
 * the visible stores read out of the registers.
 */
void VAACalcSadSsdBgd_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
                          int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
                          int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
                          int32_t *psqsum16x16, int32_t *psqdiff16x16, int32_t *p_sd8x8,
                          uint8_t *p_mad8x8) {
  // BACKUP_REG / RECOVER_REG are defined outside this block; presumably they
  // save and restore registers the asm below overwrites -- TODO confirm.
  BACKUP_REG;
  __asm__ volatile (
    ".set       arch=loongson3a                           \n\t"
    // $15 is the working pointer into the current picture (cur_data itself is
    // an input-only operand and is never modified).
    "move       $15, %[cur_data]                          \n\t"
    // Convert pixel dimensions into counts of 16-pixel columns / rows.
    "dsrl       %[iPicWidth], %[iPicWidth], 0x4           \n\t"
    "dsrl       %[iPicHeight], %[iPicHeight], 0x4         \n\t"
    // $13 = 16 * stride: byte distance between consecutive rows of blocks.
    "dsll       $13, %[iPicStride], 0x4                   \n\t"
    // $f0/$f2 are cleared once and never written again in this block;
    // presumably the row macro uses them as a zero constant -- TODO confirm.
    "xor        $f0, $f0, $f0                             \n\t"
    "xor        $f2, $f2, $f2                             \n\t"
    "xor        $12, $12, $12                             \n\t"
    // $14 accumulates the frame total that is stored to *psadframe at the end.
    "xor        $14, $14, $14                             \n\t"
    // ---- outer loop: one iteration per row of 16x16 blocks ----
    "1:                                                   \n\t"
    // Remember the column count and the row-start pointers so they can be
    // restored after the inner loop has consumed them.
    "move       $9, %[iPicWidth]                          \n\t"
    "move       $10, $15                                  \n\t"
    "move       $11, %[ref_data]                          \n\t"
    // ---- inner loop: one iteration per 16x16 block ----
    "2:                                                   \n\t"
    // Clear all per-block accumulators.
    "xor        $f28, $f28, $f28                          \n\t"
    "xor        $f30, $f30, $f30                          \n\t"
    "xor        $f24, $f24, $f24                          \n\t"
    "xor        $f26, $f26, $f26                          \n\t"
    "xor        $f20, $f20, $f20                          \n\t"
    "xor        $f22, $f22, $f22                          \n\t"
    "xor        $f16, $f16, $f16                          \n\t"
    "xor        $f18, $f18, $f18                          \n\t"
    // First half of the block: 8 invocations of the 16x1 row macro.
    // Presumably each invocation advances $15 and ref_data by one stride
    // (the rewind by $13 after 16 invocations relies on that) -- TODO confirm.
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])

    // psad8x8[0..1] = low 32 bits of $f28 and $f30 for the upper half.
    "dmfc1      $8, $f28                                  \n\t"
    "sw         $8, 0x0(%[psad8x8])                       \n\t"
    "dmfc1      $8, $f30                                  \n\t"
    "sw         $8, 0x4(%[psad8x8])                       \n\t"
    PTR_ADDIU  "%[psad8x8], %[psad8x8], 0x8               \n\t"

    // Add both values into the frame total. Only the low 32 bits of $14 are
    // stored at the end, so anything in the upper bits of $12 is irrelevant.
    "paddw      $f4, $f28, $f30                           \n\t"
    "dmfc1      $12, $f4                                  \n\t"
    PTR_ADDU   "$14, $14,  $12                            \n\t"

    // psum16x16[0] = low word of ($f24 + $f26); the pointer is not advanced
    // yet because the lower half is added to the same slot later.
    "paddw      $f4, $f24, $f26                           \n\t"
    "dmfc1      $8, $f4                                   \n\t"
    "sw         $8, 0x0(%[psum16x16])                     \n\t"

    // $f6 = {low words of $f24,$f26} - {high words of $f24,$f26}, written to
    // p_sd8x8. Presumably low/high words hold the cur/ref pixel sums, making
    // this the signed 8x8 difference -- TODO confirm against the row macro.
    // NOTE(review): PTR_S is a pointer-sized store; this writes both 32-bit
    // results only when PTR_S expands to a 64-bit store -- confirm for all
    // supported ABIs.
    "punpckhwd  $f4, $f24, $f26                           \n\t"
    "punpcklwd  $f6, $f24, $f26                           \n\t"
    "psubw      $f6, $f6, $f4                             \n\t"
    "dmfc1      $8, $f6                                   \n\t"
    PTR_S      "$8, 0x0(%[p_sd8x8])                       \n\t"
    PTR_ADDIU  "%[p_sd8x8], %[p_sd8x8], 0x8               \n\t"

    // WELS_MAX_REG_MMI expects its 3rd/4th arguments to hold 0x1 and 0x8
    // (see the comment above its definition); load them into $f8/$f10.
    "dli        $8, 0x1                                   \n\t"
    "dmtc1      $8, $f8                                   \n\t"
    "dli        $8, 0x8                                   \n\t"
    "dmtc1      $8, $f10                                  \n\t"
    WELS_MAX_REG_MMI($f20, $f22, $f8, $f10)

    // p_mad8x8[0..1] = low byte of $f20 and $f22 after the reduction.
    "dmfc1      $8, $f20                                  \n\t"
    "sb         $8, 0x0(%[p_mad8x8])                      \n\t"
    "dmfc1      $8, $f22                                  \n\t"
    "sb         $8, 0x1(%[p_mad8x8])                      \n\t"
    PTR_ADDIU  "%[p_mad8x8], %[p_mad8x8], 0x2             \n\t"

    // Prepare for the lower half: clear $f20..$f26, and rebuild $f28/$f30 so
    // that their low word is zero while their high word is carried over
    // (the high words are read out for psqsum16x16 after the second half).
    "xor        $f20, $f20, $f20                          \n\t"
    "xor        $f22, $f22, $f22                          \n\t"
    "punpckhwd  $f28, $f20, $f28                          \n\t"
    "xor        $f24, $f24, $f24                          \n\t"
    "xor        $f26, $f26, $f26                          \n\t"
    "punpckhwd  $f30, $f20, $f30                          \n\t"
    // Second half of the block: 8 more row invocations.
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])
    WELS_SAD_BGD_SQDIFF_16x1_MMI($f28, $f30, $f24, $f26, $f20, $f22, $f16,
                                 $f18, $15, %[ref_data], %[iPicStride])

    // psad8x8 words for the lower half.
    "dmfc1      $8, $f28                                  \n\t"
    "sw         $8, 0x0(%[psad8x8])                       \n\t"
    "dmfc1      $8, $f30                                  \n\t"
    "sw         $8, 0x4(%[psad8x8])                       \n\t"
    PTR_ADDIU  "%[psad8x8], %[psad8x8], 0x8               \n\t"

    // Add the lower-half values into the frame total.
    "paddw      $f4, $f28, $f30                           \n\t"
    "dmfc1      $12, $f4                                  \n\t"
    PTR_ADDU   "$14, $14, $12                             \n\t"

    // psum16x16[0] += lower-half sum, then move to the next block's slot.
    "paddw      $f4, $f24, $f26                           \n\t"
    "dmfc1      $8, $f4                                   \n\t"
    "lw         $12, 0x0(%[psum16x16])                    \n\t"
    PTR_ADDU   "$8, $8, $12                               \n\t"
    "sw         $8, 0x0(%[psum16x16])                     \n\t"
    "xor        $f8, $f8, $f8                             \n\t"
    PTR_ADDIU  "%[psum16x16], %[psum16x16], 0x4           \n\t"

    // Move the high words of $f30/$f28 down to the low word (pairing with the
    // zero in $f8), add them, and store the result as psqsum16x16[0].
    "punpckhwd  $f30, $f30, $f8                           \n\t"
    "punpckhwd  $f28, $f28, $f8                           \n\t"
    "paddw      $f8, $f28, $f30                           \n\t"
    "dmfc1      $8, $f8                                   \n\t"
    "sw         $8, 0x0(%[psqsum16x16])                   \n\t"
    PTR_ADDIU  "%[psqsum16x16], %[psqsum16x16], 0x4       \n\t"

    // p_sd8x8 for the lower half (same computation as for the upper half).
    "punpckhwd  $f4, $f24, $f26                           \n\t"
    "punpcklwd  $f6, $f24, $f26                           \n\t"
    "psubw      $f6, $f6, $f4                             \n\t"
    "dmfc1      $8, $f6                                   \n\t"
    PTR_S      "$8, 0x0(%[p_sd8x8])                       \n\t"
    PTR_ADDIU  "%[p_sd8x8], %[p_sd8x8], 0x8               \n\t"

    // Reload the 0x1 / 0x8 constants ($f8 was reused above) and reduce again.
    "dli        $8, 0x1                                   \n\t"
    "dmtc1      $8, $f8                                   \n\t"
    "dli        $8, 0x8                                   \n\t"
    "dmtc1      $8, $f10                                  \n\t"
    WELS_MAX_REG_MMI($f20, $f22, $f8, $f10)

    // p_mad8x8 bytes for the lower half.
    "dmfc1      $8, $f20                                  \n\t"
    "sb         $8, 0x0(%[p_mad8x8])                      \n\t"
    "dmfc1      $8, $f22                                  \n\t"
    "sb         $8, 0x1(%[p_mad8x8])                      \n\t"
    PTR_ADDIU  "%[p_mad8x8], %[p_mad8x8], 0x2             \n\t"

    // psqdiff16x16[0] = sum of all four 32-bit words held in $f16 and $f18
    // ($f16/$f18 are never cleared between the halves, so this covers the
    // whole 16x16 block).
    "paddw      $f20, $f16, $f18                          \n\t"
	  "punpcklwd  $f16, $f20, $f20                          \n\t"
	  "punpckhwd  $f18, $f20, $f20                          \n\t"
    "paddw      $f16, $f16, $f18                          \n\t"
    "dmfc1      $8, $f16                                  \n\t"
    "sw         $8, 0x0(%[psqdiff16x16])                  \n\t"
    PTR_ADDIU  "%[psqdiff16x16], %[psqdiff16x16], 0x4     \n\t"

    // Step to the next block on the right: go back up 16 rows, then 16 bytes
    // to the right, for both pictures.
    PTR_SUBU   "$15, $15, $13                             \n\t"
    PTR_SUBU   "%[ref_data], %[ref_data], $13             \n\t"
    PTR_ADDIU  "$15, $15, 0x10                            \n\t"
    PTR_ADDIU  "%[ref_data], %[ref_data], 0x10            \n\t"

    PTR_ADDIU  "%[iPicWidth], %[iPicWidth], -0x1          \n\t"
    "bnez       %[iPicWidth], 2b                          \n\t"
    "nop                                                  \n\t"
    // End of a block row: restore the column count and the row-start
    // pointers, then move both pointers down by 16 rows.
    "move       %[iPicWidth], $9                          \n\t"
    "move       $15, $10                                  \n\t"
    "move       %[ref_data], $11                          \n\t"
    PTR_ADDU   "$15, $15, $13                             \n\t"
    PTR_ADDU   "%[ref_data], %[ref_data], $13             \n\t"

    PTR_ADDIU  "%[iPicHeight], %[iPicHeight], -0x1        \n\t"
    "bnez       %[iPicHeight], 1b                         \n\t"
    "nop                                                  \n\t"

    // Publish the frame total (low 32 bits of $14).
    "sw         $14, 0x0(%[psadframe])                    \n\t"
    // Read-write operands: every pointer/counter the asm modifies in place.
    : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
      [iPicHeight]"+&r"((int)iPicHeight), [psad8x8]"+&r"((int *)psad8x8),
      [psum16x16]"+&r"((int *)psum16x16), [psqsum16x16]"+&r"((int *)psqsum16x16),
	    [psqdiff16x16]"+&r"((int *)psqdiff16x16), [p_sd8x8]"+&r"((int *)p_sd8x8),
      [p_mad8x8]"+&r"((unsigned char *)p_mad8x8)
    // Input-only operands: never written by the asm.
    : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
      [psadframe]"r"((int *)psadframe)
    // Scratch GPRs $8-$15 and the even FP registers used above or by the macros.
    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
      "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}

/*!
 * \brief   Loongson MMI routine that walks a picture in 16x16 blocks and
 *          writes, per block, four 32-bit words to psad8x8 and one 32-bit
 *          word each to psum16x16 and psqsum16x16, plus one 32-bit frame
 *          total to *psadframe.
 *
 * \param cur_data      current picture, read through the working pointer $15
 * \param ref_data      reference picture, read and stepped in place
 * \param iPicWidth     picture width; shifted right by 4 to get the column count
 * \param iPicHeight    picture height; shifted right by 4 to get the row count
 * \param iPicStride    line stride in bytes of both pictures
 * \param psadframe     receives the low word of the accumulated $f28 + $f30
 * \param psad8x8       output, 4 words per 16x16 block (advanced by 16 bytes)
 * \param psum16x16     output, 1 word per 16x16 block
 * \param psqsum16x16   output, 1 word per 16x16 block
 *
 * NOTE(review): both loops decrement their counter before testing it, so
 * (iPicWidth >> 4) and (iPicHeight >> 4) must be non-zero on entry.
 * NOTE(review): WELS_SAD_SUM_SQSUM_16x1_MMI is defined outside this block and
 * takes no accumulator arguments; it presumably accumulates into the
 * registers that are cleared before it and read after it ($f16..$f26) and
 * advances both picture pointers by one stride per invocation -- TODO confirm.
 */
void VAACalcSadVar_mmi(const uint8_t *cur_data, const uint8_t *ref_data,
                       int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
                       int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
                       int32_t *psqsum16x16) {
  // BACKUP_REG / RECOVER_REG are defined outside this block; presumably they
  // save and restore registers the asm below overwrites -- TODO confirm.
  BACKUP_REG;
  __asm__ volatile (
    ".set       arch=loongson3a                           \n\t"
    // $15 is the working pointer into the current picture.
    "move       $15, %[cur_data]                          \n\t"
    // Convert pixel dimensions into counts of 16-pixel columns / rows.
    "dsrl       %[iPicWidth], %[iPicWidth], 0x4           \n\t"
    "dsrl       %[iPicHeight], %[iPicHeight], 0x4         \n\t"
    // $13 = 16 * stride: byte distance between consecutive rows of blocks.
    "dsll       $13, %[iPicStride], 0x4                   \n\t"
    "xor        $f0, $f0, $f0                             \n\t"
    "xor        $f2, $f2, $f2                             \n\t"
    // $f28/$f30 accumulate the frame total across all blocks.
    "xor        $f28, $f28, $f28                          \n\t"
    "xor        $f30, $f30, $f30                          \n\t"
    "xor        $14, $14, $14                             \n\t"
    // ---- outer loop: one iteration per row of 16x16 blocks ----
    "1:                                                   \n\t"
    // Save the column count and row-start pointers for restoring later.
    "move       $9, %[iPicWidth]                          \n\t"
    "move       $10, $15                                  \n\t"
    "move       $11, %[ref_data]                          \n\t"
    // ---- inner loop: one iteration per 16x16 block ----
    "2:                                                   \n\t"
    // Clear the per-block accumulators.
    "xor        $f24, $f24, $f24                          \n\t"
    "xor        $f26, $f26, $f26                          \n\t"
    "xor        $f20, $f20, $f20                          \n\t"
    "xor        $f22, $f22, $f22                          \n\t"
    "xor        $f16, $f16, $f16                          \n\t"
    "xor        $f18, $f18, $f18                          \n\t"
    // Upper half of the block: 8 rows of 16 pixels.
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    // Fold the upper-half values into the frame total, then store them as
    // psad8x8[0..1].
    "paddw      $f28, $f24, $f28                          \n\t"
    "paddw      $f30, $f26, $f30                          \n\t"
    "dmfc1      $8, $f24                                  \n\t"
    "sw         $8, 0x0(%[psad8x8])                       \n\t"
    "dmfc1      $8, $f26                                  \n\t"
    "sw         $8, 0x4(%[psad8x8])                       \n\t"

    // Lower half: only $f24/$f26 restart from zero; $f16..$f22 keep
    // accumulating over the whole 16x16 block.
    "xor        $f24, $f24, $f24                          \n\t"
    "xor        $f26, $f26, $f26                          \n\t"
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    WELS_SAD_SUM_SQSUM_16x1_MMI($15, %[ref_data], %[iPicStride])
    // Fold the lower-half values into the frame total, store them as
    // psad8x8[2..3], and advance to the next block's four slots.
    "paddw      $f28, $f24, $f28                          \n\t"
    "paddw      $f30, $f26, $f30                          \n\t"
    "dmfc1      $8, $f24                                  \n\t"
    "sw         $8, 0x8(%[psad8x8])                       \n\t"
    "dmfc1      $8, $f26                                  \n\t"
    "sw         $8, 0xc(%[psad8x8])                       \n\t"
    PTR_ADDIU  "%[psad8x8],   %[psad8x8],   0x10          \n\t"

    // psum16x16[0] = low word of ($f20 + $f22).
    "paddw      $f20, $f20, $f22                          \n\t"
    "dmfc1      $8, $f20                                  \n\t"
    "sw         $8, 0x0(%[psum16x16])                     \n\t"
    PTR_ADDIU  "%[psum16x16], %[psum16x16], 0x4           \n\t"

    // psqsum16x16[0] = sum of all four 32-bit words held in $f16 and $f18.
    "paddw      $f20, $f16, $f18                          \n\t"
    "punpcklwd  $f16, $f20, $f20                          \n\t"
    "punpckhwd  $f18, $f20, $f20                          \n\t"
    "paddw      $f16, $f16, $f18                          \n\t"
    "dmfc1      $8, $f16                                  \n\t"
    "sw         $8, 0x0(%[psqsum16x16])                   \n\t"
    PTR_ADDIU  "%[psqsum16x16], %[psqsum16x16], 0x4       \n\t"

    // Step to the next block on the right: back up 16 rows, 16 bytes right.
    PTR_SUBU   "$15, $15, $13                             \n\t"
    PTR_SUBU   "%[ref_data], %[ref_data], $13             \n\t"
    PTR_ADDIU  "$15, $15, 0x10                            \n\t"
    PTR_ADDIU  "%[ref_data], %[ref_data], 0x10            \n\t"

    PTR_ADDIU  "%[iPicWidth], %[iPicWidth], -0x1          \n\t"
    "bnez       %[iPicWidth], 2b                          \n\t"
    "nop                                                  \n\t"
    // End of a block row: restore the column count and row-start pointers,
    // then move both pointers down by 16 rows.
    "move       %[iPicWidth], $9                          \n\t"
    "move       $15, $10                                  \n\t"
    "move       %[ref_data], $11                          \n\t"
    PTR_ADDU   "$15, $15, $13                             \n\t"
    PTR_ADDU   "%[ref_data], %[ref_data], $13             \n\t"

    PTR_ADDIU  "%[iPicHeight], %[iPicHeight], -0x1        \n\t"
    "bnez       %[iPicHeight], 1b                         \n\t"
    "nop                                                  \n\t"

    // Publish the frame total: low word of ($f28 + $f30).
    "paddw      $f28, $f28, $f30                          \n\t"
    "dmfc1      $8, $f28                                  \n\t"
    "sw         $8, 0x0(%[psadframe])                     \n\t"
    // Read-write operands: every pointer/counter the asm modifies in place.
    // psad8x8 belongs here because it is advanced with PTR_ADDIU above; an
    // input-only "r" operand must not be modified by the asm.
    : [ref_data]"+&r"((unsigned char *)ref_data), [iPicWidth]"+&r"((int)iPicWidth),
      [iPicHeight]"+&r"((int)iPicHeight), [psad8x8]"+&r"((int *)psad8x8),
      [psum16x16]"+&r"((int *)psum16x16), [psqsum16x16]"+&r"((int *)psqsum16x16)
    // Input-only operands: never written by the asm.
    : [cur_data]"r"((unsigned char *)cur_data), [iPicStride]"r"((int)iPicStride),
      [psadframe]"r"((int *)psadframe)
    // Scratch GPRs $8-$15 and the even FP registers used above or by the macro.
    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", "$f2",
      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
      "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}