shithub: openh264

ref: 7a1edbafd3c96d21696bb655bd657da6efbca450
dir: /codec/encoder/core/mips/quant_mmi.c/

View raw version
/*!
 * \copy
 *     Copyright (c)  2009-2018, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * \file    quant_mmi.c
 *
 * \brief   Loongson optimization
 *
 * \date    20/07/2018 Created
 *
 *************************************************************************************
 */
#include <stdint.h>
#include "asmdefs_mmi.h"

/*!
 * \brief   In-place quantization of one block of coefficients (Loongson MMI).
 *
 * Two identical passes handle the 16 bytes at pDct + 0x0 and pDct + 0x10,
 * i.e. 32 bytes (16 int16_t values) in total. The 16 bytes loaded from ff
 * and from mf are reused unchanged for both passes.
 *
 * Per 16-bit lane each pass reads as: |coef| -> saturating unsigned add of
 * ff -> high half of the unsigned product with mf -> original sign restored.
 * (Instruction semantics are taken from the MMI mnemonics, which mirror MMX;
 * confirm against the Loongson ISA manual.)
 *
 * \param   pDct  coefficients, read and overwritten in place
 * \param   ff    per-lane rounding offsets (16 bytes read)
 * \param   mf    per-lane multipliers (16 bytes read)
 *
 * NOTE(review): the quad-word gslqc1/gssqc1 accesses presumably require
 * 16-byte aligned pointers -- confirm against the callers.
 */
void WelsQuant4x4_mmi(int16_t *pDct, const int16_t* ff, const int16_t *mf) {
  __asm__ volatile (
    ".set       arch=loongson3a                 \n\t"
    /* NOTE(review): $f10 is overwritten by the load on the next line, so
     * this clear looks redundant. */
    "xor        $f10, $f10, $f10                \n\t"
    /* $f8/$f10 <- ff, $f12/$f14 <- mf; kept for both passes */
    "gslqc1     $f10, $f8, 0x0(%[ff])           \n\t"
    "gslqc1     $f14, $f12, 0x0(%[mf])          \n\t"

    /* ---- pass 1: coefficients at pDct + 0x0 ---- */
    "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t"
    /* sign masks: $f4/$f6 = (0 > coef) per lane */
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    /* (x ^ mask) - mask yields |x| */
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    /* add ff, then multiply by mf */
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f10                  \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f14                  \n\t"
    /* same xor/sub with the saved mask puts the sign back */
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x0(%[pDct])          \n\t"

    /* ---- pass 2: coefficients at pDct + 0x10, same sequence ---- */
    "gslqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f10                  \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f14                  \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
   :
   : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf)
   /* "memory" is listed because pDct is written through a plain "r" operand */
   : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
  );
}

/*!
 * \brief   In-place quantization of one block using a single scalar ff and
 *          a single scalar mf for every coefficient (Loongson MMI).
 *
 * The scalars are moved into $f8 (ff) and $f12 (mf) and run through pshufh
 * with a zeroed selector register, which presumably replicates the low
 * halfword into all four lanes -- confirm against the Loongson ISA manual.
 * Unlike the vector-ff/mf variants, both 8-byte halves of each load use the
 * same $f8 / $f12 registers.
 *
 * Two passes handle pDct + 0x0 and pDct + 0x10 (32 bytes in total). Each
 * pass reads as: |coef| -> saturating unsigned add of ff -> high half of the
 * unsigned product with mf -> original sign restored.
 *
 * \param   pDct  coefficients, read and overwritten in place
 * \param   ff    rounding offset applied to every lane
 * \param   mf    multiplier applied to every lane
 *
 * NOTE(review): the quad-word gslqc1/gssqc1 accesses presumably require a
 * 16-byte aligned pDct -- confirm against the callers.
 */
void WelsQuant4x4Dc_mmi(int16_t *pDct, const int16_t ff, int16_t mf) {
  __asm__ volatile (
    ".set       arch=loongson3a                 \n\t"
    /* $f10 = 0, used as the pshufh selector for both broadcasts */
    "xor        $f10, $f10, $f10                \n\t"
    /* $f12 <- mf in every lane */
    "dmtc1      %[mf], $f12                     \n\t"
    "pshufh     $f12, $f12, $f10                \n\t"

    /* $f8 <- ff in every lane */
    "dmtc1      %[ff], $f8                      \n\t"
    "pshufh     $f8, $f8, $f10                  \n\t"

    /* ---- pass 1: coefficients at pDct + 0x0 ---- */
    "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t"
    /* sign masks: $f4/$f6 = (0 > coef) per lane */
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    /* (x ^ mask) - mask yields |x| */
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    /* add ff, then multiply by mf */
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f8                   \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f12                  \n\t"
    /* same xor/sub with the saved mask puts the sign back */
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x0(%[pDct])          \n\t"

    /* ---- pass 2: coefficients at pDct + 0x10, same sequence ---- */
    "gslqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f8                   \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f12                  \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
   :
   : [pDct]"r"((short *)pDct), [ff]"r"((short)ff), [mf]"r"((short)mf)
   /* "memory" is listed because pDct is written through a plain "r" operand */
   : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12"
  );
}

/*!
 * \brief   In-place quantization of four consecutive blocks (Loongson MMI).
 *
 * Eight fully unrolled passes handle the 16-byte groups at pDct + 0x0
 * through pDct + 0x70, i.e. 128 bytes (64 int16_t values) in total. The 16
 * bytes loaded once from ff and from mf are reused for every pass.
 *
 * Per 16-bit lane each pass reads as: |coef| -> saturating unsigned add of
 * ff -> high half of the unsigned product with mf -> original sign restored.
 * (Instruction semantics are taken from the MMI mnemonics, which mirror MMX;
 * confirm against the Loongson ISA manual.)
 *
 * \param   pDct  coefficients, read and overwritten in place
 * \param   ff    per-lane rounding offsets (16 bytes read)
 * \param   mf    per-lane multipliers (16 bytes read)
 *
 * NOTE(review): the quad-word gslqc1/gssqc1 accesses presumably require
 * 16-byte aligned pointers -- confirm against the callers.
 */
void WelsQuantFour4x4_mmi(int16_t *pDct, const int16_t* ff, const int16_t *mf) {
  __asm__ volatile (
    ".set       arch=loongson3a                 \n\t"
    /* $f8/$f10 <- ff, $f12/$f14 <- mf; kept for all eight passes */
    "gslqc1     $f10, $f8, 0x0(%[ff])           \n\t"
    "gslqc1     $f14, $f12, 0x0(%[mf])          \n\t"

    /* ---- pass 1: pDct + 0x0 ---- */
    "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t"
    /* sign masks: $f4/$f6 = (0 > coef) per lane */
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    /* (x ^ mask) - mask yields |x| */
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    /* add ff, then multiply by mf */
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f10                  \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f14                  \n\t"
    /* same xor/sub with the saved mask puts the sign back */
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x0(%[pDct])          \n\t"

    /* ---- pass 2: pDct + 0x10 (same sequence from here on) ---- */
    "gslqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f10                  \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f14                  \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x10(%[pDct])         \n\t"

    /* ---- pass 3: pDct + 0x20 ---- */
    "gslqc1     $f2, $f0, 0x20(%[pDct])         \n\t"
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f10                  \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f14                  \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x20(%[pDct])         \n\t"

    /* ---- pass 4: pDct + 0x30 ---- */
    "gslqc1     $f2, $f0, 0x30(%[pDct])         \n\t"
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f10                  \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f14                  \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x30(%[pDct])         \n\t"

    /* ---- pass 5: pDct + 0x40 ---- */
    "gslqc1     $f2, $f0, 0x40(%[pDct])         \n\t"
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f10                  \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f14                  \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x40(%[pDct])         \n\t"

    /* ---- pass 6: pDct + 0x50 ---- */
    "gslqc1     $f2, $f0, 0x50(%[pDct])         \n\t"
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f10                  \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f14                  \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x50(%[pDct])         \n\t"

    /* ---- pass 7: pDct + 0x60 ---- */
    "gslqc1     $f2, $f0, 0x60(%[pDct])         \n\t"
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f10                  \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f14                  \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x60(%[pDct])         \n\t"

    /* ---- pass 8: pDct + 0x70 ---- */
    "gslqc1     $f2, $f0, 0x70(%[pDct])         \n\t"
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f10                  \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f14                  \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x70(%[pDct])         \n\t"
   :
   : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf)
   /* "memory" is listed because pDct is written through a plain "r" operand */
   : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
  );
}

/*!
 * \brief   In-place quantization of four consecutive blocks that also
 *          reports the largest quantized magnitude of each block
 *          (Loongson MMI).
 *
 * Eight unrolled passes handle the 16-byte groups at pDct + 0x0 through
 * pDct + 0x70 (128 bytes in total). Each pass reads as: |coef| -> saturating
 * unsigned add of ff -> high half of the unsigned product with mf ->
 * original sign restored. The running maximum is taken with pmaxsh right
 * after the multiply, i.e. on the magnitude before the sign is put back.
 *
 * Every pair of passes (one 32-byte block) feeds its own accumulator pair:
 *   0x00/0x10 -> $f16/$f18      0x20/0x30 -> $f20/$f22
 *   0x40/0x50 -> $f24/$f26      0x60/0x70 -> $f28/$f30
 * The tail interleaves these accumulators so that the lanes belonging to one
 * block line up, reduces them with pmaxsh, and stores 8 bytes from $f0 to
 * max. By the mnemonic semantics lane k of that result should be the maximum
 * of block k -- confirm against the Loongson ISA manual / the C reference.
 *
 * \param   pDct  coefficients, read and overwritten in place
 * \param   ff    per-lane rounding offsets (16 bytes read)
 * \param   mf    per-lane multipliers (16 bytes read)
 * \param   max   output, 8 bytes written
 *
 * NOTE(review): BACKUP_REG / RECOVER_REG are defined outside this block;
 * presumably they save and restore FP registers the ABI treats as
 * callee-saved -- confirm in asmdefs_mmi.h.
 * NOTE(review): the quad-word gslqc1/gssqc1 accesses presumably require
 * 16-byte aligned pDct/ff/mf; max is written with the gssdlc1/gssdrc1 pair,
 * which looks like the unaligned-store idiom -- confirm.
 */
void WelsQuantFour4x4Max_mmi(int16_t *pDct, const int16_t*ff,
                             const int16_t *mf, int16_t *max) {
  BACKUP_REG;
  __asm__ volatile (
    ".set       arch=loongson3a                 \n\t"
    /* $f8/$f10 <- ff, $f12/$f14 <- mf; kept for all eight passes */
    "gslqc1     $f10, $f8, 0x0(%[ff])           \n\t"
    "gslqc1     $f14, $f12, 0x0(%[mf])          \n\t"

    /* clear the four per-block max accumulator pairs */
    "xor        $f16, $f16, $f16                \n\t"
    "xor        $f18, $f18, $f18                \n\t"
    "xor        $f20, $f20, $f20                \n\t"
    "xor        $f22, $f22, $f22                \n\t"
    "xor        $f24, $f24, $f24                \n\t"
    "xor        $f26, $f26, $f26                \n\t"
    "xor        $f28, $f28, $f28                \n\t"
    "xor        $f30, $f30, $f30                \n\t"

    /* ---- block 0, pass 1: pDct + 0x0, max -> $f16/$f18 ---- */
    "gslqc1     $f2, $f0, 0x0(%[pDct])          \n\t"
    /* sign masks: $f4/$f6 = (0 > coef) per lane */
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    /* (x ^ mask) - mask yields |x| */
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    /* add ff, then multiply by mf */
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f10                  \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f14                  \n\t"
    /* track the max while the values are still unsigned magnitudes */
    "pmaxsh     $f16, $f16, $f0                 \n\t"
    "pmaxsh     $f18, $f18, $f2                 \n\t"
    /* same xor/sub with the saved mask puts the sign back */
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x0(%[pDct])          \n\t"

    /* ---- block 0, pass 2: pDct + 0x10, max -> $f16/$f18 ---- */
    "gslqc1     $f2, $f0, 0x10(%[pDct])         \n\t"
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f10                  \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f14                  \n\t"
    "pmaxsh     $f16, $f16, $f0                 \n\t"
    "pmaxsh     $f18, $f18, $f2                 \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x10(%[pDct])         \n\t"

    /* ---- block 1, pass 1: pDct + 0x20, max -> $f20/$f22 ---- */
    "gslqc1     $f2, $f0, 0x20(%[pDct])         \n\t"
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f10                  \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f14                  \n\t"
    "pmaxsh     $f20, $f20, $f0                 \n\t"
    "pmaxsh     $f22, $f22, $f2                 \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x20(%[pDct])         \n\t"

    /* ---- block 1, pass 2: pDct + 0x30, max -> $f20/$f22 ---- */
    "gslqc1     $f2, $f0, 0x30(%[pDct])         \n\t"
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f10                  \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f14                  \n\t"
    "pmaxsh     $f20, $f20, $f0                 \n\t"
    "pmaxsh     $f22, $f22, $f2                 \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x30(%[pDct])         \n\t"

    /* ---- block 2, pass 1: pDct + 0x40, max -> $f24/$f26 ---- */
    "gslqc1     $f2, $f0, 0x40(%[pDct])         \n\t"
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f10                  \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f14                  \n\t"
    "pmaxsh     $f24, $f24, $f0                 \n\t"
    "pmaxsh     $f26, $f26, $f2                 \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x40(%[pDct])         \n\t"

    /* ---- block 2, pass 2: pDct + 0x50, max -> $f24/$f26 ---- */
    "gslqc1     $f2, $f0, 0x50(%[pDct])         \n\t"
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f10                  \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f14                  \n\t"
    "pmaxsh     $f24, $f24, $f0                 \n\t"
    "pmaxsh     $f26, $f26, $f2                 \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x50(%[pDct])         \n\t"

    /* ---- block 3, pass 1: pDct + 0x60, max -> $f28/$f30 ---- */
    "gslqc1     $f2, $f0, 0x60(%[pDct])         \n\t"
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f10                  \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f14                  \n\t"
    "pmaxsh     $f28, $f28, $f0                 \n\t"
    "pmaxsh     $f30, $f30, $f2                 \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x60(%[pDct])         \n\t"

    /* ---- block 3, pass 2: pDct + 0x70, max -> $f28/$f30 ---- */
    "gslqc1     $f2, $f0, 0x70(%[pDct])         \n\t"
    "xor        $f4, $f4, $f4                   \n\t"
    "xor        $f6, $f6, $f6                   \n\t"
    "pcmpgth    $f4, $f4, $f0                   \n\t"
    "pcmpgth    $f6, $f6, $f2                   \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "paddush    $f0, $f0, $f8                   \n\t"
    "paddush    $f2, $f2, $f10                  \n\t"
    "pmulhuh    $f0, $f0, $f12                  \n\t"
    "pmulhuh    $f2, $f2, $f14                  \n\t"
    "pmaxsh     $f28, $f28, $f0                 \n\t"
    "pmaxsh     $f30, $f30, $f2                 \n\t"
    "xor        $f0, $f0, $f4                   \n\t"
    "xor        $f2, $f2, $f6                   \n\t"
    "psubh      $f0, $f0, $f4                   \n\t"
    "psubh      $f2, $f2, $f6                   \n\t"
    "gssqc1     $f2, $f0, 0x70(%[pDct])         \n\t"

    /* ---- reduction tail ----
     * Step 1: halfword interleave of block 0 with block 1. $f0 keeps a copy
     * of the original $f18 because $f18 is overwritten first. */
    "mov.d      $f0, $f18                       \n\t"
    "punpckhhw  $f18, $f16, $f20                \n\t"
    "punpcklhw  $f16, $f16, $f20                \n\t"
    "punpckhhw  $f2, $f0, $f22                  \n\t"
    "punpcklhw  $f0, $f0, $f22                  \n\t"

    /* Step 2: halfword interleave of block 2 with block 3 ($f20 holds the
     * saved copy of the original $f26). */
    "mov.d      $f20, $f26                      \n\t"
    "punpckhhw  $f26, $f24, $f28                \n\t"
    "punpcklhw  $f24, $f24, $f28                \n\t"
    "punpckhhw  $f22, $f20, $f30                \n\t"
    "punpcklhw  $f20, $f20, $f30                \n\t"

    /* Step 3: word interleave of the step-1 and step-2 results, so each
     * register holds one lane position from all four blocks. */
    "mov.d      $f28, $f18                      \n\t"
    "punpckhwd  $f18, $f16, $f24                \n\t"
    "punpcklwd  $f16, $f16, $f24                \n\t"
    "punpckhwd  $f30, $f28, $f26                \n\t"
    "punpcklwd  $f28, $f28, $f26                \n\t"

    "mov.d      $f24, $f2                       \n\t"
    "punpckhwd  $f2, $f0, $f20                  \n\t"
    "punpcklwd  $f0, $f0, $f20                  \n\t"
    "punpckhwd  $f26, $f24, $f22                \n\t"
    "punpcklwd  $f24, $f24, $f22                \n\t"

    /* Step 4: shuffle registers into the ($f0,$f2), ($f16,$f18),
     * ($f20,$f22), ($f28,$f30) pairs consumed by the max chain below. */
    "mov.d      $f20, $f18                      \n\t"
    "mov.d      $f18, $f0                       \n\t"
    "mov.d      $f22, $f2                       \n\t"

    "mov.d      $f0, $f30                       \n\t"
    "mov.d      $f30, $f24                      \n\t"
    "mov.d      $f2, $f26                       \n\t"

    /* Step 5: lane-wise max across the four register pairs */
    "pmaxsh     $f0, $f0, $f16                  \n\t"
    "pmaxsh     $f2, $f2, $f18                  \n\t"

    "pmaxsh     $f0, $f0, $f20                  \n\t"
    "pmaxsh     $f2, $f2, $f22                  \n\t"

    "pmaxsh     $f0, $f0, $f28                  \n\t"
    "pmaxsh     $f2, $f2, $f30                  \n\t"

    /* Step 6: fold $f2 into $f0 ($f0 = max(old $f0, old $f2)). The $f2
     * result computed here is not stored. */
    "mov.d      $f4, $f0                        \n\t"
    "mov.d      $f6, $f2                        \n\t"

    "mov.d      $f0, $f2                        \n\t"
    "mov.d      $f2, $f6                        \n\t"

    "pmaxsh     $f0, $f0, $f4                   \n\t"
    "pmaxsh     $f2, $f2, $f6                   \n\t"

    /* write the 8-byte result in $f0 to max[] */
    "gssdlc1    $f0, 0x7(%[max])                \n\t"
    "gssdrc1    $f0, 0x0(%[max])                \n\t"
   :
   : [pDct]"r"((short *)pDct), [ff]"r"((short *)ff), [mf]"r"((short *)mf),
     [max]"r"((short *)max)
   /* "memory" is listed because pDct and max are written through plain "r"
    * operands */
   : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14",
     "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}