shithub: libvpx

ref: d4453c73ff9467d5c7cd4ce8b6070dbfa24eff37
dir: /vp8/encoder/mips/mmi/dct_mmi.c/

View raw version
/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp8_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/asmdefs_mmi.h"

/* clang-format off */
/* TRANSPOSE_4H: transpose a 4x4 matrix of 16-bit halfwords held in four
   64-bit registers, one matrix row per register.
   Input:  ftmp1, ftmp2, ftmp3, ftmp4
   Output: ftmp1, ftmp2, ftmp3, ftmp4 (overwritten with the transpose)
   Requires: ftmp0 must hold zero on entry; it is only read here.
   Clobbers: ftmp5~ftmp9 (temporaries), ftmp10 (loaded with the pshufh
   selector 0x93) and the general-purpose operand tmp0.
   The expansion is a sequence of asm string literals, so it may only be
   used inside an __asm__ template that defines all of these operands.
 */
#define TRANSPOSE_4H                                         \
  MMI_LI(%[tmp0], 0x93)                                      \
  "mtc1       %[tmp0],    %[ftmp10]                    \n\t" \
  "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp0]         \n\t" \
  "punpcklhw  %[ftmp9],   %[ftmp2],   %[ftmp0]         \n\t" \
  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
  "por        %[ftmp5],   %[ftmp5],   %[ftmp9]         \n\t" \
  "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp0]         \n\t" \
  "punpckhhw  %[ftmp9],   %[ftmp2],   %[ftmp0]         \n\t" \
  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
  "por        %[ftmp6],   %[ftmp6],   %[ftmp9]         \n\t" \
  "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp0]         \n\t" \
  "punpcklhw  %[ftmp9],   %[ftmp4],   %[ftmp0]         \n\t" \
  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
  "por        %[ftmp7],   %[ftmp7],   %[ftmp9]         \n\t" \
  "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp0]         \n\t" \
  "punpckhhw  %[ftmp9],   %[ftmp4],   %[ftmp0]         \n\t" \
  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
  "por        %[ftmp8],   %[ftmp8],   %[ftmp9]         \n\t" \
  "punpcklwd  %[ftmp1],   %[ftmp5],   %[ftmp7]         \n\t" \
  "punpckhwd  %[ftmp2],   %[ftmp5],   %[ftmp7]         \n\t" \
  "punpcklwd  %[ftmp3],   %[ftmp6],   %[ftmp8]         \n\t" \
  "punpckhwd  %[ftmp4],   %[ftmp6],   %[ftmp8]         \n\t"
/* clang-format on */

/* Forward 4x4 DCT of a block of 16-bit samples using Loongson MMI.
 *
 * input  - first sample of the block; four rows of four int16_t are read,
 *          with unaligned 64-bit loads.
 * output - receives 16 int16_t coefficients (32 contiguous bytes), written
 *          with unaligned 64-bit stores, one row of four per store pair.
 * pitch  - distance between consecutive input rows, in bytes (it is added
 *          directly to the row address).
 *
 * The transform runs as two passes separated by TRANSPOSE_4H:
 *   pass 1 (per row):
 *     a1 = (ip[0] + ip[3]) * 8      b1 = (ip[1] + ip[2]) * 8
 *     c1 = (ip[1] - ip[2]) * 8      d1 = (ip[0] - ip[3]) * 8
 *     op[0] = a1 + b1               op[2] = a1 - b1
 *     op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12
 *     op[3] = (d1 * 2217 - c1 * 5352 +  7500) >> 12
 *   pass 2 (per column, rows r0..r3 of the pass-1 result):
 *     a1 = r0 + r3   b1 = r1 + r2   c1 = r1 - r2   d1 = r0 - r3
 *     out row 0 = (a1 + b1 + 7) >> 4
 *     out row 2 = (a1 - b1 + 7) >> 4
 *     out row 1 = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0)
 *     out row 3 =  (d1 * 2217 - c1 * 5352 + 51000) >> 16
 */
void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
  uint64_t tmp[1];
  int16_t *ip = input;
  /* Packed multiplier pairs for pmaddhw, filled in by the asm below. */
  double ff_ph_op1, ff_ph_op3;

  /* The vector temporaries are pinned to fixed FP registers; under O32 only
   * the even-numbered registers are used. */
#if _MIPS_SIM == _ABIO32
  register double ftmp0 asm("$f0");
  register double ftmp1 asm("$f2");
  register double ftmp2 asm("$f4");
  register double ftmp3 asm("$f6");
  register double ftmp4 asm("$f8");
  register double ftmp5 asm("$f10");
  register double ftmp6 asm("$f12");
  register double ftmp7 asm("$f14");
  register double ftmp8 asm("$f16");
  register double ftmp9 asm("$f18");
  register double ftmp10 asm("$f20");
  register double ftmp11 asm("$f22");
  register double ftmp12 asm("$f24");
#else
  register double ftmp0 asm("$f0");
  register double ftmp1 asm("$f1");
  register double ftmp2 asm("$f2");
  register double ftmp3 asm("$f3");
  register double ftmp4 asm("$f4");
  register double ftmp5 asm("$f5");
  register double ftmp6 asm("$f6");
  register double ftmp7 asm("$f7");
  register double ftmp8 asm("$f8");
  register double ftmp9 asm("$f9");
  register double ftmp10 asm("$f10");
  register double ftmp11 asm("$f11");
  register double ftmp12 asm("$f12");
#endif  // _MIPS_SIM == _ABIO32

  /* Per-lane constants: "ph" = four 16-bit lanes, "pw" = two 32-bit lanes.
   * The 2217/5352 multipliers are not listed here; they are loaded as
   * immediates into ff_ph_op1/ff_ph_op3 inside the asm. */
  DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
  DECLARE_ALIGNED(8, const uint64_t, ff_ph_07) = { 0x0007000700070007ULL };
  DECLARE_ALIGNED(8, const uint64_t, ff_pw_12000) = { 0x00002ee000002ee0ULL };
  DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL };
  DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL };
  DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL };
  DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL };

  /* clang-format off */
  __asm__ volatile (
    // ff_ph_op1 = { 2217, 5352 } per 32-bit lane (0x08a9, 0x14e8).
    "dli        %[tmp0],    0x14e808a914e808a9              \n\t"
    "dmtc1      %[tmp0],    %[ff_ph_op1]                    \n\t"
    // ff_ph_op3 = { 2217, -5352 } per 32-bit lane (0x08a9, 0xeb18).
    "dli        %[tmp0],    0xeb1808a9eb1808a9              \n\t"
    "dmtc1      %[tmp0],    %[ff_ph_op3]                    \n\t"
    // ftmp0 = 0, as required by TRANSPOSE_4H.
    "pxor       %[ftmp0],   %[ftmp0],      %[ftmp0]         \n\t"

    // Load the four input rows into ftmp1..ftmp4 (unaligned 64-bit loads).
    "gsldlc1    %[ftmp1],   0x07(%[ip])                     \n\t"
    "gsldrc1    %[ftmp1],   0x00(%[ip])                     \n\t"
    MMI_ADDU(%[ip], %[ip], %[pitch])
    "gsldlc1    %[ftmp2],   0x07(%[ip])                     \n\t"
    "gsldrc1    %[ftmp2],   0x00(%[ip])                     \n\t"
    MMI_ADDU(%[ip], %[ip], %[pitch])
    "gsldlc1    %[ftmp3],   0x07(%[ip])                     \n\t"
    "gsldrc1    %[ftmp3],   0x00(%[ip])                     \n\t"
    MMI_ADDU(%[ip], %[ip], %[pitch])
    "gsldlc1    %[ftmp4],   0x07(%[ip])                     \n\t"
    "gsldrc1    %[ftmp4],   0x00(%[ip])                     \n\t"
    // After the transpose ftmpN holds element N-1 of every row, so pass 1
    // processes all four rows in parallel.
    TRANSPOSE_4H

    // ---- Pass 1 ----
    // ftmp11 = 8 in every halfword lane (the "* 8" scale).
    "ldc1       %[ftmp11],  %[ff_ph_8]                      \n\t"
    // f1 + f4
    "paddh      %[ftmp5],   %[ftmp1],       %[ftmp4]        \n\t"
    // a1
    "pmullh     %[ftmp5],   %[ftmp5],       %[ftmp11]       \n\t"
    // f2 + f3
    "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]        \n\t"
    // b1
    "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp11]       \n\t"
    // f2 - f3
    "psubh      %[ftmp7],   %[ftmp2],       %[ftmp3]        \n\t"
    // c1
    "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp11]       \n\t"
    // f1 - f4
    "psubh      %[ftmp8],   %[ftmp1],       %[ftmp4]        \n\t"
    // d1
    "pmullh     %[ftmp8],   %[ftmp8],       %[ftmp11]       \n\t"
    // op[0] = a1 + b1
    "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]        \n\t"
    // op[2] = a1 - b1
    "psubh      %[ftmp3],   %[ftmp5],       %[ftmp6]        \n\t"

    // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12
    // ftmp11 is reused as the shift count (12) for both op[1] and op[3].
    MMI_LI(%[tmp0], 0x0c)
    "dmtc1      %[tmp0],    %[ftmp11]                       \n\t"
    "ldc1       %[ftmp12],  %[ff_pw_14500]                  \n\t"
    // Interleave (c1, d1) pairs so pmaddhw forms c1*2217 + d1*5352.
    "punpcklhw  %[ftmp9],   %[ftmp7],       %[ftmp8]        \n\t"
    "pmaddhw    %[ftmp5],   %[ftmp9],       %[ff_ph_op1]    \n\t"
    "punpckhhw  %[ftmp9],   %[ftmp7],       %[ftmp8]        \n\t"
    "pmaddhw    %[ftmp6],   %[ftmp9],       %[ff_ph_op1]    \n\t"
    "paddw      %[ftmp5],   %[ftmp5],       %[ftmp12]       \n\t"
    "paddw      %[ftmp6],   %[ftmp6],       %[ftmp12]       \n\t"
    "psraw      %[ftmp5],   %[ftmp5],       %[ftmp11]       \n\t"
    "psraw      %[ftmp6],   %[ftmp6],       %[ftmp11]       \n\t"
    "packsswh   %[ftmp2],   %[ftmp5],       %[ftmp6]        \n\t"

    // op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12
    "ldc1       %[ftmp12],  %[ff_pw_7500]                   \n\t"
    // Interleave (d1, c1) pairs so pmaddhw forms d1*2217 - c1*5352.
    "punpcklhw  %[ftmp9],   %[ftmp8],       %[ftmp7]        \n\t"
    "pmaddhw    %[ftmp5],   %[ftmp9],       %[ff_ph_op3]    \n\t"
    "punpckhhw  %[ftmp9],   %[ftmp8],       %[ftmp7]        \n\t"
    "pmaddhw    %[ftmp6],   %[ftmp9],       %[ff_ph_op3]    \n\t"
    "paddw      %[ftmp5],   %[ftmp5],       %[ftmp12]       \n\t"
    "paddw      %[ftmp6],   %[ftmp6],       %[ftmp12]       \n\t"
    "psraw      %[ftmp5],   %[ftmp5],       %[ftmp11]       \n\t"
    "psraw      %[ftmp6],   %[ftmp6],       %[ftmp11]       \n\t"
    "packsswh   %[ftmp4],   %[ftmp5],       %[ftmp6]        \n\t"
    // ftmp0 is still zero here. After this transpose ftmp1..ftmp4 hold
    // rows 0..3 of the pass-1 result.
    TRANSPOSE_4H

    // ---- Pass 2 ----
    // a1 = r0 + r3, b1 = r1 + r2, c1 = r1 - r2, d1 = r0 - r3
    "paddh      %[ftmp5],   %[ftmp1],       %[ftmp4]        \n\t"
    "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]        \n\t"
    "psubh      %[ftmp7],   %[ftmp2],       %[ftmp3]        \n\t"
    "psubh      %[ftmp8],   %[ftmp1],       %[ftmp4]        \n\t"

    // ftmp0 = (d1 != 0) per lane: the compare yields 0xffff where d1 == 0,
    // and adding 1 turns that into 0 (and the 0 elsewhere into 1).
    // ftmp0 no longer holds zero from here on.
    "pcmpeqh    %[ftmp0],   %[ftmp8],       %[ftmp0]        \n\t"
    "ldc1       %[ftmp9],   %[ff_ph_01]                     \n\t"
    "paddh      %[ftmp0],   %[ftmp0],       %[ftmp9]        \n\t"

    // out row 0 = (a1 + b1 + 7) >> 4, out row 2 = (a1 - b1 + 7) >> 4
    "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]        \n\t"
    "psubh      %[ftmp2],   %[ftmp5],       %[ftmp6]        \n\t"
    "ldc1       %[ftmp9],   %[ff_ph_07]                     \n\t"
    "paddh      %[ftmp1],   %[ftmp1],       %[ftmp9]        \n\t"
    "paddh      %[ftmp2],   %[ftmp2],       %[ftmp9]        \n\t"
    MMI_LI(%[tmp0], 0x04)
    "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
    "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]        \n\t"
    "psrah      %[ftmp2],   %[ftmp2],       %[ftmp9]        \n\t"

    // out row 1 = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0)
    // NOTE(review): this shift count (16) is loaded with mtc1 while the
    // other shift counts use dmtc1; confirm the upper word of ftmp9 does
    // not matter to psraw.
    MMI_LI(%[tmp0], 0x10)
    "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
    "ldc1       %[ftmp12],  %[ff_pw_12000]                  \n\t"
    "punpcklhw  %[ftmp5],   %[ftmp7],       %[ftmp8]        \n\t"
    "pmaddhw    %[ftmp10],  %[ftmp5],       %[ff_ph_op1]    \n\t"
    "punpckhhw  %[ftmp5],   %[ftmp7],       %[ftmp8]        \n\t"
    "pmaddhw    %[ftmp11],  %[ftmp5],       %[ff_ph_op1]    \n\t"
    "paddw      %[ftmp10],  %[ftmp10],      %[ftmp12]       \n\t"
    "paddw      %[ftmp11],  %[ftmp11],      %[ftmp12]       \n\t"
    "psraw      %[ftmp10],  %[ftmp10],      %[ftmp9]        \n\t"
    "psraw      %[ftmp11],  %[ftmp11],      %[ftmp9]        \n\t"
    "packsswh   %[ftmp3],   %[ftmp10],      %[ftmp11]       \n\t"
    "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]        \n\t"

    // out row 3 = (d1 * 2217 - c1 * 5352 + 51000) >> 16
    "ldc1       %[ftmp12],  %[ff_pw_51000]                  \n\t"
    "punpcklhw  %[ftmp5],   %[ftmp8],       %[ftmp7]        \n\t"
    "pmaddhw    %[ftmp10],  %[ftmp5],       %[ff_ph_op3]    \n\t"
    "punpckhhw  %[ftmp5],   %[ftmp8],       %[ftmp7]        \n\t"
    "pmaddhw    %[ftmp11],  %[ftmp5],       %[ff_ph_op3]    \n\t"
    "paddw      %[ftmp10],  %[ftmp10],      %[ftmp12]       \n\t"
    "paddw      %[ftmp11],  %[ftmp11],      %[ftmp12]       \n\t"
    "psraw      %[ftmp10],  %[ftmp10],      %[ftmp9]        \n\t"
    "psraw      %[ftmp11],  %[ftmp11],      %[ftmp9]        \n\t"
    "packsswh   %[ftmp4],   %[ftmp10],      %[ftmp11]       \n\t"

    // Store output rows 0..3. Note the register order ftmp1, ftmp3, ftmp2,
    // ftmp4: row 1 was computed into ftmp3 and row 2 into ftmp2.
    "gssdlc1    %[ftmp1],   0x07(%[output])                 \n\t"
    "gssdrc1    %[ftmp1],   0x00(%[output])                 \n\t"
    "gssdlc1    %[ftmp3],   0x0f(%[output])                 \n\t"
    "gssdrc1    %[ftmp3],   0x08(%[output])                 \n\t"
    "gssdlc1    %[ftmp2],   0x17(%[output])                 \n\t"
    "gssdrc1    %[ftmp2],   0x10(%[output])                 \n\t"
    "gssdlc1    %[ftmp4],   0x1f(%[output])                 \n\t"
    "gssdrc1    %[ftmp4],   0x18(%[output])                 \n\t"

    : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2),
      [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5),
      [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8),
      [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11),
      [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip),
      [ff_ph_op1] "=&f"(ff_ph_op1), [ff_ph_op3] "=&f"(ff_ph_op3)
    : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07),
      [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500),
      [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000),
      [ff_ph_8]"m"(ff_ph_8), [pitch]"r"((mips_reg)pitch),
      [output] "r"(output)
    : "memory"
  );
  /* clang-format on */
}

/* Forward DCT of an 8x4 region, done as two horizontally adjacent 4x4
 * blocks. The right-hand block starts 4 samples into each input row and its
 * 16 coefficients are written directly after the 16 of the left-hand block.
 * `pitch` is passed through unchanged to both 4x4 transforms.
 */
void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) {
  int16_t *const right_input = input + 4;
  int16_t *const right_output = output + 16;

  vp8_short_fdct4x4_mmi(input, output, pitch);
  vp8_short_fdct4x4_mmi(right_input, right_output, pitch);
}

/* Forward 4x4 Walsh-Hadamard transform of 16-bit samples using Loongson MMI.
 *
 * input  - first sample of the block; four rows of four int16_t are read
 *          with unaligned 64-bit loads. The parameter itself is advanced
 *          by the asm (it is bound as the read-write operand `ip`).
 * output - receives 16 int16_t coefficients (32 contiguous bytes).
 * pitch  - distance between consecutive input rows, in bytes (it is added
 *          directly to the row address).
 *
 * Pass 1 works on 16-bit lanes (inputs scaled by 4, one row per lane);
 * pass 2 widens to 32-bit lanes, applies the same butterfly down the
 * columns, rounds with (x + (x < 0) + 3) >> 3 and packs back to 16 bits.
 */
void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
  double ftmp[13], ff_ph_01, ff_pw_01, ff_pw_03, ff_pw_mask;
  uint64_t tmp[1];

  /* clang-format off */
  __asm__ volatile (
    // Build the constants in registers:
    //   ff_ph_01   = 1 in every 16-bit lane
    //   ff_pw_01   = 1 in every 32-bit lane
    //   ff_pw_03   = 3 in every 32-bit lane
    //   ff_pw_mask = 1 in the upper halfword of every 32-bit lane
    "dli        %[tmp0],    0x0001000100010001                  \n\t"
    "dmtc1      %[tmp0],    %[ff_ph_01]                         \n\t"
    "dli        %[tmp0],    0x0000000100000001                  \n\t"
    "dmtc1      %[tmp0],    %[ff_pw_01]                         \n\t"
    "dli        %[tmp0],    0x0000000300000003                  \n\t"
    "dmtc1      %[tmp0],    %[ff_pw_03]                         \n\t"
    "dli        %[tmp0],    0x0001000000010000                  \n\t"
    "dmtc1      %[tmp0],    %[ff_pw_mask]                       \n\t"
    // ftmp0 = 0 (needed by TRANSPOSE_4H and the compares below);
    // ftmp11 = 2, the left-shift count for the "* 4" input scale.
    MMI_LI(%[tmp0], 0x02)
    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"

    // Load the four input rows into ftmp1..ftmp4.
    "gsldlc1    %[ftmp1],   0x07(%[ip])                         \n\t"
    "gsldrc1    %[ftmp1],   0x00(%[ip])                         \n\t"
    MMI_ADDU(%[ip], %[ip], %[pitch])
    "gsldlc1    %[ftmp2],   0x07(%[ip])                         \n\t"
    "gsldrc1    %[ftmp2],   0x00(%[ip])                         \n\t"
    MMI_ADDU(%[ip], %[ip], %[pitch])
    "gsldlc1    %[ftmp3],   0x07(%[ip])                         \n\t"
    "gsldrc1    %[ftmp3],   0x00(%[ip])                         \n\t"
    MMI_ADDU(%[ip], %[ip], %[pitch])
    "gsldlc1    %[ftmp4],   0x07(%[ip])                         \n\t"
    "gsldrc1    %[ftmp4],   0x00(%[ip])                         \n\t"
    // After the transpose ftmpN holds element N-1 of every row.
    TRANSPOSE_4H

    // ---- Pass 1: scale every sample by 4, then butterfly ----
    "psllh      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
    "psllh      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
    "psllh      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
    "psllh      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
    // a
    "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
    // d
    "paddh      %[ftmp6],   %[ftmp2],       %[ftmp4]            \n\t"
    // c
    "psubh      %[ftmp7],   %[ftmp2],       %[ftmp4]            \n\t"
    // b
    "psubh      %[ftmp8],   %[ftmp1],       %[ftmp3]            \n\t"

    // a + d
    "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"
    // b + c
    "paddh      %[ftmp2],   %[ftmp8],       %[ftmp7]            \n\t"
    // b - c
    "psubh      %[ftmp3],   %[ftmp8],       %[ftmp7]            \n\t"
    // a - d
    "psubh      %[ftmp4],   %[ftmp5],       %[ftmp6]            \n\t"

    // Add (a != 0) to the first output: the compare gives 0xffff where
    // a == 0, and adding 1 turns that into 0 (and the 0 elsewhere into 1).
    "pcmpeqh    %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
    "paddh      %[ftmp6],   %[ftmp6],       %[ff_ph_01]         \n\t"
    "paddh      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
    // ftmp1..ftmp4 now become rows 0..3 of the pass-1 result.
    TRANSPOSE_4H

    // ---- Pass 2: widen to 32-bit lanes ----
    // pmaddhw against ff_pw_01 keeps the even-indexed halfwords (sign
    // extended); against ff_pw_mask it keeps the odd-indexed halfwords.
    // op[2], op[0]
    "pmaddhw    %[ftmp5],   %[ftmp1],       %[ff_pw_01]         \n\t"
    // op[3], op[1]
    "pmaddhw    %[ftmp1],   %[ftmp1],       %[ff_pw_mask]       \n\t"

    // op[6], op[4]
    "pmaddhw    %[ftmp6],   %[ftmp2],       %[ff_pw_01]         \n\t"
    // op[7], op[5]
    "pmaddhw    %[ftmp2],   %[ftmp2],       %[ff_pw_mask]       \n\t"

    // op[10], op[8]
    "pmaddhw    %[ftmp7],   %[ftmp3],       %[ff_pw_01]         \n\t"
    // op[11], op[9]
    "pmaddhw    %[ftmp3],   %[ftmp3],       %[ff_pw_mask]       \n\t"

    // op[14], op[12]
    "pmaddhw    %[ftmp8],   %[ftmp4],       %[ff_pw_01]         \n\t"
    // op[15], op[13]
    "pmaddhw    %[ftmp4],   %[ftmp4],       %[ff_pw_mask]       \n\t"

    // Butterfly for the even columns (inputs in ftmp5..ftmp8).
    // a1, a3
    "paddw      %[ftmp9],   %[ftmp5],       %[ftmp7]            \n\t"
    // d1, d3
    "paddw      %[ftmp10],  %[ftmp6],       %[ftmp8]            \n\t"
    // c1, c3
    "psubw      %[ftmp11],  %[ftmp6],       %[ftmp8]            \n\t"
    // b1, b3
    "psubw      %[ftmp12],  %[ftmp5],       %[ftmp7]            \n\t"

    // a1 + d1, a3 + d3
    "paddw      %[ftmp5],   %[ftmp9],       %[ftmp10]           \n\t"
    // b1 + c1, b3 + c3
    "paddw      %[ftmp6],   %[ftmp12],      %[ftmp11]           \n\t"
    // b1 - c1, b3 - c3
    "psubw      %[ftmp7],   %[ftmp12],      %[ftmp11]           \n\t"
    // a1 - d1, a3 - d3
    "psubw      %[ftmp8],   %[ftmp9],       %[ftmp10]           \n\t"

    // Butterfly for the odd columns (inputs in ftmp1..ftmp4).
    // a2, a4
    "paddw      %[ftmp9],   %[ftmp1],       %[ftmp3]            \n\t"
    // d2, d4
    "paddw      %[ftmp10],  %[ftmp2],       %[ftmp4]            \n\t"
    // c2, c4
    "psubw      %[ftmp11],  %[ftmp2],       %[ftmp4]            \n\t"
    // b2, b4
    "psubw      %[ftmp12],  %[ftmp1],       %[ftmp3]            \n\t"

    // a2 + d2, a4 + d4
    "paddw      %[ftmp1],   %[ftmp9],       %[ftmp10]           \n\t"
    // b2 + c2, b4 + c4
    "paddw      %[ftmp2],   %[ftmp12],      %[ftmp11]           \n\t"
    // b2 - c2, b4 - c4
    "psubw      %[ftmp3],   %[ftmp12],      %[ftmp11]           \n\t"
    // a2 - d2, a4 - d4
    "psubw      %[ftmp4],   %[ftmp9],       %[ftmp10]           \n\t"

    // Rounding, applied identically to ftmp1..ftmp8:
    //   x = (x + (x < 0) + 3) >> 3
    // ftmp11 = shift count 3; ftmp9 = 1 in each lane where x is negative
    // (pcmpgtw against zero, masked with ff_pw_01).
    MMI_LI(%[tmp0], 0x03)
    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"

    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp1]            \n\t"
    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
    "paddw      %[ftmp1],   %[ftmp1],       %[ftmp9]            \n\t"
    "paddw      %[ftmp1],   %[ftmp1],       %[ff_pw_03]         \n\t"
    "psraw      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"

    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp2]            \n\t"
    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
    "paddw      %[ftmp2],   %[ftmp2],       %[ftmp9]            \n\t"
    "paddw      %[ftmp2],   %[ftmp2],       %[ff_pw_03]         \n\t"
    "psraw      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"

    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp3]            \n\t"
    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
    "paddw      %[ftmp3],   %[ftmp3],       %[ftmp9]            \n\t"
    "paddw      %[ftmp3],   %[ftmp3],       %[ff_pw_03]         \n\t"
    "psraw      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"

    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp4]            \n\t"
    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
    "paddw      %[ftmp4],   %[ftmp4],       %[ftmp9]            \n\t"
    "paddw      %[ftmp4],   %[ftmp4],       %[ff_pw_03]         \n\t"
    "psraw      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"

    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp5]            \n\t"
    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
    "paddw      %[ftmp5],   %[ftmp5],       %[ftmp9]            \n\t"
    "paddw      %[ftmp5],   %[ftmp5],       %[ff_pw_03]         \n\t"
    "psraw      %[ftmp5],   %[ftmp5],       %[ftmp11]           \n\t"

    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp6]            \n\t"
    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
    "paddw      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
    "paddw      %[ftmp6],   %[ftmp6],       %[ff_pw_03]         \n\t"
    "psraw      %[ftmp6],   %[ftmp6],       %[ftmp11]           \n\t"

    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp7]            \n\t"
    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
    "paddw      %[ftmp7],   %[ftmp7],       %[ftmp9]            \n\t"
    "paddw      %[ftmp7],   %[ftmp7],       %[ff_pw_03]         \n\t"
    "psraw      %[ftmp7],   %[ftmp7],       %[ftmp11]           \n\t"

    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp8]            \n\t"
    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
    "paddw      %[ftmp8],   %[ftmp8],       %[ftmp9]            \n\t"
    "paddw      %[ftmp8],   %[ftmp8],       %[ff_pw_03]         \n\t"
    "psraw      %[ftmp8],   %[ftmp8],       %[ftmp11]           \n\t"

    // Pack each output row back to 16-bit lanes: the odd-column words
    // (ftmp1..4) land in the low half, the even-column words (ftmp5..8)
    // in the high half.
    "packsswh   %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
    "packsswh   %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
    "packsswh   %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
    "packsswh   %[ftmp4],   %[ftmp4],       %[ftmp8]            \n\t"

    // Reorder the halfwords of every row with selector 0x72 so the
    // columns come out in natural order before storing.
    MMI_LI(%[tmp0], 0x72)
    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
    "pshufh     %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
    "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
    "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
    "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"

    // Store the four output rows (unaligned 64-bit stores).
    "gssdlc1    %[ftmp1],   0x07(%[op])                         \n\t"
    "gssdrc1    %[ftmp1],   0x00(%[op])                         \n\t"
    "gssdlc1    %[ftmp2],   0x0f(%[op])                         \n\t"
    "gssdrc1    %[ftmp2],   0x08(%[op])                         \n\t"
    "gssdlc1    %[ftmp3],   0x17(%[op])                         \n\t"
    "gssdrc1    %[ftmp3],   0x10(%[op])                         \n\t"
    "gssdlc1    %[ftmp4],   0x1f(%[op])                         \n\t"
    "gssdrc1    %[ftmp4],   0x18(%[op])                         \n\t"
    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
      [ftmp12]"=&f"(ftmp[12]),          [ff_pw_mask]"=&f"(ff_pw_mask),
      [tmp0]"=&r"(tmp[0]),              [ff_pw_01]"=&f"(ff_pw_01),
      [ip]"+&r"(input),                 [ff_pw_03]"=&f"(ff_pw_03),
      [ff_ph_01]"=&f"(ff_ph_01)
    : [op]"r"(output),                  [pitch]"r"((mips_reg)pitch)
    : "memory"
  );
  /* clang-format on */
}