shithub: openh264

ref: 12464197a58786e59a2f022227372012d5cebd4b
dir: /codec/common/mips/deblock_mmi.c/

View raw version
/*!
 * \copy
 *     Copyright (c)  2009-2018, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * \file    deblock_mmi.c
 *
 * \brief   Loongson MMI optimized deblocking functions
 *
 * \date    20/07/2018 Created
 *
 *************************************************************************************
 */
#include <stdint.h>
#include "asmdefs_mmi.h"

/*!
 * \brief   Luma deblocking across a horizontal edge (the "Lt4" variant),
 *          implemented with Loongson MMI inline assembly.
 *
 * Notation used in the comments below: p2, p1, p0 are the rows at
 * pPix - 3*iStride, pPix - 2*iStride and pPix - iStride; q0, q1, q2 are the
 * rows at pPix, pPix + iStride and pPix + 2*iStride.  All six rows are read
 * (16 bytes each); only p1, p0, q0 and q1 are written back.  Every row is
 * processed as two halves of 8 pixels widened to 16-bit lanes.
 *
 * \param   pPix     address of row q0.  Rows are accessed with 128-bit
 *                   gslqc1/gssqc1, so pPix and iStride are presumably expected
 *                   to keep every row 16-byte aligned -- TODO confirm.
 * \param   iStride  distance in bytes between consecutive rows
 * \param   iAlpha   threshold tested against |q0 - p0| (low 16 bits are used)
 * \param   iBeta    threshold tested against |p1 - p0|, |q1 - q0|, |p2 - p0|
 *                   and |q2 - q0| (low 16 bits are used)
 * \param   pTC      four signed bytes; pTC[k] is the clipping bound for
 *                   pixels 4*k .. 4*k+3.  Lanes whose bound is negative are
 *                   left unfiltered.
 *
 * BACKUP_REG / RECOVER_REG and WELS_AbsH come from asmdefs_mmi.h; the former
 * presumably save/restore registers around the asm, the latter is used here
 * as a per-halfword absolute value (out_lo, out_hi, in_lo, in_hi, tmp, tmp)
 * -- see that header for the authoritative definition.
 */
void DeblockLumaLt4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
                         int32_t iBeta, int8_t *pTC) {
  /* Scratch area for 128-bit spills; every offset used below is a multiple
     of 16 and lies within [432-400, 432-16]. */
  unsigned char tmp[512] __attribute__((aligned(32)));
  BACKUP_REG;
  __asm__ volatile (
    ".set       arch=loongson3a                           \n\t"
    /* Row pointers: $14 = p2, $9 = p1, $13 = p0, %[iStride] = q1 (the stride
       register is overwritten with pPix + iStride), $12 = q2. */
    "dsll       $8, %[iStride], 0x1                       \n\t"
    "daddu      $8, $8, %[iStride]                        \n\t"
    "dsubu      $14, %[pPix], $8                          \n\t"

    "dsll       $8, %[iStride], 0x1                       \n\t"
    "dsubu      $9, %[pPix], $8                           \n\t"

    "dmtc1      %[iAlpha], $f0                            \n\t"
    "dsubu      $13, %[pPix], %[iStride]                  \n\t"
    "daddu      %[iStride], %[iStride], %[pPix]           \n\t"
    "daddu      $12, $8, %[pPix]                          \n\t"

    /* Broadcast the low halfword of iAlpha to 8 lanes -> tmp[432-112]. */
    "punpcklhw  $f0, $f0, $f0                             \n\t"
    "lb         $8, 0x0(%[pTC])                           \n\t"
    "punpcklwd  $f0, $f0, $f0                             \n\t"
    "mov.d      $f2, $f0                                  \n\t"
    "gssqc1     $f2, $f0, 432-112(%[tmp])                 \n\t"
    /* Broadcast iBeta the same way -> tmp[432-336].  From here on %[iAlpha]
       is scratch and %[iBeta] holds the 0xFFFF mask applied to the
       sign-extended pTC bytes. */
    "dmtc1      %[iBeta], $f0                             \n\t"
    "lb         %[iAlpha], 0x1(%[pTC])                    \n\t"
    "dli        %[iBeta], 0xFFFF                          \n\t"
    "punpcklhw  $f0, $f0, $f0                             \n\t"
    "and        $10, %[iAlpha], %[iBeta]                  \n\t"
    "punpcklwd  $f0, $f0, $f0                             \n\t"
    "mov.d      $f2, $f0                                  \n\t"
    "and        %[iAlpha], %[iAlpha], %[iBeta]            \n\t"
    "dmtc1      $10, $f4                                  \n\t"
    "mov.d      $f8, $f4                                  \n\t"
    "dmtc1      %[iAlpha], $f16                           \n\t"
    "and        %[iAlpha], $8, %[iBeta]                   \n\t"
    "dmtc1      %[iAlpha], $f20                           \n\t"
    "mov.d      $f24, $f20                                \n\t"
    "mov.d      $f28, $f20                                \n\t"
    "gssqc1     $f2, $f0, 432-336(%[tmp])                 \n\t"
    "dmtc1      %[iAlpha], $f0                            \n\t"

    /* Interleave pTC[0]/pTC[1] into a tc vector for pixels 0..7
       (pTC[0] x4 in $f0, pTC[1] x4 in $f2) -> tmp[432-400].  pTC[3] is
       fetched before %[pTC] itself is overwritten with pTC[2]. */
    "lb         %[iAlpha], 0x3(%[pTC])                    \n\t"
    "lb         %[pTC], 0x2(%[pTC])                       \n\t"
    "dmtc1      $10, $f12                                 \n\t"
    "punpcklhw  $f0, $f0, $f16                            \n\t"
    "and        $8, %[iAlpha], %[iBeta]                   \n\t"
    "punpcklhw  $f24, $f24, $f8                           \n\t"
    "punpcklhw  $f20, $f20, $f4                           \n\t"
    "punpcklhw  $f0, $f0, $f24                            \n\t"
    "punpcklhw  $f28, $f28, $f12                          \n\t"
    "punpcklhw  $f28, $f28, $f20                          \n\t"
    "punpckhhw  $f2, $f0, $f28                            \n\t"
    "punpcklhw  $f0, $f0, $f28                            \n\t"
    "gssqc1     $f2, $f0, 432-400(%[tmp])                 \n\t"
    /* Same expansion for pTC[2]/pTC[3]; the result ends up in $f4/$f6
       (completed at the punpckhhw/punpcklhw pair further down) and is the tc
       vector for pixels 8..15. */
    "dmtc1      $8, $f0                                   \n\t"
    "and        %[iAlpha], %[iAlpha], %[iBeta]            \n\t"
    "mov.d      $f8, $f0                                  \n\t"
    "dmtc1      %[iAlpha], $f16                           \n\t"
    "and        %[iAlpha], %[pTC], %[iBeta]               \n\t"
    "dmtc1      $8, $f12                                  \n\t"
    "dmtc1      %[iAlpha], $f20                           \n\t"
    "punpcklhw  $f20, $f20, $f0                           \n\t"

    "xor        $f0, $f0, $f0                             \n\t"
    "dmtc1      %[iAlpha], $f24                           \n\t"
    "and        %[pTC], %[pTC], %[iBeta]                  \n\t"
    "punpcklhw  $f24, $f24, $f8                           \n\t"
    "dmtc1      %[iAlpha], $f28                           \n\t"
    "dmtc1      %[pTC], $f4                               \n\t"

    /* ---- Pixels 0..7: load the rows and widen the low 8 bytes of each to
       halfwords ($f0 is zero).  p1 -> $f8/$f10. */
    "gslqc1     $f10, $f8, 0x0($9)                        \n\t"
    "punpckhbh  $f10, $f8, $f0                            \n\t"
    "punpcklbh  $f8, $f8, $f0                             \n\t"

    /* %[pTC] = 4, the rounding term added before the >>3 below. */
    "dli        %[iAlpha], 0x4                            \n\t"
    "seh        %[pTC], %[iAlpha]                         \n\t"
    "punpcklhw  $f28, $f28, $f12                          \n\t"
    "punpcklhw  $f28, $f28, $f20                          \n\t"
    /* q1 -> tmp[432-240], q2 -> tmp[432-352], p0 -> $f12/$f14,
       p2 -> $f16/$f18 (spilled to tmp[432-272]), q0 -> $f24/$f26. */
    "gslqc1     $f22, $f20, 0x0(%[iStride])               \n\t"
    "gslqc1     $f14, $f12, 0x0($13)                      \n\t"
    "gsldxc1    $f2, 0x0($12, $0)                         \n\t"
    "punpckhbh  $f22, $f20, $f0                           \n\t"
    "punpcklbh  $f20, $f20, $f0                           \n\t"
    "gssqc1     $f22, $f20, 432-240(%[tmp])               \n\t"
    "punpckhbh  $f22, $f2, $f0                            \n\t"
    "punpcklbh  $f20, $f2, $f0                            \n\t"
    "gssqc1     $f22, $f20, 432-352(%[tmp])               \n\t"
    "punpcklhw  $f4, $f4, $f16                            \n\t"
    "gslqc1     $f18, $f16, 0x0($14)                      \n\t"
    "punpcklhw  $f4, $f4, $f24                            \n\t"
    "gslqc1     $f26, $f24, 0x0(%[pPix])                  \n\t"
    "punpckhhw  $f6, $f4, $f28                            \n\t"
    "punpcklhw  $f4, $f4, $f28                            \n\t"
    "punpckhbh  $f26, $f24, $f0                           \n\t"
    "punpcklbh  $f24, $f24, $f0                           \n\t"
    "punpckhbh  $f14, $f12, $f0                           \n\t"
    "punpcklbh  $f12, $f12, $f0                           \n\t"
    "punpckhbh  $f18, $f16, $f0                           \n\t"
    "punpcklbh  $f16, $f16, $f0                           \n\t"
    /* mask_p = iBeta > |p0 - p2| -> tmp[432-288]; iBeta vector is reloaded
       into $f16/$f18 and stays there. */
    "psubh      $f28, $f12, $f16                          \n\t"
    "psubh      $f30, $f14, $f18                          \n\t"
    "gssqc1     $f18, $f16, 432-272(%[tmp])               \n\t"
    WELS_AbsH($f28, $f30, $f28, $f30, $f16, $f18)
    "gslqc1     $f18, $f16, 432-336(%[tmp])               \n\t"
    "gslqc1     $f2, $f0, 432-352(%[tmp])                 \n\t"
    "pcmpgth    $f20, $f16, $f28                          \n\t"
    "pcmpgth    $f22, $f18, $f30                          \n\t"
    "gssqc1     $f22, $f20, 432-288(%[tmp])               \n\t"
    /* mask_q = iBeta > |q0 - q2| -> tmp[432-256]. */
    "psubh      $f28, $f24, $f0                           \n\t"
    "psubh      $f30, $f26, $f2                           \n\t"
    WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
    "pcmpgth    $f20, $f16, $f28                          \n\t"
    "pcmpgth    $f22, $f18, $f30                          \n\t"
    "gssqc1     $f22, $f20, 432-256(%[tmp])               \n\t"
    /* avg(p0, q0) -> tmp[432-304]. */
    "pavgh      $f20, $f12, $f24                          \n\t"
    "pavgh      $f22, $f14, $f26                          \n\t"
    "gssqc1     $f22, $f20, 432-304(%[tmp])               \n\t"
    /* tc' = tc - mask_p - mask_q (true masks are all-ones, i.e. -1, so each
       true mask adds one) -> tmp[432-224]. */
    "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
    "gslqc1     $f30, $f28, 432-288(%[tmp])               \n\t"
    "gslqc1     $f2, $f0, 432-256(%[tmp])                 \n\t"
    "psubh      $f20, $f20, $f28                          \n\t"
    "psubh      $f22, $f22, $f30                          \n\t"
    "psubh      $f20, $f20, $f0                           \n\t"
    "psubh      $f22, $f22, $f2                           \n\t"
    "gssqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
    /* q0 - p0 -> tmp[432-384]; q0 itself is saved in tmp[432-32].
       Filter mask = (iAlpha > |q0 - p0|) & (iBeta > |q0 - q1|)
                   & (iBeta > |p0 - p1|) & (tc >= 0) -> tmp[432-320]. */
    "gslqc1     $f2, $f0, 432-240(%[tmp])                 \n\t"
    "psubh      $f20, $f24, $f12                          \n\t"
    "psubh      $f22, $f26, $f14                          \n\t"
    "gssqc1     $f26, $f24, 432-32(%[tmp])                \n\t"
    "psubh      $f24, $f24, $f0                           \n\t"
    "psubh      $f26, $f26, $f2                           \n\t"
    "gssqc1     $f22, $f20, 432-384(%[tmp])               \n\t"
    WELS_AbsH($f28, $f30, $f20, $f22, $f28, $f30)
    "gslqc1     $f22, $f20, 432-112(%[tmp])               \n\t"
    "pcmpgth    $f20, $f20, $f28                          \n\t"
    "pcmpgth    $f22, $f22, $f30                          \n\t"
    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
    "pcmpgth    $f28, $f16, $f24                          \n\t"
    "pcmpgth    $f30, $f18, $f26                          \n\t"

    "xor        $f0, $f0, $f0                             \n\t"
    "and        $f20, $f20, $f28                          \n\t"
    "and        $f22, $f22, $f30                          \n\t"
    "psubh      $f24, $f12, $f8                           \n\t"
    "psubh      $f26, $f14, $f10                          \n\t"
    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
    "pcmpgth    $f28, $f16, $f24                          \n\t"
    "pcmpgth    $f30, $f18, $f26                          \n\t"
    "gslqc1     $f26, $f24, 432-400(%[tmp])               \n\t"
    "and        $f20, $f20, $f28                          \n\t"
    "and        $f22, $f22, $f30                          \n\t"
    "pcmpgth    $f28, $f24, $f0                           \n\t"
    "pcmpgth    $f30, $f26, $f0                           \n\t"
    "pcmpeqh    $f24, $f24, $f0                           \n\t"
    "pcmpeqh    $f26, $f26, $f0                           \n\t"
    "or         $f28, $f28, $f24                          \n\t"
    "or         $f30, $f30, $f26                          \n\t"
    "and        $f20, $f20, $f28                          \n\t"
    "and        $f22, $f22, $f30                          \n\t"
    "gssqc1     $f22, $f20, 432-320(%[tmp])               \n\t"
    /* Broadcast the constant 4 -> tmp[432-336] (the iBeta copy in memory is
       no longer needed; $f16/$f18 still hold it). */
    "dmtc1      %[pTC], $f20                              \n\t"
    "punpckhhw  $f26, $f20, $f20                          \n\t"
    "punpcklhw  $f24, $f20, $f20                          \n\t"
    "punpcklwd  $f20, $f24, $f24                          \n\t"
    "mov.d      $f22, $f20                                \n\t"
    "gssqc1     $f22, $f20, 432-336(%[tmp])               \n\t"
    /* delta = clamp((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc', tc')
       & filter mask -> tmp[432-64]. */
    "gslqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
    "psubh      $f24, $f0, $f20                           \n\t"
    "dli        $11, 0x2                                  \n\t"
    "psubh      $f26, $f0, $f22                           \n\t"
    "dmtc1      $11, $f28                                 \n\t"
    "gslqc1     $f22, $f20, 432-384(%[tmp])               \n\t"
    "gslqc1     $f2, $f0, 432-240(%[tmp])                 \n\t"
    "psllh      $f20, $f20, $f28                          \n\t"
    "psllh      $f22, $f22, $f28                          \n\t"
    "psubh      $f28, $f8, $f0                            \n\t"
    "psubh      $f30, $f10, $f2                           \n\t"
    "paddh      $f28, $f28, $f20                          \n\t"
    "paddh      $f30, $f30, $f22                          \n\t"
    "gslqc1     $f22, $f20, 432-336(%[tmp])               \n\t"
    "paddh      $f28, $f28, $f20                          \n\t"
    "paddh      $f30, $f30, $f22                          \n\t"
    "dli        $11, 0x3                                  \n\t"
    "dmtc1      $11, $f20                                 \n\t"
    "psrah      $f28, $f28, $f20                          \n\t"
    "psrah      $f30, $f30, $f20                          \n\t"
    "gslqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
    "pmaxsh     $f24, $f24, $f28                          \n\t"
    "pmaxsh     $f26, $f26, $f30                          \n\t"
    "gslqc1     $f2, $f0, 432-320(%[tmp])                 \n\t"
    "pminsh     $f20, $f20, $f24                          \n\t"
    "pminsh     $f22, $f22, $f26                          \n\t"

    "and        $f20, $f20, $f0                           \n\t"
    "and        $f22, $f22, $f2                           \n\t"
    "gslqc1     $f26, $f24, 432-400(%[tmp])               \n\t"
    "gssqc1     $f22, $f20, 432-64(%[tmp])                \n\t"
    /* tc -> tmp[432-384], -tc -> tmp[432-368].
       p1 adjustment = clamp((p2 + avg(p0,q0) - 2*p1) >> 1, -tc, tc)
       & filter mask & mask_p -> tmp[432-96]. */
    "xor        $f0, $f0, $f0                             \n\t"
    "gssqc1     $f26, $f24, 432-384(%[tmp])               \n\t"
    "psubh      $f20, $f0, $f24                           \n\t"
    "psubh      $f22, $f0, $f26                           \n\t"
    "gssqc1     $f22, $f20, 432-368(%[tmp])               \n\t"
    "mov.d      $f24, $f20                                \n\t"
    "mov.d      $f26, $f22                                \n\t"
    "gslqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
    "gslqc1     $f30, $f28, 432-304(%[tmp])               \n\t"
    "paddh      $f20, $f20, $f28                          \n\t"
    "paddh      $f22, $f22, $f30                          \n\t"
    "paddh      $f28, $f8, $f8                            \n\t"
    "paddh      $f30, $f10, $f10                          \n\t"
    "psubh      $f20, $f20, $f28                          \n\t"
    "psubh      $f22, $f22, $f30                          \n\t"
    "dli        $11, 0x1                                  \n\t"
    "dmtc1      $11, $f28                                 \n\t"
    "psrah      $f20, $f20, $f28                          \n\t"
    "psrah      $f22, $f22, $f28                          \n\t"
    "pmaxsh     $f24, $f24, $f20                          \n\t"
    "pmaxsh     $f26, $f26, $f22                          \n\t"
    "gslqc1     $f22, $f20, 432-384(%[tmp])               \n\t"
    "pminsh     $f20, $f20, $f24                          \n\t"
    "pminsh     $f22, $f22, $f26                          \n\t"

    "gslqc1     $f26, $f24, 432-320(%[tmp])               \n\t"
    "gslqc1     $f30, $f28, 432-288(%[tmp])               \n\t"
    "and        $f20, $f20, $f24                          \n\t"
    "and        $f22, $f22, $f26                          \n\t"
    "and        $f20, $f20, $f28                          \n\t"
    "and        $f22, $f22, $f30                          \n\t"
    "gslqc1     $f26, $f24, 432-240(%[tmp])               \n\t"
    "gssqc1     $f22, $f20, 432-96(%[tmp])                \n\t"
    /* q1 adjustment = clamp((q2 + avg(p0,q0) - 2*q1) >> 1, -tc, tc)
       & filter mask & mask_q -> tmp[432-48] (stored a few lines below). */
    "gslqc1     $f22, $f20, 432-352(%[tmp])               \n\t"
    "gslqc1     $f30, $f28, 432-304(%[tmp])               \n\t"
    "paddh      $f20, $f20, $f28                          \n\t"
    "paddh      $f22, $f22, $f30                          \n\t"
    "paddh      $f28, $f24, $f24                          \n\t"
    "paddh      $f30, $f26, $f26                          \n\t"
    "gslqc1     $f26, $f24, 432-368(%[tmp])               \n\t"
    "dli        $11, 0x1                                  \n\t"
    "psubh      $f20, $f20, $f28                          \n\t"
    "dmtc1      $11, $f28                                 \n\t"
    "psubh      $f22, $f22, $f30                          \n\t"

    "psrah      $f20, $f20, $f28                          \n\t"
    "psrah      $f22, $f22, $f28                          \n\t"
    "gslqc1     $f30, $f28, 0x0(%[iStride])               \n\t"
    "pmaxsh     $f24, $f24, $f20                          \n\t"
    "pmaxsh     $f26, $f26, $f22                          \n\t"
    "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
    "pminsh     $f20, $f20, $f24                          \n\t"
    "pminsh     $f22, $f22, $f26                          \n\t"
    "gslqc1     $f26, $f24, 432-320(%[tmp])               \n\t"
    "and        $f20, $f20, $f24                          \n\t"
    "and        $f22, $f22, $f26                          \n\t"
    "gslqc1     $f26, $f24, 432-256(%[tmp])               \n\t"
    "and        $f20, $f20, $f24                          \n\t"
    "and        $f22, $f22, $f26                          \n\t"
    /* ---- Pixels 8..15: reload every row and widen its HIGH 8 bytes.
       q1 -> tmp[432-352], p1 -> tmp[432-368], q2 -> tmp[432-384],
       p0 -> tmp[432-400], p2 -> $f20/$f22 (tmp[432-16]),
       q0 -> $f24/$f26 (tmp[432-80]).  The spill slots of the first half are
       reused, so the order of the loads/stores below matters. */
    "gslqc1     $f26, $f24, 0x0($9)                       \n\t"
    "punpcklbh  $f28, $f30, $f0                           \n\t"
    "punpckhbh  $f30, $f30, $f0                           \n\t"
    "gssqc1     $f30, $f28, 432-352(%[tmp])               \n\t"

    "gslqc1     $f30, $f28, 0x0($12)                      \n\t"
    "punpcklbh  $f24, $f26, $f0                           \n\t"
    "punpckhbh  $f26, $f26, $f0                           \n\t"
    "gssqc1     $f22, $f20, 432-48(%[tmp])                \n\t"
    "gslqc1     $f22, $f20, 0x0($14)                      \n\t"
    "gssqc1     $f26, $f24, 432-368(%[tmp])               \n\t"
    "gslqc1     $f26, $f24, 0x0($13)                      \n\t"
    "punpcklbh  $f28, $f30, $f0                           \n\t"
    "punpckhbh  $f30, $f30, $f0                           \n\t"
    "punpcklbh  $f20, $f22, $f0                           \n\t"
    "punpckhbh  $f22, $f22, $f0                           \n\t"
    "gssqc1     $f30, $f28, 432-384(%[tmp])               \n\t"
    "punpcklbh  $f24, $f26, $f0                           \n\t"
    "punpckhbh  $f26, $f26, $f0                           \n\t"
    "gssqc1     $f26, $f24, 432-400(%[tmp])               \n\t"

    /* mask_p (high half) = iBeta > |p0 - p2| -> tmp[432-288]. */
    "gslqc1     $f30, $f28, 432-400(%[tmp])               \n\t"
    "gslqc1     $f26, $f24, 0x0(%[pPix])                  \n\t"
    "psubh      $f28, $f28, $f20                          \n\t"
    "psubh      $f30, $f30, $f22                          \n\t"
    "gssqc1     $f22, $f20, 432-16(%[tmp])                \n\t"
    WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
    "punpcklbh  $f24, $f26, $f0                           \n\t"
    "punpckhbh  $f26, $f26, $f0                           \n\t"
    "pcmpgth    $f20, $f16, $f28                          \n\t"
    "pcmpgth    $f22, $f18, $f30                          \n\t"
    "gslqc1     $f30, $f28, 432-384(%[tmp])               \n\t"
    "gssqc1     $f22, $f20, 432-288(%[tmp])               \n\t"

    /* mask_q (high half) = iBeta > |q0 - q2| -> tmp[432-256]. */
    "psubh      $f28, $f24, $f28                          \n\t"
    "psubh      $f30, $f26, $f30                          \n\t"
    WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
    "pcmpgth    $f20, $f16, $f28                          \n\t"
    "pcmpgth    $f22, $f18, $f30                          \n\t"
    "gssqc1     $f22, $f20, 432-256(%[tmp])               \n\t"

    /* avg(p0, q0) -> tmp[432-304]; q0 is saved in tmp[432-80]. */
    "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
    "gssqc1     $f26, $f24, 432-80(%[tmp])                \n\t"
    "pavgh      $f20, $f20, $f24                          \n\t"
    "pavgh      $f22, $f22, $f26                          \n\t"
    "gssqc1     $f22, $f20, 432-304(%[tmp])               \n\t"

    /* tc' = tc ($f4/$f6) - mask_p - mask_q -> tmp[432-224]. */
    "gslqc1     $f22, $f20, 432-288(%[tmp])               \n\t"
    "gslqc1     $f30, $f28, 432-256(%[tmp])               \n\t"
    "psubh      $f20, $f4, $f20                           \n\t"
    "psubh      $f22, $f6, $f22                           \n\t"
    "psubh      $f20, $f20, $f28                          \n\t"
    "psubh      $f22, $f22, $f30                          \n\t"
    "gssqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
    /* q0 - p0 -> tmp[432-272]; then the same alpha/beta/tc>=0 filter mask as
       in the first half -> tmp[432-320]. */
    "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
    "gslqc1     $f30, $f28, 432-352(%[tmp])               \n\t"
    "psubh      $f20, $f24, $f20                          \n\t"
    "psubh      $f22, $f26, $f22                          \n\t"
    "psubh      $f24, $f24, $f28                          \n\t"
    "psubh      $f26, $f26, $f30                          \n\t"
    "gssqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
    "mov.d      $f28, $f20                                \n\t"
    "mov.d      $f30, $f22                                \n\t"
    WELS_AbsH($f28, $f30, $f20, $f22, $f0, $f2)
    "gslqc1     $f22, $f20, 432-112(%[tmp])               \n\t"
    "pcmpgth    $f20, $f20, $f28                          \n\t"
    "pcmpgth    $f22, $f22, $f30                          \n\t"
    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
    "pcmpgth    $f28, $f16, $f24                          \n\t"
    "pcmpgth    $f30, $f18, $f26                          \n\t"
    "gslqc1     $f26, $f24, 432-368(%[tmp])               \n\t"

    "and        $f20, $f20, $f28                          \n\t"
    "and        $f22, $f22, $f30                          \n\t"
    "gslqc1     $f30, $f28, 432-400(%[tmp])               \n\t"
    "psubh      $f28, $f28, $f24                          \n\t"
    "psubh      $f30, $f30, $f26                          \n\t"
    "gslqc1     $f2, $f0, 432-352(%[tmp])                 \n\t"
    "psubh      $f24, $f24, $f0                           \n\t"
    "psubh      $f26, $f26, $f2                           \n\t"
    WELS_AbsH($f28, $f30, $f28, $f30, $f0, $f2)
    "pcmpgth    $f16, $f16, $f28                          \n\t"
    "pcmpgth    $f18, $f18, $f30                          \n\t"
    "gslqc1     $f30, $f28, 432-96(%[tmp])                \n\t"
    "and        $f20, $f20, $f16                          \n\t"
    "and        $f22, $f22, $f18                          \n\t"
    "xor        $f0, $f0, $f0                             \n\t"

    /* Low-half p1 ($f8/$f10) receives its adjustment from tmp[432-96]. */
    "paddh      $f8, $f8, $f28                            \n\t"
    "paddh      $f10, $f10, $f30                          \n\t"
    "pcmpgth    $f16, $f4, $f0                            \n\t"
    "pcmpgth    $f18, $f6, $f0                            \n\t"
    "pcmpeqh    $f28, $f4, $f0                            \n\t"
    "pcmpeqh    $f30, $f6, $f0                            \n\t"
    "or         $f16, $f16, $f28                          \n\t"
    "or         $f18, $f18, $f30                          \n\t"
    "and        $f20, $f20, $f16                          \n\t"
    "and        $f22, $f22, $f18                          \n\t"
    /* High-half delta = clamp((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3,
       -tc', tc') & filter mask -> tmp[432-272].  %[iAlpha] is used to park
       the low word of an FP register while that register temporarily holds
       a shift count; -tc is written to tmp[432-336]. */
    "gslqc1     $f18, $f16, 432-224(%[tmp])               \n\t"
    "gssqc1     $f22, $f20, 432-320(%[tmp])               \n\t"
    "gslqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
    "dli        $11, 0x2                                  \n\t"
    "psubh      $f28, $f0, $f16                           \n\t"
    "psubh      $f30, $f0, $f18                           \n\t"
    "psubh      $f2, $f0, $f6                             \n\t"
    "psubh      $f0, $f0, $f4                             \n\t"
    "dmfc1      %[iAlpha], $f28                           \n\t"
    "dmtc1      $11, $f28                                 \n\t"
    "psllh      $f20, $f20, $f28                          \n\t"
    "psllh      $f22, $f22, $f28                          \n\t"
    "dmtc1      %[iAlpha], $f28                           \n\t"
    "paddh      $f24, $f24, $f20                          \n\t"
    "paddh      $f26, $f26, $f22                          \n\t"
    "gslqc1     $f22, $f20, 432-336(%[tmp])               \n\t"
    "paddh      $f24, $f24, $f20                          \n\t"
    "paddh      $f26, $f26, $f22                          \n\t"
    "gslqc1     $f22, $f20, 432-368(%[tmp])               \n\t"
    "dli        $11, 0x3                                  \n\t"
    "gssqc1     $f2, $f0, 432-336(%[tmp])                 \n\t"
    "dmfc1      %[iAlpha], $f0                            \n\t"
    "dmtc1      $11, $f0                                  \n\t"
    "psrah      $f24, $f24, $f0                           \n\t"
    "psrah      $f26, $f26, $f0                           \n\t"
    "dmtc1      %[iAlpha], $f0                            \n\t"
    "pmaxsh     $f28, $f28, $f24                          \n\t"
    "pmaxsh     $f30, $f30, $f26                          \n\t"
    "pminsh     $f16, $f16, $f28                          \n\t"
    "pminsh     $f18, $f18, $f30                          \n\t"
    "gslqc1     $f30, $f28, 432-320(%[tmp])               \n\t"
    "and        $f16, $f16, $f28                          \n\t"
    "and        $f18, $f18, $f30                          \n\t"
    "mov.d      $f24, $f0                                 \n\t"
    "mov.d      $f26, $f2                                 \n\t"
    /* High-half p1 adjustment = clamp((p2 + avg(p0,q0) - 2*p1) >> 1, -tc, tc)
       & filter mask & mask_p, added to p1 in $f20/$f22. */
    "gslqc1     $f2, $f0, 432-16(%[tmp])                  \n\t"
    "gslqc1     $f30, $f28, 432-304(%[tmp])               \n\t"
    "paddh      $f0, $f0, $f28                            \n\t"
    "paddh      $f2, $f2, $f30                            \n\t"
    "gssqc1     $f18, $f16, 432-272(%[tmp])               \n\t"
    "gslqc1     $f18, $f16, 432-368(%[tmp])               \n\t"
    "dli        $11, 0x1                                  \n\t"
    "paddh      $f16, $f16, $f16                          \n\t"
    "paddh      $f18, $f18, $f18                          \n\t"
    "psubh      $f0, $f0, $f16                            \n\t"
    "psubh      $f2, $f2, $f18                            \n\t"

    "dmtc1      $11, $f28                                 \n\t"
    "gslqc1     $f18, $f16, 432-64(%[tmp])                \n\t"
    "psrah      $f0, $f0, $f28                            \n\t"
    "psrah      $f2, $f2, $f28                            \n\t"
    "pmaxsh     $f24, $f24, $f0                           \n\t"
    "pmaxsh     $f26, $f26, $f2                           \n\t"
    "gslqc1     $f2, $f0, 432-400(%[tmp])                 \n\t"
    "pminsh     $f28, $f4, $f24                           \n\t"
    "pminsh     $f30, $f6, $f26                           \n\t"
    "gslqc1     $f26, $f24, 432-320(%[tmp])               \n\t"
    "and        $f28, $f28, $f24                          \n\t"
    "and        $f30, $f30, $f26                          \n\t"
    /* Park the high-half filter mask in %[iAlpha]/%[iBeta]; it is moved back
       into $f24/$f26 before the q1 adjustment is masked. */
    "dmfc1      %[iAlpha], $f24                           \n\t"
    "dmfc1      %[iBeta], $f26                            \n\t"
    "gslqc1     $f26, $f24, 432-288(%[tmp])               \n\t"
    "and        $f28, $f28, $f24                          \n\t"
    "and        $f30, $f30, $f26                          \n\t"
    "paddh      $f20, $f20, $f28                          \n\t"
    "paddh      $f22, $f22, $f30                          \n\t"
    /* Saturate back to bytes: $f8/$f10 = filtered p1 (16 pixels),
       $f12/$f14 = p0 + delta (16 pixels). */
    "packushb   $f8, $f8, $f10                            \n\t"
    "packushb   $f10, $f20, $f22                          \n\t"
    "gslqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
    "paddh      $f0, $f0, $f20                            \n\t"
    "paddh      $f2, $f2, $f22                            \n\t"
    "paddh      $f12, $f12, $f16                          \n\t"
    "paddh      $f14, $f14, $f18                          \n\t"
    "packushb   $f12, $f12, $f14                          \n\t"
    "packushb   $f14, $f0, $f2                            \n\t"

    /* q0 - delta for both halves, packed and parked in tmp[480-208]
       (a slot not used for anything else). */
    "gslqc1     $f2, $f0, 432-32(%[tmp])                  \n\t"
    "psubh      $f0, $f0, $f16                            \n\t"
    "psubh      $f2, $f2, $f18                            \n\t"
    "gslqc1     $f18, $f16, 432-80(%[tmp])                \n\t"
    "psubh      $f16, $f16, $f20                          \n\t"
    "gslqc1     $f26, $f24, 432-48(%[tmp])                \n\t"
    "psubh      $f18, $f18, $f22                          \n\t"

    /* Low-half q1 + its adjustment (tmp[432-48]) -> $f20/$f22. */
    "gslqc1     $f22, $f20, 432-240(%[tmp])               \n\t"
    "paddh      $f20, $f20, $f24                          \n\t"
    "paddh      $f22, $f22, $f26                          \n\t"
    "gslqc1     $f26, $f24, 432-304(%[tmp])               \n\t"
    "packushb   $f0, $f0, $f2                             \n\t"
    "packushb   $f2, $f16, $f18                           \n\t"
    /* High-half q1 adjustment = clamp((q2 + avg(p0,q0) - 2*q1) >> 1, -tc, tc)
       & filter mask & mask_q. */
    "gslqc1     $f18, $f16, 432-384(%[tmp])               \n\t"
    "paddh      $f16, $f16, $f24                          \n\t"
    "paddh      $f18, $f18, $f26                          \n\t"
    "gssqc1     $f2, $f0, 480-208(%[tmp])                 \n\t"
    "gslqc1     $f2, $f0, 432-352(%[tmp])                 \n\t"
    "mov.d      $f28, $f0                                 \n\t"
    "mov.d      $f30, $f2                                 \n\t"
    "paddh      $f0, $f0, $f0                             \n\t"
    "paddh      $f2, $f2, $f2                             \n\t"

    "dmtc1      %[iAlpha], $f24                           \n\t"
    "dmtc1      %[iBeta], $f26                            \n\t"

    "psubh      $f16, $f16, $f0                           \n\t"
    "psubh      $f18, $f18, $f2                           \n\t"
    "dli        $11, 0x1                                  \n\t"
    "gslqc1     $f2, $f0, 432-336(%[tmp])                 \n\t"
    /* Write back row p1; $f8 is free afterwards and is reused as the shift
       count. */
    "gssqc1     $f10, $f8, 0x0($9)                        \n\t"
    "dmtc1      $11, $f8                                  \n\t"
    "psrah      $f16, $f16, $f8                           \n\t"
    "psrah      $f18, $f18, $f8                           \n\t"
    "pmaxsh     $f0, $f0, $f16                            \n\t"
    "pmaxsh     $f2, $f2, $f18                            \n\t"
    "pminsh     $f4, $f4, $f0                             \n\t"
    "pminsh     $f6, $f6, $f2                             \n\t"
    "gslqc1     $f2, $f0, 480-208(%[tmp])                 \n\t"

    /* 428-256+4 is the same slot as 432-256 (high-half mask_q). */
    "gslqc1     $f10, $f8, 428-256+4(%[tmp])              \n\t"
    "and        $f4, $f4, $f24                            \n\t"
    "and        $f6, $f6, $f26                            \n\t"
    "and        $f4, $f4, $f8                             \n\t"
    "and        $f6, $f6, $f10                            \n\t"
    /* Write back rows p0, q0 and q1. */
    "gssqc1     $f14, $f12, 0x0($13)                      \n\t"
    "paddh      $f28, $f28, $f4                           \n\t"
    "paddh      $f30, $f30, $f6                           \n\t"
    "packushb   $f20, $f20, $f22                          \n\t"
    "packushb   $f22, $f28, $f30                          \n\t"
    "gssqc1     $f2, $f0, 0x0(%[pPix])                    \n\t"
    "gssqc1     $f22, $f20, 0x0(%[iStride])               \n\t"
    /* %[iStride], %[iAlpha], %[iBeta] and %[pTC] are all overwritten by the
       code above, so they must be read-write operands: GCC requires that
       input-only operands are never modified, and may otherwise share a
       register between equal inputs or keep relying on the old value.
       Early-clobber ('&') keeps them apart from the pure input %[tmp]. */
    : [pPix]"+&r"(pPix), [iStride]"+&r"(iStride), [iAlpha]"+&r"(iAlpha),
      [iBeta]"+&r"(iBeta), [pTC]"+&r"(pTC)
    : [tmp]"r"((unsigned char *)tmp)
    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
      "$f22", "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}

/*!
 * \brief   Gather a 16-row by 8-byte strip of pixels and write it out as
 *          128 contiguous bytes after passing it through MMI_TransTwo8x8B.
 *
 * Reads 8 bytes from each of the 16 rows pPixY + k * iStride (k = 0..15),
 * hands the sixteen 64-bit registers to MMI_TransTwo8x8B, and stores the
 * result as eight 16-byte quadwords at pDst + 0x00 .. pDst + 0x70.
 *
 * \param   pPixY    address of the first 8-byte source row; the register copy
 *                   is advanced inside the asm (it is a read-write operand)
 * \param   iStride  byte distance between consecutive source rows
 * \param   pDst     destination buffer; 128 bytes are written
 *
 * NOTE(review): MMI_TransTwo8x8B, BACKUP_REG and RECOVER_REG are macros
 * defined outside this chunk. By name the first performs two 8x8 byte
 * transposes and the other two presumably save/restore FP registers --
 * confirm against asmdefs_mmi.h.
 * NOTE(review): pDst is written with gssqc1 (quadword store); presumably it
 * must be 16-byte aligned -- confirm against the callers.
 */
void DeblockLumaTransposeH2V_mmi(uint8_t *pPixY, int32_t iStride,
                                 uint8_t *pDst) {
  BACKUP_REG;
  __asm__ volatile(
    ".set       arch=loongson3a                           \n\t"
    /* $8 = pPixY + 8 * iStride: start of the lower half (rows 8..15). */
    "dsll       $8, %[iStride], 0x3                       \n\t"
    "daddu      $8, $8, %[pPixY]                          \n\t"

    /* Each group below handles four rows: pPixY -> row r, $9 -> row r+1,
     * $8 -> row r+8, $10 -> row r+9.  Every row is fetched with a
     * gsldlc1/gsldrc1 pair (left/right parts of one 8-byte load, so the
     * source address does not have to be 8-byte aligned). */

    /* Rows 0, 8, 1, 9 -> $f0, $f2, $f4, $f6. */
    "daddu      $9, %[pPixY], %[iStride]                  \n\t"
    "daddu      $10, $8, %[iStride]                       \n\t"
    "gsldlc1    $f0, 0x7(%[pPixY])                        \n\t"
    "gsldlc1    $f2, 0x7($8)                              \n\t"
    "gsldlc1    $f4, 0x7($9)                              \n\t"
    "gsldlc1    $f6, 0x7($10)                             \n\t"
    "gsldrc1    $f0, 0x0(%[pPixY])                        \n\t"
    "gsldrc1    $f2, 0x0($8)                              \n\t"
    "gsldrc1    $f4, 0x0($9)                              \n\t"
    "gsldrc1    $f6, 0x0($10)                             \n\t"
    /* Step both halves down two rows; rows 2, 10, 3, 11 -> $f8..$f14. */
    "daddu      %[pPixY], $9, %[iStride]                  \n\t"
    "daddu      $8, $10, %[iStride]                       \n\t"
    "daddu      $9, %[pPixY], %[iStride]                  \n\t"
    "daddu      $10, $8, %[iStride]                       \n\t"
    "gsldlc1    $f8, 0x7(%[pPixY])                        \n\t"
    "gsldlc1    $f10, 0x7($8)                             \n\t"
    "gsldlc1    $f12, 0x7($9)                             \n\t"
    "gsldlc1    $f14, 0x7($10)                            \n\t"
    "gsldrc1    $f8, 0x0(%[pPixY])                        \n\t"
    "gsldrc1    $f10, 0x0($8)                             \n\t"
    "gsldrc1    $f12, 0x0($9)                             \n\t"
    "gsldrc1    $f14, 0x0($10)                            \n\t"

    /* Rows 4, 12, 5, 13 -> $f16, $f18, $f20, $f22. */
    "daddu      %[pPixY], $9, %[iStride]                  \n\t"
    "daddu      $8, $10, %[iStride]                       \n\t"
    "daddu      $9, %[pPixY], %[iStride]                  \n\t"
    "daddu      $10, $8, %[iStride]                       \n\t"
    "gsldlc1    $f16, 0x7(%[pPixY])                       \n\t"
    "gsldlc1    $f18, 0x7($8)                             \n\t"
    "gsldlc1    $f20, 0x7($9)                             \n\t"
    "gsldlc1    $f22, 0x7($10)                            \n\t"
    "gsldrc1    $f16, 0x0(%[pPixY])                       \n\t"
    "gsldrc1    $f18, 0x0($8)                             \n\t"
    "gsldrc1    $f20, 0x0($9)                             \n\t"
    "gsldrc1    $f22, 0x0($10)                            \n\t"
    /* Rows 6, 14, 7, 15 -> $f24, $f26, $f28, $f30. */
    "daddu      %[pPixY], $9, %[iStride]                  \n\t"
    "daddu      $8, $10, %[iStride]                       \n\t"
    "daddu      $9, %[pPixY], %[iStride]                  \n\t"
    "daddu      $10, $8, %[iStride]                       \n\t"
    "gsldlc1    $f24, 0x7(%[pPixY])                       \n\t"
    "gsldlc1    $f26, 0x7($8)                             \n\t"

    "gsldlc1    $f28, 0x7($9)                             \n\t"
    "gsldlc1    $f30, 0x7($10)                            \n\t"
    "gsldrc1    $f24, 0x0(%[pPixY])                       \n\t"
    "gsldrc1    $f26, 0x0($8)                             \n\t"
    "gsldrc1    $f28, 0x0($9)                             \n\t"
    "gsldrc1    $f30, 0x0($10)                            \n\t"

    /* All 16 rows are now in $f0..$f30; $9 and $10 are no longer needed as
     * row pointers and are handed to the macro as its last two arguments. */
    MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
                     $f14, $f16, $f18, $f20, $f22, $f24,
                     $f26, $f28, $f30, $9, $10)

    /* Write the eight register pairs to pDst as consecutive quadwords.  The
     * register order here follows where the macro leaves its results, so it
     * must stay in sync with MMI_TransTwo8x8B. */
    "gssqc1     $f18, $f16, 0x0(%[pDst])                  \n\t"
    "gssqc1     $f10, $f8, 0x10(%[pDst])                  \n\t"
    "gssqc1     $f14, $f12, 0x20(%[pDst])                 \n\t"
    "gssqc1     $f30, $f28, 0x30(%[pDst])                 \n\t"
    "gssqc1     $f22, $f20, 0x40(%[pDst])                 \n\t"
    "gssqc1     $f6, $f4, 0x50(%[pDst])                   \n\t"
    "gssqc1     $f26, $f24, 0x60(%[pDst])                 \n\t"
    "gssqc1     $f2, $f0, 0x70(%[pDst])                   \n\t"
    /* pPixY is read-write and early-clobber because it is advanced while the
     * input operands are still in use. */
    : [pPixY] "+&r"((unsigned char *)pPixY)
    : [iStride] "r"((int)iStride), [pDst] "r"((unsigned char *)pDst)
    /* "memory" covers the stores through pDst; $8..$10 and the even FP
     * registers are the scratch registers named explicitly above. */
    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
      "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
      "$f30"
  );
  RECOVER_REG;
}

/*!
 * \brief   Scatter 128 contiguous bytes back into a 16-row by 8-byte strip
 *          after passing them through MMI_TransTwo8x8B.
 *
 * Loads eight 16-byte quadwords from pSrc + 0x00 .. pSrc + 0x70, hands the
 * sixteen 64-bit registers to MMI_TransTwo8x8B, and stores 8 bytes to each of
 * the 16 rows pPixY + k * iStride (k = 0..15).
 *
 * \param   pPixY    address of the first 8-byte destination row; the register
 *                   copy is advanced inside the asm (read-write operand)
 * \param   iStride  byte distance between consecutive destination rows
 * \param   pSrc     source buffer; 128 bytes are read
 *
 * NOTE(review): MMI_TransTwo8x8B, BACKUP_REG and RECOVER_REG are macros
 * defined outside this chunk. By name the first performs two 8x8 byte
 * transposes and the other two presumably save/restore FP registers --
 * confirm against asmdefs_mmi.h.
 * NOTE(review): pSrc is read with gslqc1 (quadword load); presumably it must
 * be 16-byte aligned -- confirm against the callers.
 */
void DeblockLumaTransposeV2H_mmi(uint8_t *pPixY, int32_t iStride,
                                 uint8_t *pSrc) {
  BACKUP_REG;
  __asm__ volatile(
    ".set       arch=loongson3a                           \n\t"
    /* Pull the whole 128-byte source into the register pairs $f0..$f30. */
    "gslqc1     $f2, $f0, 0x0(%[pSrc])                    \n\t"
    "gslqc1     $f6, $f4, 0x10(%[pSrc])                   \n\t"
    "gslqc1     $f10, $f8, 0x20(%[pSrc])                  \n\t"
    "gslqc1     $f14, $f12, 0x30(%[pSrc])                 \n\t"
    "gslqc1     $f18, $f16, 0x40(%[pSrc])                 \n\t"
    "gslqc1     $f22, $f20, 0x50(%[pSrc])                 \n\t"
    "gslqc1     $f26, $f24, 0x60(%[pSrc])                 \n\t"
    "gslqc1     $f30, $f28, 0x70(%[pSrc])                 \n\t"

    /* $9 and $10 are passed to the macro as its last two arguments and are
     * listed as clobbers for that reason. */
    MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
                     $f14, $f16, $f18, $f20, $f22, $f24,
                     $f26, $f28, $f30, $9, $10)

    /* Store two rows per step: pPixY -> row r, $8 -> row r+1.  Each row is
     * written with a gssdlc1/gssdrc1 pair (left/right parts of one 8-byte
     * store, so the destination does not have to be 8-byte aligned).  The
     * register chosen for each row follows where the macro leaves its
     * results and must stay in sync with MMI_TransTwo8x8B. */

    /* Rows 0 and 1 <- $f16, $f8. */
    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
    "gssdlc1    $f16, 0x7(%[pPixY])                       \n\t"
    "gssdlc1    $f8, 0x7($8)                              \n\t"
    "gssdrc1    $f16, 0x0(%[pPixY])                       \n\t"
    "gssdrc1    $f8, 0x0($8)                              \n\t"
    /* Rows 2 and 3 <- $f12, $f28. */
    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
    "gssdlc1    $f12, 0x7(%[pPixY])                       \n\t"
    "gssdlc1    $f28, 0x7($8)                             \n\t"
    "gssdrc1    $f12, 0x0(%[pPixY])                       \n\t"
    "gssdrc1    $f28, 0x0($8)                             \n\t"

    /* Rows 4 and 5 <- $f20, $f4. */
    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
    "gssdlc1    $f20, 0x7(%[pPixY])                       \n\t"
    "gssdlc1    $f4, 0x7($8)                              \n\t"
    "gssdrc1    $f20, 0x0(%[pPixY])                       \n\t"
    "gssdrc1    $f4, 0x0($8)                              \n\t"
    /* Rows 6 and 7 <- $f24, $f0. */
    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
    "gssdlc1    $f24, 0x7(%[pPixY])                       \n\t"
    "gssdlc1    $f0, 0x7($8)                              \n\t"
    "gssdrc1    $f24, 0x0(%[pPixY])                       \n\t"
    "gssdrc1    $f0, 0x0($8)                              \n\t"

    /* Rows 8 and 9 <- $f18, $f10 (the partner registers of rows 0 and 1). */
    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
    "gssdlc1    $f18, 0x7(%[pPixY])                       \n\t"
    "gssdlc1    $f10, 0x7($8)                             \n\t"
    "gssdrc1    $f18, 0x0(%[pPixY])                       \n\t"
    "gssdrc1    $f10, 0x0($8)                             \n\t"
    /* Rows 10 and 11 <- $f14, $f30. */
    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
    "gssdlc1    $f14, 0x7(%[pPixY])                       \n\t"
    "gssdlc1    $f30, 0x7($8)                             \n\t"
    "gssdrc1    $f14, 0x0(%[pPixY])                       \n\t"
    "gssdrc1    $f30, 0x0($8)                             \n\t"

    /* Rows 12 and 13 <- $f22, $f6. */
    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
    "gssdlc1    $f22, 0x7(%[pPixY])                       \n\t"
    "gssdlc1    $f6, 0x7($8)                              \n\t"
    "gssdrc1    $f22, 0x0(%[pPixY])                       \n\t"
    "gssdrc1    $f6, 0x0($8)                              \n\t"
    /* Rows 14 and 15 <- $f26, $f2. */
    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
    "gssdlc1    $f26, 0x7(%[pPixY])                       \n\t"
    "gssdlc1    $f2, 0x7($8)                              \n\t"
    "gssdrc1    $f26, 0x0(%[pPixY])                       \n\t"
    "gssdrc1    $f2, 0x0($8)                              \n\t"
    /* pPixY is read-write and early-clobber because it is advanced while the
     * input operands are still in use. */
    : [pPixY] "+&r"((unsigned char *)pPixY)
    : [iStride] "r"((int)iStride), [pSrc] "r"((unsigned char *)pSrc)
    /* "memory" covers the row stores through pPixY; $8..$10 and the even FP
     * registers are the scratch registers named explicitly above. */
    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
      "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
      "$f30"
  );
  RECOVER_REG;
}

void DeblockLumaEq4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
                         int32_t iBeta) {
  unsigned char tmp[720] __attribute__((aligned(32)));
  BACKUP_REG;
  __asm__ volatile (
    ".set       arch=loongson3a                           \n\t"
    "dsll       $11, %[iStride], 0x2                      \n\t"
    "xor        $f8, $f8, $f8                             \n\t"
    "daddu      $14, %[iStride], %[pPix]                  \n\t"
    "dsubu      $8, %[pPix], $11                          \n\t"
    "gslqc1     $f14, $f12, 0x0($8)                       \n\t"
    "gslqc1     $f22, $f20, 0x0(%[pPix])                  \n\t"
    "daddu      $9, %[iStride], %[iStride]                \n\t"
    "daddu      $10, $9, %[iStride]                       \n\t"
    "move       $12, $9                                   \n\t"
    "dsubu      $8, %[pPix], $9                           \n\t"
    "gslqc1     $f6, $f4, 0x0($8)                         \n\t"
    "dsubu      $9, %[pPix], %[iStride]                   \n\t"
    "gslqc1     $f18, $f16, 0x0($9)                       \n\t"
    "daddu      $13, %[iStride], %[pPix]                  \n\t"

    "move       %[iStride], $12                           \n\t"
    "daddu      $15, $12, %[pPix]                         \n\t"

    "daddu      $12, %[pPix], $10                         \n\t"
    "dsubu      $11, %[pPix], $10                         \n\t"

    "gslqc1     $f26, $f24, 0x0($11)                      \n\t"
    "daddu      %[iStride], %[iStride], %[pPix]           \n\t"
    "dmtc1      %[iAlpha], $f0                            \n\t"

    "punpcklhw  $f28, $f0, $f0                            \n\t"
    "punpcklwd  $f0, $f28, $f28                           \n\t"
    "mov.d      $f2, $f0                                  \n\t"
    "gssqc1     $f2, $f0, 640-320(%[tmp])                 \n\t"
    "dmtc1      %[iBeta], $f0                             \n\t"
    "gsldxc1    $f10, 0x0($15, $0)                        \n\t"
    "punpcklhw  $f28, $f0, $f0                            \n\t"
    "punpcklwd  $f0, $f28, $f28                           \n\t"
    "punpckhbh  $f30, $f10, $f8                           \n\t"
    "mov.d      $f2, $f0                                  \n\t"

    "punpcklbh  $f28, $f10, $f8                           \n\t"
    "gssqc1     $f2, $f0, 640-512(%[tmp])                 \n\t"
    "gssqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
    "mov.d      $f0, $f4                                  \n\t"
    "gssqc1     $f22, $f20, 704-272(%[tmp])               \n\t"
    "gssqc1     $f6, $f4, 672-272(%[tmp])                 \n\t"
    "mov.d      $f4, $f16                                 \n\t"
    "punpckhbh  $f22, $f20, $f8                           \n\t"
    "punpcklbh  $f20, $f20, $f8                           \n\t"
    "punpckhbh  $f6, $f4, $f8                             \n\t"
    "punpcklbh  $f4, $f4, $f8                             \n\t"

    "psubh      $f28, $f20, $f4                           \n\t"
    "psubh      $f30, $f22, $f6                           \n\t"
    WELS_AbsH($f28, $f30, $f28, $f30, $f2, $f10)
    "gssqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
    "punpckhbh  $f2, $f0, $f8                             \n\t"
    "punpcklbh  $f0, $f0, $f8                             \n\t"
    "gssqc1     $f18, $f16, 688-272(%[tmp])               \n\t"
    "gslqc1     $f18, $f16, 0x0($14)                      \n\t"
    "gssqc1     $f2, $f0, 640-480(%[tmp])                 \n\t"

    "psubh      $f28, $f4, $f0                            \n\t"
    "psubh      $f30, $f6, $f2                            \n\t"

    "gslqc1     $f2, $f0, 640-512(%[tmp])                 \n\t"
    WELS_AbsH($f28, $f30, $f28, $f30, $f18, $f10)
    "punpckhbh  $f18, $f16, $f8                           \n\t"
    "punpcklbh  $f16, $f16, $f8                           \n\t"
    "pcmpgth    $f0, $f0, $f28                            \n\t"
    "pcmpgth    $f2, $f2, $f30                            \n\t"
    "gssqc1     $f18, $f16, 640-384(%[tmp])               \n\t"
    "psubh      $f28, $f20, $f16                          \n\t"
    "psubh      $f30, $f22, $f18                          \n\t"
    "gslqc1     $f18, $f16, 640-512(%[tmp])               \n\t"
    "gssqc1     $f26, $f24, 656-272(%[tmp])               \n\t"
    "punpckhbh  $f26, $f24, $f8                           \n\t"
    "punpcklbh  $f24, $f24, $f8                           \n\t"
    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
    "gssqc1     $f26, $f24, 640-368(%[tmp])               \n\t"
    "gssqc1     $f6, $f4, 640-144(%[tmp])                 \n\t"
    "gssqc1     $f22, $f20, 640-400(%[tmp])               \n\t"
    "pcmpgth    $f16, $f16, $f28                          \n\t"
    "pcmpgth    $f18, $f18, $f30                          \n\t"
    "and        $f0, $f0, $f16                            \n\t"
    "and        $f2, $f2, $f18                            \n\t"
    "gslqc1     $f18, $f16, 640-320(%[tmp])               \n\t"
    "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
    "dli        %[iAlpha], 0x2                            \n\t"
    "dli        %[iBeta], 0x2                             \n\t"
    "pcmpgth    $f16, $f16, $f28                          \n\t"
    "pcmpgth    $f18, $f18, $f30                          \n\t"
    "and        $f0, $f0, $f16                            \n\t"
    "and        $f2, $f2, $f18                            \n\t"
    "dmtc1      %[iAlpha], $f16                           \n\t"
    "dmtc1      %[iBeta], $f10                            \n\t"
    "gssqc1     $f2, $f0, 736-272(%[tmp])                 \n\t"
    "gslqc1     $f2, $f0, 640-320(%[tmp])                 \n\t"

    "punpcklhw  $f28, $f16, $f16                          \n\t"
    "psrah      $f16, $f0, $f10                           \n\t"
    "psrah      $f18, $f2, $f10                           \n\t"
    "punpcklwd  $f28, $f28, $f28                          \n\t"
    "mov.d      $f30, $f28                                \n\t"
    "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
    "paddh      $f16, $f16, $f28                          \n\t"
    "paddh      $f18, $f18, $f30                          \n\t"
    "gssqc1     $f18, $f16, 640-576(%[tmp])               \n\t"
    "pcmpgth    $f16, $f16, $f8                           \n\t"
    "pcmpgth    $f18, $f18, $f10                          \n\t"
    "gssqc1     $f18, $f16, 640-560(%[tmp])               \n\t"

    "gssqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
    "gslqc1     $f18, $f16, 640-512(%[tmp])               \n\t"
    "psubh      $f28, $f4, $f24                           \n\t"
    "psubh      $f30, $f6, $f26                           \n\t"
    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
    "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
    "pcmpgth    $f16, $f16, $f28                          \n\t"
    "pcmpgth    $f18, $f18, $f30                          \n\t"

    "gslqc1     $f2, $f0, 640-416(%[tmp])                 \n\t"
    "and        $f16, $f16, $f8                           \n\t"
    "and        $f18, $f18, $f10                          \n\t"
    "gssqc1     $f18, $f16, 640-544(%[tmp])               \n\t"
    "gslqc1     $f18, $f16, 640-512(%[tmp])               \n\t"
    "psubh      $f28, $f20, $f0                           \n\t"
    "psubh      $f30, $f22, $f2                           \n\t"
    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
    "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
    "pcmpgth    $f16, $f16, $f28                          \n\t"
    "pcmpgth    $f18, $f18, $f30                          \n\t"

    "and        $f16, $f16, $f8                           \n\t"
    "and        $f18, $f18, $f10                          \n\t"
    "gssqc1     $f18, $f16, 640-560(%[tmp])               \n\t"

    "gslqc1     $f18, $f16, 640-544(%[tmp])               \n\t"
    "xor        $f8, $f8, $f8                             \n\t"
    "pandn      $f16, $f16, $f24                          \n\t"
    "dli        %[iAlpha], 0x4                            \n\t"
    "pandn      $f18, $f18, $f26                          \n\t"
    "gssqc1     $f18, $f16, 640-16(%[tmp])                \n\t"
    "dmtc1      %[iAlpha], $f16                           \n\t"
    "punpcklhw  $f28, $f16, $f16                          \n\t"
    "dli        %[iAlpha], 0x1                            \n\t"
    "punpckhbh  $f18, $f12, $f8                           \n\t"
    "dmtc1      %[iAlpha], $f30                           \n\t"
    "punpcklbh  $f16, $f12, $f8                           \n\t"
    "psllh      $f16, $f16, $f30                          \n\t"
    "psllh      $f18, $f18, $f30                          \n\t"
    "paddh      $f16, $f16, $f24                          \n\t"
    "paddh      $f18, $f18, $f26                          \n\t"
    "gslqc1     $f2, $f0, 640-480(%[tmp])                 \n\t"
    "paddh      $f16, $f16, $f24                          \n\t"
    "paddh      $f18, $f18, $f26                          \n\t"
    "paddh      $f16, $f16, $f24                          \n\t"
    "paddh      $f18, $f18, $f26                          \n\t"
    "paddh      $f16, $f16, $f0                           \n\t"
    "paddh      $f18, $f18, $f2                           \n\t"

    "gslqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
    "punpcklwd  $f28, $f28, $f28                          \n\t"
    "mov.d      $f30, $f28                                \n\t"
    "paddh      $f16, $f16, $f4                           \n\t"
    "paddh      $f18, $f18, $f6                           \n\t"
    "gssqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
    "paddh      $f16, $f16, $f20                          \n\t"
    "paddh      $f18, $f18, $f22                          \n\t"
    "paddh      $f16, $f16, $f28                          \n\t"
    "paddh      $f18, $f18, $f30                          \n\t"
    "gslqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
    "gslqc1     $f2, $f0, 640-384(%[tmp])                 \n\t"
    "pandn      $f24, $f24, $f28                          \n\t"
    "pandn      $f26, $f26, $f30                          \n\t"
    "gssqc1     $f26, $f24, 640-80(%[tmp])                \n\t"
    "gslqc1     $f26, $f24, 0x0($12)                      \n\t"
    "dmtc1      %[iAlpha], $f10                           \n\t"
    "punpckhbh  $f26, $f24, $f8                           \n\t"
    "punpcklbh  $f24, $f24, $f8                           \n\t"
    "psllh      $f24, $f24, $f10                          \n\t"
    "psllh      $f26, $f26, $f10                          \n\t"
    "paddh      $f24, $f24, $f28                          \n\t"
    "paddh      $f26, $f26, $f30                          \n\t"
    "paddh      $f24, $f24, $f28                          \n\t"
    "paddh      $f26, $f26, $f30                          \n\t"
    "paddh      $f24, $f24, $f28                          \n\t"
    "paddh      $f26, $f26, $f30                          \n\t"
    "paddh      $f24, $f24, $f0                           \n\t"
    "paddh      $f26, $f26, $f2                           \n\t"

    "dli        %[iAlpha], 0x3                            \n\t"
    "gslqc1     $f30, $f28, 640-480(%[tmp])               \n\t"
    "gslqc1     $f2, $f0, 640-592(%[tmp])                 \n\t"
    "paddh      $f24, $f24, $f20                          \n\t"
    "paddh      $f26, $f26, $f22                          \n\t"
    "paddh      $f24, $f24, $f4                           \n\t"
    "paddh      $f26, $f26, $f6                           \n\t"
    "paddh      $f24, $f24, $f0                           \n\t"
    "paddh      $f26, $f26, $f2                           \n\t"
    "gslqc1     $f2, $f0, 640-560(%[tmp])                 \n\t"
    "dmtc1      %[iAlpha], $f10                           \n\t"
    "psrah      $f24, $f24, $f10                          \n\t"
    "psrah      $f26, $f26, $f10                          \n\t"
    "and        $f24, $f24, $f0                           \n\t"
    "and        $f26, $f26, $f2                           \n\t"
    "gssqc1     $f26, $f24, 640-112(%[tmp])               \n\t"
    "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
    "pandn      $f24, $f24, $f28                          \n\t"
    "pandn      $f26, $f26, $f30                          \n\t"
    "gssqc1     $f26, $f24, 640-336(%[tmp])               \n\t"
    "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
    "gssqc1     $f26, $f24, 640-528(%[tmp])               \n\t"
    "gslqc1     $f26, $f24, 640-368(%[tmp])               \n\t"
    "gslqc1     $f2, $f0, 640-544(%[tmp])                 \n\t"
    "dmtc1      %[iAlpha], $f10                           \n\t"
    "paddh      $f24, $f24, $f28                          \n\t"
    "paddh      $f26, $f26, $f30                          \n\t"
    "psrah      $f16, $f16, $f10                          \n\t"
    "psrah      $f18, $f18, $f10                          \n\t"
    "and        $f16, $f16, $f0                           \n\t"
    "and        $f18, $f18, $f2                           \n\t"
    "gslqc1     $f2, $f0, 640-624(%[tmp])                 \n\t"
    "paddh      $f28, $f4, $f20                           \n\t"
    "paddh      $f30, $f6, $f22                           \n\t"
    "paddh      $f24, $f24, $f28                          \n\t"
    "paddh      $f26, $f26, $f30                          \n\t"
    "paddh      $f24, $f24, $f0                           \n\t"
    "paddh      $f26, $f26, $f2                           \n\t"
    "gslqc1     $f30, $f28, 640-528(%[tmp])               \n\t"
    "dli        %[iAlpha], 0x2                            \n\t"

    "dmtc1      %[iAlpha], $f10                           \n\t"
    "paddh      $f20, $f20, $f4                           \n\t"
    "paddh      $f22, $f22, $f6                           \n\t"
    "psrah      $f24, $f24, $f10                          \n\t"
    "psrah      $f26, $f26, $f10                          \n\t"
    "and        $f28, $f28, $f24                          \n\t"
    "and        $f30, $f30, $f26                          \n\t"

    "gslqc1     $f26, $f24, 640-384(%[tmp])               \n\t"
    "gssqc1     $f30, $f28, 640-64(%[tmp])                \n\t"
    "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
    "pandn      $f28, $f28, $f24                          \n\t"
    "pandn      $f30, $f30, $f26                          \n\t"
    "gssqc1     $f30, $f28, 640-304(%[tmp])               \n\t"
    "gslqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
    "gslqc1     $f10, $f8, 640-624(%[tmp])                \n\t"
    "paddh      $f28, $f28, $f24                          \n\t"
    "paddh      $f30, $f30, $f26                          \n\t"
    "paddh      $f28, $f28, $f20                          \n\t"
    "paddh      $f30, $f30, $f22                          \n\t"
    "paddh      $f28, $f28, $f8                           \n\t"
    "paddh      $f30, $f30, $f10                          \n\t"
    "dmtc1      %[iAlpha], $f10                           \n\t"
    "gslqc1     $f22, $f20, 640-560(%[tmp])               \n\t"
    "psrah      $f28, $f28, $f10                          \n\t"
    "psrah      $f30, $f30, $f10                          \n\t"
    "and        $f20, $f20, $f28                          \n\t"
    "and        $f22, $f22, $f30                          \n\t"
    "gssqc1     $f22, $f20, 640-32(%[tmp])                \n\t"

    "gslqc1     $f22, $f20, 640-480(%[tmp])               \n\t"
    "gslqc1     $f2, $f0, 640-592(%[tmp])                 \n\t"
    "gslqc1     $f10, $f8, 640-624(%[tmp])                \n\t"
    "paddh      $f28, $f20, $f20                          \n\t"
    "paddh      $f30, $f22, $f22                          \n\t"
    "paddh      $f20, $f4, $f24                           \n\t"
    "paddh      $f22, $f6, $f26                           \n\t"
    "paddh      $f24, $f24, $f0                           \n\t"
    "paddh      $f26, $f26, $f2                           \n\t"
    "paddh      $f28, $f28, $f20                          \n\t"
    "paddh      $f30, $f30, $f22                          \n\t"
    "paddh      $f28, $f28, $f8                           \n\t"
    "paddh      $f30, $f30, $f10                          \n\t"
    "dmtc1      %[iAlpha], $f10                           \n\t"
    "gslqc1     $f22, $f20, 640-544(%[tmp])               \n\t"
    "psrah      $f28, $f28, $f10                          \n\t"
    "psrah      $f30, $f30, $f10                          \n\t"
    "dli        %[iAlpha], 0x1                            \n\t"
    "pandn      $f20, $f20, $f28                          \n\t"
    "pandn      $f22, $f22, $f30                          \n\t"
    "gslqc1     $f30, $f28, 640-480(%[tmp])               \n\t"
    "paddh      $f28, $f28, $f4                           \n\t"
    "paddh      $f30, $f30, $f6                           \n\t"
    "gslqc1     $f6, $f4, 640-400(%[tmp])                 \n\t"
    "paddh      $f28, $f28, $f4                           \n\t"
    "paddh      $f30, $f30, $f6                           \n\t"
    "gslqc1     $f6, $f4, 640-544(%[tmp])                 \n\t"
    "dmtc1      %[iAlpha], $f10                           \n\t"
    "gssqc1     $f22, $f20, 640-352(%[tmp])               \n\t"
    "gslqc1     $f22, $f20, 640-368(%[tmp])               \n\t"
    "psllh      $f28, $f28, $f10                          \n\t"
    "psllh      $f30, $f30, $f10                          \n\t"
    "dli        %[iAlpha], 0x3                            \n\t"
    "paddh      $f28, $f28, $f24                          \n\t"
    "paddh      $f30, $f30, $f26                          \n\t"
    "paddh      $f20, $f20, $f28                          \n\t"
    "paddh      $f22, $f22, $f30                          \n\t"
    "dmtc1      %[iAlpha], $f10                           \n\t"

    "dli        %[iAlpha], 0x2                            \n\t"
    "gslqc1     $f30, $f28, 640-400(%[tmp])               \n\t"
    "psrah      $f20, $f20, $f10                          \n\t"
    "psrah      $f22, $f22, $f10                          \n\t"
    "and        $f4, $f4, $f20                            \n\t"
    "and        $f6, $f6, $f22                            \n\t"
    "gslqc1     $f22, $f20, 640-480(%[tmp])               \n\t"
    "gssqc1     $f6, $f4, 640-96(%[tmp])                  \n\t"
    "gslqc1     $f6, $f4, 640-384(%[tmp])                 \n\t"
    "gslqc1     $f10, $f8, 640-400(%[tmp])                \n\t"
    "paddh      $f24, $f4, $f4                            \n\t"
    "paddh      $f26, $f6, $f6                            \n\t"
    "paddh      $f4, $f4, $f8                             \n\t"
    "paddh      $f6, $f6, $f10                            \n\t"
    "gslqc1     $f10, $f8, 640-144(%[tmp])                \n\t"
    "paddh      $f28, $f28, $f20                          \n\t"
    "paddh      $f30, $f30, $f22                          \n\t"
    "paddh      $f4, $f4, $f8                             \n\t"
    "paddh      $f6, $f6, $f10                            \n\t"
    "gslqc1     $f10, $f8, 640-592(%[tmp])                \n\t"
    "paddh      $f24, $f24, $f28                          \n\t"
    "paddh      $f26, $f26, $f30                          \n\t"
    "paddh      $f20, $f20, $f8                           \n\t"
    "paddh      $f22, $f22, $f10                          \n\t"
    "gslqc1     $f10, $f8, 640-624(%[tmp])                \n\t"
    "paddh      $f24, $f24, $f8                           \n\t"
    "dmtc1      %[iAlpha], $f8                            \n\t"
    "paddh      $f26, $f26, $f10                          \n\t"
    "dli        %[iAlpha], 0x1                            \n\t"
    "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
    "dmtc1      %[iAlpha], $f10                           \n\t"
    "psrah      $f24, $f24, $f8                           \n\t"
    "psrah      $f26, $f26, $f8                           \n\t"
    "psllh      $f4, $f4, $f10                            \n\t"
    "psllh      $f6, $f6, $f10                            \n\t"
    "paddh      $f4, $f4, $f20                            \n\t"
    "paddh      $f6, $f6, $f22                            \n\t"
    "dli        %[iAlpha], 0x3                            \n\t"

    "gslqc1     $f22, $f20, 656-272(%[tmp])               \n\t"
    "pandn      $f28, $f28, $f24                          \n\t"
    "pandn      $f30, $f30, $f26                          \n\t"
    "gslqc1     $f26, $f24, 640-416(%[tmp])               \n\t"
    "dmtc1      %[iAlpha], $f10                           \n\t"
    "paddh      $f24, $f24, $f4                           \n\t"
    "paddh      $f26, $f26, $f6                           \n\t"
    "gslqc1     $f6, $f4, 640-560(%[tmp])                 \n\t"
    "psrah      $f24, $f24, $f10                          \n\t"
    "psrah      $f26, $f26, $f10                          \n\t"
    "and        $f4, $f4, $f24                            \n\t"
    "and        $f6, $f6, $f26                            \n\t"

    "xor        $f8, $f8, $f8                             \n\t"
    "gslqc1     $f26, $f24, 704-272(%[tmp])               \n\t"
    "gssqc1     $f6, $f4, 640-128(%[tmp])                 \n\t"
    "gslqc1     $f6, $f4, 672-272(%[tmp])                 \n\t"
    "punpcklbh  $f4, $f6, $f8                             \n\t"
    "punpckhbh  $f6, $f6, $f8                             \n\t"
    "gssqc1     $f6, $f4, 640-448(%[tmp])                 \n\t"
    "gslqc1     $f6, $f4, 688-272(%[tmp])                 \n\t"
    "punpcklbh  $f4, $f6, $f8                             \n\t"
    "punpckhbh  $f6, $f6, $f8                             \n\t"
    "punpcklbh  $f24, $f26, $f8                           \n\t"
    "punpckhbh  $f26, $f26, $f8                           \n\t"
    "gssqc1     $f30, $f28, 640-288(%[tmp])               \n\t"
    "punpcklbh  $f20, $f22, $f8                           \n\t"
    "punpckhbh  $f22, $f22, $f8                           \n\t"
    "gslqc1     $f30, $f28, 0x0($14)                      \n\t"
    "gssqc1     $f6, $f4, 640-496(%[tmp])                 \n\t"
    "gssqc1     $f26, $f24, 640-432(%[tmp])               \n\t"

    "gsldxc1    $f0, 0x8($15, $0)                         \n\t"
    "punpcklbh  $f28, $f30, $f8                           \n\t"
    "punpckhbh  $f30, $f30, $f8                           \n\t"
    "gssqc1     $f30, $f28, 640-464(%[tmp])               \n\t"

    "punpcklbh  $f28, $f0, $f8                            \n\t"
    "punpckhbh  $f30, $f0, $f8                            \n\t"
    "gslqc1     $f10, $f8, 640-464(%[tmp])                \n\t"
    "gssqc1     $f30, $f28, 640-528(%[tmp])               \n\t"

    "psubh      $f28, $f24, $f4                           \n\t"
    "psubh      $f30, $f26, $f6                           \n\t"
    "psubh      $f24, $f24, $f8                           \n\t"
    "psubh      $f26, $f26, $f10                          \n\t"
    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
    "gslqc1     $f10, $f8, 640-16(%[tmp])                 \n\t"
    "gssqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
    "or         $f16, $f16, $f8                           \n\t"
    "or         $f18, $f18, $f10                          \n\t"
    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
    "gslqc1     $f30, $f28, 640-448(%[tmp])               \n\t"
    "psubh      $f28, $f4, $f28                           \n\t"
    "psubh      $f30, $f6, $f30                           \n\t"

    "gslqc1     $f2, $f0, 640-512(%[tmp])                 \n\t"
    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
    "pcmpgth    $f4, $f0, $f28                            \n\t"
    "pcmpgth    $f6, $f2, $f30                            \n\t"
    "pcmpgth    $f28, $f0, $f24                           \n\t"
    "pcmpgth    $f30, $f2, $f26                           \n\t"
    "gslqc1     $f26, $f24, 640-320(%[tmp])               \n\t"
    "and        $f4, $f4, $f28                            \n\t"
    "and        $f6, $f6, $f30                            \n\t"
    "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
    "pcmpgth    $f24, $f24, $f28                          \n\t"
    "pcmpgth    $f26, $f26, $f30                          \n\t"
    "and        $f4, $f4, $f24                            \n\t"
    "and        $f6, $f6, $f26                            \n\t"

    "gslqc1     $f26, $f24, 640-576(%[tmp])               \n\t"
    "pcmpgth    $f24, $f24, $f28                          \n\t"
    "pcmpgth    $f26, $f26, $f30                          \n\t"
    "xor        $f8, $f8, $f8                             \n\t"
    "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
    "punpcklbh  $f12, $f14, $f8                           \n\t"
    "punpckhbh  $f14, $f14, $f8                           \n\t"
    "gssqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
    "gslqc1     $f26, $f24, 640-512(%[tmp])               \n\t"
    "psubh      $f28, $f28, $f20                          \n\t"
    "psubh      $f30, $f30, $f22                          \n\t"
    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
    "pcmpgth    $f24, $f24, $f28                          \n\t"
    "pcmpgth    $f26, $f26, $f30                          \n\t"

    "dli        %[iAlpha], 0x1                            \n\t"
    "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
    "and        $f24, $f24, $f8                           \n\t"
    "and        $f26, $f26, $f10                          \n\t"
    "gslqc1     $f30, $f28, 640-432(%[tmp])               \n\t"
    "gslqc1     $f10, $f8, 640-528(%[tmp])                \n\t"
    "psubh      $f28, $f28, $f8                           \n\t"
    "psubh      $f30, $f30, $f10                          \n\t"
    "dmtc1      %[iAlpha], $f10                           \n\t"

    "psllh      $f12, $f12, $f10                          \n\t"
    "psllh      $f14, $f14, $f10                          \n\t"
    "gssqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
    "gslqc1     $f26, $f24, 640-512(%[tmp])               \n\t"

    "gslqc1     $f10, $f8, 640-448(%[tmp])                \n\t"
    "paddh      $f12, $f12, $f20                          \n\t"
    "paddh      $f14, $f14, $f22                          \n\t"
    "paddh      $f12, $f12, $f20                          \n\t"
    "paddh      $f14, $f14, $f22                          \n\t"
    "paddh      $f12, $f12, $f20                          \n\t"
    "paddh      $f14, $f14, $f22                          \n\t"
    "paddh      $f12, $f12, $f8                           \n\t"
    "paddh      $f14, $f14, $f10                          \n\t"
    "gslqc1     $f10, $f8, 640-496(%[tmp])                \n\t"
    "gslqc1     $f2, $f0, 640-560(%[tmp])                 \n\t"
    "paddh      $f12, $f12, $f8                           \n\t"
    "paddh      $f14, $f14, $f10                          \n\t"
    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
    "pcmpgth    $f24, $f24, $f28                          \n\t"
    "pcmpgth    $f26, $f26, $f30                          \n\t"
    "and        $f24, $f24, $f0                           \n\t"
    "and        $f26, $f26, $f2                           \n\t"
    "gssqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
    "gslqc1     $f10, $f8, 640-544(%[tmp])                \n\t"

    "gslqc1     $f2, $f0, 736-272(%[tmp])                 \n\t"
    "dli        %[iAlpha], 0x3                            \n\t"
    "gslqc1     $f30, $f28, 640-368(%[tmp])               \n\t"
    "and        $f24, $f0, $f16                           \n\t"
    "and        $f26, $f2, $f18                           \n\t"
    "pandn      $f16, $f0, $f28                           \n\t"
    "pandn      $f18, $f2, $f30                           \n\t"
    "or         $f24, $f24, $f16                          \n\t"
    "or         $f26, $f26, $f18                          \n\t"
    "gslqc1     $f18, $f16, 640-432(%[tmp])               \n\t"
    "paddh      $f12, $f12, $f16                          \n\t"
    "paddh      $f14, $f14, $f18                          \n\t"
    "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
    "paddh      $f12, $f12, $f28                          \n\t"
    "paddh      $f14, $f14, $f30                          \n\t"
    "dmtc1      %[iAlpha], $f28                           \n\t"
    "psrah      $f12, $f12, $f28                          \n\t"
    "psrah      $f14, $f14, $f28                          \n\t"
    "and        $f12, $f12, $f8                           \n\t"
    "and        $f14, $f14, $f10                          \n\t"
    "pandn      $f8, $f8, $f20                            \n\t"
    "pandn      $f10, $f10, $f22                          \n\t"
    "or         $f12, $f12, $f8                           \n\t"
    "or         $f14, $f14, $f10                          \n\t"
    "and        $f28, $f4, $f12                           \n\t"
    "and        $f30, $f6, $f14                           \n\t"
    "gslqc1     $f14, $f12, 640-64(%[tmp])                \n\t"
    "gslqc1     $f10, $f8, 640-336(%[tmp])                \n\t"
    "or         $f12, $f12, $f8                           \n\t"
    "or         $f14, $f14, $f10                          \n\t"
    "pandn      $f8, $f4, $f20                            \n\t"
    "pandn      $f10, $f6, $f22                           \n\t"
    "or         $f28, $f28, $f8                           \n\t"
    "or         $f30, $f30, $f10                          \n\t"

    "dli        %[iAlpha], 0x2                            \n\t"
    "and        $f8, $f0, $f12                            \n\t"
    "and        $f10, $f2, $f14                           \n\t"
    "gslqc1     $f14, $f12, 640-480(%[tmp])               \n\t"
    "pandn      $f12, $f0, $f12                           \n\t"
    "pandn      $f14, $f2, $f14                           \n\t"
    "or         $f8, $f8, $f12                            \n\t"
    "or         $f10, $f10, $f14                          \n\t"
    "packushb   $f24, $f24, $f26                          \n\t"
    "packushb   $f26, $f28, $f30                          \n\t"
    "gssqc1     $f10, $f8, 640-336(%[tmp])                \n\t"
    "gssqc1     $f26, $f24, 656-272(%[tmp])               \n\t"
    "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
    "gslqc1     $f10, $f8, 640-448(%[tmp])                \n\t"
    "paddh      $f8, $f20, $f8                            \n\t"
    "paddh      $f10, $f22, $f10                          \n\t"
    "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
    "paddh      $f28, $f28, $f16                          \n\t"
    "paddh      $f30, $f30, $f18                          \n\t"
    "paddh      $f8, $f8, $f28                            \n\t"
    "paddh      $f10, $f10, $f30                          \n\t"
    "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
    "paddh      $f8, $f8, $f28                            \n\t"
    "paddh      $f10, $f10, $f30                          \n\t"
    "dmtc1      %[iAlpha], $f28                           \n\t"
    "psrah      $f8, $f8, $f28                            \n\t"
    "psrah      $f10, $f10, $f28                          \n\t"
    "dli        %[iAlpha], 0x1                            \n\t"
    "gslqc1     $f30, $f28, 640-544(%[tmp])               \n\t"
    "and        $f24, $f24, $f8                           \n\t"
    "and        $f26, $f26, $f10                          \n\t"
    "gslqc1     $f10, $f8, 640-448(%[tmp])                \n\t"
    "pandn      $f28, $f28, $f8                           \n\t"
    "pandn      $f30, $f30, $f10                          \n\t"
    "or         $f24, $f24, $f28                          \n\t"
    "or         $f26, $f26, $f30                          \n\t"
    "and        $f12, $f4, $f24                           \n\t"
    "and        $f14, $f6, $f26                           \n\t"
    "pandn      $f24, $f4, $f8                            \n\t"
    "pandn      $f26, $f6, $f10                           \n\t"
    "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
    "paddh      $f8, $f8, $f28                            \n\t"
    "paddh      $f10, $f10, $f30                          \n\t"
    "paddh      $f8, $f8, $f16                            \n\t"
    "paddh      $f10, $f10, $f18                          \n\t"
    "or         $f12, $f12, $f24                          \n\t"
    "or         $f14, $f14, $f26                          \n\t"
    "gslqc1     $f26, $f24, 640-336(%[tmp])               \n\t"
    "dmtc1      %[iAlpha], $f28                           \n\t"
    "packushb   $f24, $f24, $f26                          \n\t"
    "packushb   $f26, $f12, $f14                          \n\t"
    "psllh      $f8, $f8, $f28                            \n\t"
    "psllh      $f10, $f10, $f28                          \n\t"
    "gssqc1     $f26, $f24, 672-272(%[tmp])               \n\t"
    "gslqc1     $f26, $f24, 640-96(%[tmp])                \n\t"
    "gslqc1     $f30, $f28, 640-352(%[tmp])               \n\t"
    "or         $f24, $f24, $f28                          \n\t"
    "or         $f26, $f26, $f30                          \n\t"
    "dli        %[iAlpha], 0x3                            \n\t"

    "and        $f12, $f0, $f24                           \n\t"
    "and        $f14, $f2, $f26                           \n\t"
    "gslqc1     $f26, $f24, 640-144(%[tmp])               \n\t"
    "pandn      $f24, $f0, $f24                           \n\t"
    "pandn      $f26, $f2, $f26                           \n\t"
    "or         $f12, $f12, $f24                          \n\t"
    "or         $f14, $f14, $f26                          \n\t"
    "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
    "gssqc1     $f14, $f12, 640-352(%[tmp])               \n\t"
    "gslqc1     $f14, $f12, 640-464(%[tmp])               \n\t"
    "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
    "paddh      $f12, $f12, $f28                          \n\t"
    "paddh      $f14, $f14, $f30                          \n\t"
    "paddh      $f8, $f8, $f12                            \n\t"
    "paddh      $f10, $f10, $f14                          \n\t"
    "gslqc1     $f14, $f12, 640-448(%[tmp])               \n\t"
    "paddh      $f20, $f20, $f8                           \n\t"
    "paddh      $f22, $f22, $f10                          \n\t"
    "dmtc1      %[iAlpha], $f28                           \n\t"
    "gslqc1     $f10, $f8, 640-496(%[tmp])                \n\t"
    "psrah      $f20, $f20, $f28                          \n\t"
    "psrah      $f22, $f22, $f28                          \n\t"
    "and        $f24, $f24, $f20                          \n\t"
    "and        $f26, $f26, $f22                          \n\t"
    "gslqc1     $f22, $f20, 640-464(%[tmp])               \n\t"
    "paddh      $f8, $f8, $f20                            \n\t"
    "paddh      $f10, $f10, $f22                          \n\t"
    "gslqc1     $f30, $f28, 640-432(%[tmp])               \n\t"
    "dli        %[iAlpha], 0x2                            \n\t"
    "paddh      $f20, $f20, $f28                          \n\t"
    "paddh      $f22, $f22, $f30                          \n\t"
    "paddh      $f16, $f12, $f12                          \n\t"
    "paddh      $f18, $f14, $f14                          \n\t"
    "paddh      $f16, $f16, $f8                           \n\t"
    "paddh      $f18, $f18, $f10                          \n\t"
    "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
    "paddh      $f16, $f16, $f28                          \n\t"
    "paddh      $f18, $f18, $f30                          \n\t"
    "gslqc1     $f10, $f8, 640-544(%[tmp])                \n\t"
    "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
    "paddh      $f12, $f12, $f28                          \n\t"
    "paddh      $f14, $f14, $f30                          \n\t"
    "dmtc1      %[iAlpha], $f28                           \n\t"
    "psrah      $f16, $f16, $f28                          \n\t"
    "psrah      $f18, $f18, $f28                          \n\t"
    "pandn      $f8, $f8, $f16                            \n\t"
    "pandn      $f10, $f10, $f18                          \n\t"
    "or         $f24, $f24, $f8                           \n\t"
    "or         $f26, $f26, $f10                          \n\t"
    "and        $f28, $f4, $f24                           \n\t"
    "and        $f30, $f6, $f26                           \n\t"
    "gslqc1     $f26, $f24, 640-496(%[tmp])               \n\t"
    "pandn      $f8, $f4, $f24                            \n\t"
    "pandn      $f10, $f6, $f26                           \n\t"
    "or         $f28, $f28, $f8                           \n\t"
    "or         $f30, $f30, $f10                          \n\t"
    "gslqc1     $f10, $f8, 640-352(%[tmp])                \n\t"
    "packushb   $f8, $f8, $f10                            \n\t"
    "packushb   $f10, $f28, $f30                          \n\t"
    "gssqc1     $f10, $f8, 688-272(%[tmp])                \n\t"
    "gslqc1     $f10, $f8, 640-128(%[tmp])                \n\t"
    "gslqc1     $f30, $f28, 640-288(%[tmp])               \n\t"
    "or         $f8, $f8, $f28                            \n\t"
    "or         $f10, $f10, $f30                          \n\t"
    "dli        %[iAlpha], 0x1                            \n\t"

    "and        $f16, $f0, $f8                            \n\t"
    "and        $f18, $f2, $f10                           \n\t"
    "paddh      $f20, $f20, $f24                          \n\t"
    "paddh      $f22, $f22, $f26                          \n\t"
    "gslqc1     $f30, $f28, 640-400(%[tmp])               \n\t"
    "pandn      $f8, $f0, $f28                            \n\t"
    "pandn      $f10, $f2, $f30                           \n\t"
    "or         $f16, $f16, $f8                           \n\t"
    "or         $f18, $f18, $f10                          \n\t"
    "dmtc1      %[iAlpha], $f28                           \n\t"
    "gslqc1     $f10, $f8, 640-528(%[tmp])                \n\t"
    "dli        %[iAlpha], 0x3                            \n\t"
    "psllh      $f20, $f20, $f28                          \n\t"
    "psllh      $f22, $f22, $f28                          \n\t"
    "paddh      $f20, $f20, $f12                          \n\t"
    "paddh      $f22, $f22, $f14                          \n\t"
    "dmtc1      %[iAlpha], $f28                           \n\t"
    "gslqc1     $f14, $f12, 640-560(%[tmp])               \n\t"
    "paddh      $f8, $f8, $f20                            \n\t"
    "paddh      $f10, $f10, $f22                          \n\t"
    "psrah      $f8, $f8, $f28                            \n\t"
    "psrah      $f10, $f10, $f28                          \n\t"
    "gssqc1     $f18, $f16, 640-288(%[tmp])               \n\t"
    "gslqc1     $f18, $f16, 640-560(%[tmp])               \n\t"
    "and        $f16, $f16, $f8                           \n\t"
    "and        $f18, $f18, $f10                          \n\t"
    "gslqc1     $f10, $f8, 640-464(%[tmp])                \n\t"
    "paddh      $f20, $f8, $f8                            \n\t"
    "paddh      $f22, $f10, $f10                          \n\t"
    "gslqc1     $f10, $f8, 640-432(%[tmp])                \n\t"
    "gslqc1     $f30, $f28, 640-448(%[tmp])               \n\t"
    "paddh      $f8, $f8, $f28                            \n\t"
    "paddh      $f10, $f10, $f30                          \n\t"
    "dli        %[iAlpha], 0x2                            \n\t"
    "paddh      $f20, $f20, $f8                           \n\t"
    "paddh      $f22, $f22, $f10                          \n\t"
    "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
    "paddh      $f20, $f20, $f28                          \n\t"
    "paddh      $f22, $f22, $f30                          \n\t"
    "dmtc1      %[iAlpha], $f28                           \n\t"
    "gslqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
    "psrah      $f20, $f20, $f28                          \n\t"
    "psrah      $f22, $f22, $f28                          \n\t"
    "pandn      $f12, $f12, $f20                          \n\t"
    "pandn      $f14, $f14, $f22                          \n\t"
    "or         $f16, $f16, $f12                          \n\t"
    "or         $f18, $f18, $f14                          \n\t"
    "gslqc1     $f14, $f12, 640-32(%[tmp])                \n\t"
    "gslqc1     $f30, $f28, 640-304(%[tmp])               \n\t"
    "or         $f12, $f12, $f28                          \n\t"
    "or         $f14, $f14, $f30                          \n\t"
    "and        $f28, $f4, $f16                           \n\t"
    "and        $f30, $f6, $f18                           \n\t"
    "gslqc1     $f18, $f16, 640-432(%[tmp])               \n\t"
    "gslqc1     $f22, $f20, 640-464(%[tmp])               \n\t"
    "pandn      $f8, $f4, $f16                            \n\t"
    "pandn      $f10, $f6, $f18                           \n\t"
    "or         $f28, $f28, $f8                           \n\t"
    "or         $f30, $f30, $f10                          \n\t"
    "gslqc1     $f10, $f8, 640-496(%[tmp])                \n\t"
    "paddh      $f16, $f16, $f8                           \n\t"
    "paddh      $f18, $f18, $f10                          \n\t"
    "gslqc1     $f10, $f8, 640-288(%[tmp])                \n\t"
    "packushb   $f8, $f8, $f10                            \n\t"
    "packushb   $f10, $f28, $f30                          \n\t"
    "dli        %[iAlpha], 0x2                            \n\t"
    "gssqc1     $f10, $f8, 704-272(%[tmp])                \n\t"

    "and        $f8, $f0, $f12                            \n\t"
    "and        $f10, $f2, $f14                           \n\t"
    "gslqc1     $f30, $f28, 640-384(%[tmp])               \n\t"
    "pandn      $f12, $f0, $f28                           \n\t"
    "pandn      $f14, $f2, $f30                           \n\t"
    "or         $f8, $f8, $f12                            \n\t"
    "or         $f10, $f10, $f14                          \n\t"
    "gssqc1     $f10, $f8, 640-304(%[tmp])                \n\t"
    "gslqc1     $f10, $f8, 640-528(%[tmp])                \n\t"
    "gslqc1     $f30, $f28, 640-464(%[tmp])               \n\t"
    "paddh      $f12, $f8, $f28                           \n\t"
    "paddh      $f14, $f10, $f30                          \n\t"
    "paddh      $f12, $f12, $f16                          \n\t"
    "paddh      $f14, $f14, $f18                          \n\t"
    "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
    "paddh      $f12, $f12, $f28                          \n\t"
    "paddh      $f14, $f14, $f30                          \n\t"
    "dmtc1      %[iAlpha], $f28                           \n\t"
    "psrah      $f12, $f12, $f28                          \n\t"
    "psrah      $f14, $f14, $f28                          \n\t"
    "and        $f24, $f24, $f12                          \n\t"
    "and        $f26, $f26, $f14                          \n\t"
    "gslqc1     $f14, $f12, 640-560(%[tmp])               \n\t"
    "pandn      $f16, $f12, $f20                          \n\t"
    "pandn      $f18, $f14, $f22                          \n\t"
    "or         $f24, $f24, $f16                          \n\t"
    "or         $f26, $f26, $f18                          \n\t"
    "and        $f28, $f4, $f24                           \n\t"
    "and        $f30, $f6, $f26                           \n\t"
    "gslqc1     $f26, $f24, 640-304(%[tmp])               \n\t"
    "pandn      $f16, $f4, $f20                           \n\t"
    "pandn      $f18, $f6, $f22                           \n\t"
    "or         $f28, $f28, $f16                          \n\t"
    "or         $f30, $f30, $f18                          \n\t"
    "dli        %[iAlpha], 0x1                            \n\t"

    "packushb   $f24, $f24, $f26                          \n\t"
    "packushb   $f26, $f28, $f30                          \n\t"
    "gslqc1     $f30, $f28, 640-112(%[tmp])               \n\t"
    "gslqc1     $f18, $f16, 640-80(%[tmp])                \n\t"
    "or         $f28, $f28, $f16                          \n\t"
    "or         $f30, $f30, $f18                          \n\t"
    "and        $f16, $f0, $f28                           \n\t"
    "and        $f18, $f2, $f30                           \n\t"
    "gslqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
    "pandn      $f0, $f0, $f28                            \n\t"
    "pandn      $f2, $f2, $f30                            \n\t"
    "or         $f16, $f16, $f0                           \n\t"
    "or         $f18, $f18, $f2                           \n\t"
    "xor        $f28, $f28, $f28                          \n\t"
    "xor        $f30, $f30, $f30                          \n\t"
    "gslqc1     $f2, $f0, 0x0($12)                        \n\t"
    "dmtc1      %[iAlpha], $f28                           \n\t"
    "punpcklbh  $f0, $f2, $f30                            \n\t"
    "punpckhbh  $f2, $f2, $f30                            \n\t"
    "psllh      $f0, $f0, $f28                            \n\t"
    "psllh      $f2, $f2, $f28                            \n\t"
    "paddh      $f0, $f0, $f8                             \n\t"
    "paddh      $f2, $f2, $f10                            \n\t"
    "paddh      $f0, $f0, $f8                             \n\t"
    "paddh      $f2, $f2, $f10                            \n\t"
    "paddh      $f0, $f0, $f8                             \n\t"
    "paddh      $f2, $f2, $f10                            \n\t"
    "paddh      $f0, $f0, $f20                            \n\t"
    "paddh      $f2, $f2, $f22                            \n\t"
    "dli        %[iAlpha], 0x3                            \n\t"
    "gslqc1     $f30, $f28, 640-432(%[tmp])               \n\t"
    "paddh      $f0, $f0, $f28                            \n\t"
    "paddh      $f2, $f2, $f30                            \n\t"
    "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
    "paddh      $f0, $f0, $f28                            \n\t"
    "paddh      $f2, $f2, $f30                            \n\t"
    "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
    "paddh      $f0, $f0, $f28                            \n\t"
    "paddh      $f2, $f2, $f30                            \n\t"
    "dmtc1      %[iAlpha], $f28                           \n\t"
    "psrah      $f0, $f0, $f28                            \n\t"
    "psrah      $f2, $f2, $f28                            \n\t"
    "and        $f0, $f0, $f12                            \n\t"
    "and        $f2, $f2, $f14                            \n\t"
    "pandn      $f12, $f12, $f8                           \n\t"
    "pandn      $f14, $f14, $f10                          \n\t"
    "or         $f0, $f0, $f12                            \n\t"
    "or         $f2, $f2, $f14                            \n\t"
    "and        $f28, $f4, $f0                            \n\t"
    "and        $f30, $f6, $f2                            \n\t"

    "gslqc1     $f2, $f0, 656-272(%[tmp])                 \n\t"
    "gssqc1     $f2, $f0, 0x0($11)                        \n\t"

    "gslqc1     $f2, $f0, 672-272(%[tmp])                 \n\t"

    "gssqc1     $f2, $f0, 0x0($8)                         \n\t"
    "gslqc1     $f2, $f0, 688-272(%[tmp])                 \n\t"
    "gssqc1     $f2, $f0, 0x0($9)                         \n\t"
    "gslqc1     $f2, $f0, 704-272(%[tmp])                 \n\t"

    "pandn      $f4, $f4, $f8                             \n\t"
    "pandn      $f6, $f6, $f10                            \n\t"
    "gssqc1     $f2, $f0, 0x0(%[pPix])                    \n\t"
    "or         $f28, $f28, $f4                           \n\t"
    "or         $f30, $f30, $f6                           \n\t"
    "packushb   $f16, $f16, $f18                          \n\t"
    "packushb   $f18, $f28, $f30                          \n\t"
    "gssqc1     $f26, $f24, 0x0($13)                      \n\t"
    "gssqc1     $f18, $f16, 0x0(%[iStride])               \n\t"
    : [pPix]"+&r"((unsigned char *)pPix)
    : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
      [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0",
      "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
      "$f22", "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}

/*
 * Chroma deblocking across a horizontal edge, tc-clamped variant, using
 * Loongson MMI instructions. Eight pixels of the Cb plane and eight pixels
 * of the Cr plane are processed in one pass.
 *
 * Row naming used in the comments below (per plane, pPix = pPixCb/pPixCr):
 *   p1 = pPix - 2*iStride, p0 = pPix - iStride, q0 = pPix, q1 = pPix + iStride
 *
 * Per 16-bit lane the code computes
 *   delta = min(tc, max(-tc, ((q0 - p0) << 2) + (p1 - q1) + 4) >> 3))
 * and applies p0 += delta, q0 -= delta in lanes where
 *   iAlpha > |p0 - q0|, iBeta > |p1 - p0|, iBeta > |q1 - q0| and tc > 0.
 * Only rows p0 and q0 are written back (8 bytes each, both planes).
 *
 * pPixCb, pPixCr  address of row q0 in each chroma plane
 * iStride         byte distance between consecutive rows
 * iAlpha, iBeta   thresholds; only the low 16 bits are broadcast and used
 * pTC             four signed bytes; pTC[k] is applied to pixels 2k and 2k+1
 *
 * BACKUP_REG / RECOVER_REG and WELS_AbsH come from asmdefs_mmi.h; they are
 * presumably an FP-register save/restore pair and a per-halfword absolute
 * value helper respectively -- confirm against that header.
 *
 * Operand note: the asm uses %[pTC], %[iAlpha] and %[iBeta] as scratch
 * general-purpose registers (tc values, row addresses, the constant 4 and
 * spilled FP register contents). They are therefore declared as
 * early-clobber in/out operands so the compiler neither assumes they keep
 * their incoming values nor shares their registers with another input.
 */
void DeblockChromaLt4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
                           int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
  unsigned char tmp[256] __attribute__((aligned(32)));
  BACKUP_REG;
  __asm__ volatile (
    ".set       arch=loongson3a                           \n\t"
    /* Read pTC[0..3] as signed bytes, mask each to 16 bits and move the
     * values into FP registers. %[pTC] is overwritten from here on. */
    "lb         $8, 0x2(%[pTC])                           \n\t"
    "lb         $9, 0x3(%[pTC])                           \n\t"
    "move       $11, $8                                   \n\t"
    "lb         $8, 0x1(%[pTC])                           \n\t"
    "lb         %[pTC], 0x0(%[pTC])                       \n\t"
    "move       $12, %[pTC]                               \n\t"
    "and        %[pTC], $9, 0xFFFF                        \n\t"
    "dmtc1      %[pTC], $f4                               \n\t"
    "and        %[pTC], $9, 0xFFFF                        \n\t"
    "dmtc1      %[pTC], $f8                               \n\t"
    "move       %[pTC], $11                               \n\t"
    "and        $9, %[pTC], 0xFFFF                        \n\t"
    "and        %[pTC], %[pTC], 0xFFFF                    \n\t"
    "dmtc1      %[pTC], $f16                              \n\t"
    "and        %[pTC], $8, 0xFFFF                        \n\t"
    "dmtc1      %[pTC], $f20                              \n\t"
    "dmtc1      $9, $f12                                  \n\t"
    "and        %[pTC], $8, 0xFFFF                        \n\t"
    "dmtc1      %[pTC], $f24                              \n\t"
    "move       %[pTC], $12                               \n\t"
    "and        $9, %[pTC], 0xFFFF                        \n\t"
    "and        %[pTC], %[pTC], 0xFFFF                    \n\t"
    /* Interleave the tc halfwords so that $f0/$f2 end up holding
     * tc0,tc0,tc1,tc1 / tc2,tc2,tc3,tc3. Zero is written to tmp+0x40 and
     * reloaded into $f4/$f6 for the byte->halfword unpacks. The pixel rows
     * are loaded in between; %[pTC] temporarily holds 2*iStride. */
    "punpcklhw  $f24, $f24, $f8                           \n\t"
    "xor        $f0, $f0, $f0                             \n\t"
    "xor        $f2, $f2, $f2                             \n\t"
    "gssqc1     $f2, $f0, 0x40(%[tmp])                    \n\t"
    "dmtc1      $9, $f28                                  \n\t"
    "dmtc1      %[pTC], $f0                               \n\t"
    "daddu      %[pTC], %[iStride], %[iStride]            \n\t"
    "dsubu      $9, %[pPixCb], %[pTC]                     \n\t"
    "punpcklhw  $f20, $f20, $f4                           \n\t"
    "gslqc1     $f6, $f4, 0x40(%[tmp])                    \n\t"
    "punpcklhw  $f0, $f0, $f16                            \n\t"
    "gsldxc1    $f16, 0x0(%[iStride], %[pPixCr])          \n\t"
    "punpcklhw  $f28, $f28, $f12                          \n\t"
    "gsldxc1    $f12, 0x0(%[pPixCb], $0)                  \n\t"
    "punpcklhw  $f0, $f0, $f24                            \n\t"
    "gsldxc1    $f24, 0x0($9, $0)                         \n\t"
    "punpcklhw  $f28, $f28, $f20                          \n\t"
    "punpckhhw  $f2, $f0, $f28                            \n\t"
    "punpcklhw  $f0, $f0, $f28                            \n\t"
    "dsubu      $9, %[pPixCr], %[pTC]                     \n\t"
    /* -tc (0 - tc) is kept at tmp+0x60 as the lower clamp bound. */
    "psubh      $f8, $f4, $f0                             \n\t"
    "psubh      $f10, $f6, $f2                            \n\t"
    "gssqc1     $f10, $f8, 0x60(%[tmp])                   \n\t"
    "gsldxc1    $f8, 0x0($9, $0)                          \n\t"
    "mov.d      $f26, $f8                                 \n\t"
    /* %[pTC] = pPixCb - iStride and $9 = pPixCr - iStride; both addresses
     * are kept until the final stores of the p0 rows. */
    "dsubu      %[pTC], %[pPixCb], %[iStride]             \n\t"
    "gsldxc1    $f28, 0x0(%[pTC], $0)                     \n\t"
    "dsubu      $9, %[pPixCr], %[iStride]                 \n\t"
    "gsldxc1    $f8, 0x0($9, $0)                          \n\t"
    "mov.d      $f30, $f8                                 \n\t"
    "gsldxc1    $f8, 0x0(%[pPixCr], $0)                   \n\t"
    "mov.d      $f14, $f8                                 \n\t"
    "gsldxc1    $f8, 0x0(%[iStride], %[pPixCb])           \n\t"
    "mov.d      $f10, $f16                                \n\t"
    "gssqc1     $f10, $f8, 0xE0(%[tmp])                   \n\t"
    /* Broadcast the low halfword of iAlpha into $f16/$f18 and of iBeta
     * into tmp+0x50. */
    "dmtc1      %[iAlpha], $f8                            \n\t"
    "punpcklhw  $f16, $f8, $f8                            \n\t"
    "dmtc1      %[iBeta], $f8                             \n\t"
    "punpcklhw  $f20, $f8, $f8                            \n\t"
    "punpcklwd  $f8, $f20, $f20                           \n\t"
    "mov.d      $f10, $f8                                 \n\t"
    "gssqc1     $f10, $f8, 0x50(%[tmp])                   \n\t"
    /* Widen every row from bytes to halfwords (interleave with zero).
     * Spill slots: 0x30 = Cb p1, 0x80 = Cr p1, 0x70 = Cr q0,
     * 0x90 = Cr q1, 0xa0 = Cr p0. Cb p0/q0/q1 stay in $f8-$f10,
     * $f12-$f14 and $f20-$f22. */
    "punpckhbh  $f10, $f24, $f4                           \n\t"
    "punpcklbh  $f8, $f24, $f4                            \n\t"
    "gssqc1     $f14, $f12, 0xd0(%[tmp])                  \n\t"
    "punpcklwd  $f16, $f16, $f16                          \n\t"
    "mov.d      $f18, $f16                                \n\t"
    "gssqc1     $f10, $f8, 0x30(%[tmp])                   \n\t"
    "punpcklbh  $f24, $f26, $f6                           \n\t"
    "punpckhbh  $f26, $f26, $f6                           \n\t"
    "gssqc1     $f26, $f24, 0x80(%[tmp])                  \n\t"
    "gslqc1     $f26, $f24, 0xd0(%[tmp])                  \n\t"
    "punpcklbh  $f24, $f26, $f6                           \n\t"
    "punpckhbh  $f26, $f26, $f6                           \n\t"
    "gssqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
    "gslqc1     $f26, $f24, 0xe0(%[tmp])                  \n\t"
    "punpcklbh  $f24, $f26, $f6                           \n\t"
    "punpckhbh  $f26, $f26, $f6                           \n\t"
    "gssqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
    "gslqc1     $f22, $f20, 0xe0(%[tmp])                  \n\t"
    "mov.d      $f8, $f28                                 \n\t"
    "mov.d      $f10, $f30                                \n\t"
    "punpcklbh  $f28, $f30, $f6                           \n\t"
    "punpckhbh  $f30, $f30, $f6                           \n\t"
    "punpckhbh  $f22, $f20, $f4                           \n\t"
    "punpcklbh  $f20, $f20, $f4                           \n\t"
    "gssqc1     $f30, $f28, 0xa0(%[tmp])                  \n\t"
    "punpckhbh  $f14, $f12, $f4                           \n\t"
    "punpcklbh  $f12, $f12, $f4                           \n\t"
    /* %[iBeta] is overwritten with the rounding constant 4, which is
     * broadcast and stored at tmp+0x20. */
    "dli        %[iBeta], 0x4                             \n\t"
    "punpckhbh  $f10, $f8, $f4                            \n\t"
    "punpcklbh  $f8, $f8, $f4                             \n\t"
    "dmtc1      %[iBeta], $f24                            \n\t"
    "punpcklhw  $f28, $f24, $f24                          \n\t"
    "punpcklwd  $f24, $f28, $f28                          \n\t"
    "mov.d      $f26, $f24                                \n\t"
    "gslqc1     $f30, $f28, 0x30(%[tmp])                  \n\t"
    "gssqc1     $f26, $f24, 0x20(%[tmp])                  \n\t"
    /* Cb: p1 - q1 */
    "psubh      $f28, $f28, $f20                          \n\t"
    "psubh      $f30, $f30, $f22                          \n\t"
    /* (tc > 0) lane mask, stored at tmp+0x40 */
    "pcmpgth    $f24, $f0, $f4                            \n\t"
    "pcmpgth    $f26, $f2, $f6                            \n\t"
    "gslqc1     $f6, $f4, 0x60(%[tmp])                    \n\t"
    "gssqc1     $f26, $f24, 0x40(%[tmp])                  \n\t"
    /* Cb: q0 - p0 */
    "psubh      $f24, $f12, $f8                           \n\t"
    "psubh      $f26, $f14, $f10                          \n\t"
    /* Park $f12/$f14 (Cb q0) in %[iAlpha]/%[iBeta] while those FP
     * registers carry the shift counts 2 and 3. */
    "dmfc1      %[iAlpha], $f12                           \n\t"
    "dmfc1      %[iBeta], $f14                            \n\t"
    "dli        $10, 0x2                                  \n\t"
    "dmtc1      $10, $f12                                 \n\t"
    "dli        $10, 0x3                                  \n\t"
    "dmtc1      $10, $f14                                 \n\t"
    "psllh      $f24, $f24, $f12                          \n\t"
    "psllh      $f26, $f26, $f12                          \n\t"
    "paddh      $f24, $f24, $f28                          \n\t"
    "paddh      $f26, $f26, $f30                          \n\t"
    "gslqc1     $f30, $f28, 0x20(%[tmp])                  \n\t"
    "paddh      $f24, $f24, $f28                          \n\t"
    "paddh      $f26, $f26, $f30                          \n\t"
    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
    "psrah      $f24, $f24, $f14                          \n\t"
    "psrah      $f26, $f26, $f14                          \n\t"
    "dmtc1      %[iAlpha], $f12                           \n\t"
    "dmtc1      %[iBeta], $f14                            \n\t"
    /* Clamp the Cb delta to [-tc, tc]; result kept at tmp+0x10. */
    "pmaxsh     $f4, $f4, $f24                            \n\t"
    "pmaxsh     $f6, $f6, $f26                            \n\t"
    "gssqc1     $f2, $f0, 0x10(%[tmp])                    \n\t"
    "gslqc1     $f26, $f24, 0x10(%[tmp])                  \n\t"
    "pminsh     $f24, $f24, $f4                           \n\t"
    "pminsh     $f26, $f26, $f6                           \n\t"
    "gssqc1     $f26, $f24, 0x10(%[tmp])                  \n\t"
    /* Cb mask: alpha > |p0 - q0| */
    "psubh      $f4, $f8, $f12                            \n\t"
    "psubh      $f6, $f10, $f14                           \n\t"
    WELS_AbsH($f4, $f6, $f4, $f6, $f24, $f26)
    "pcmpgth    $f24, $f16, $f4                           \n\t"
    "pcmpgth    $f26, $f18, $f6                           \n\t"
    /* Cb mask: beta > |p1 - p0|. $f8/$f10 (Cb p0) are parked in
     * %[iAlpha]/%[iBeta] because they serve as WELS_AbsH scratch below;
     * they are restored just before the final add. */
    "gslqc1     $f6, $f4, 0x30(%[tmp])                    \n\t"
    "psubh      $f4, $f4, $f8                             \n\t"
    "psubh      $f6, $f6, $f10                            \n\t"
    "dmfc1      %[iAlpha], $f8                            \n\t"
    "dmfc1      %[iBeta], $f10                            \n\t"
    WELS_AbsH($f4, $f6, $f4, $f6, $f8, $f10)
    "pcmpgth    $f28, $f28, $f4                           \n\t"
    "pcmpgth    $f30, $f30, $f6                           \n\t"
    "gslqc1     $f6, $f4, 0x50(%[tmp])                    \n\t"
    "and        $f24, $f24, $f28                          \n\t"
    "and        $f26, $f26, $f30                          \n\t"
    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
    /* Cb mask: beta > |q1 - q0| */
    "psubh      $f20, $f20, $f12                          \n\t"
    "psubh      $f22, $f22, $f14                          \n\t"
    WELS_AbsH($f20, $f22, $f20, $f22, $f8, $f10)
    "pcmpgth    $f4, $f4, $f20                            \n\t"
    "pcmpgth    $f6, $f6, $f22                            \n\t"
    /* Cr: p1 - q1 */
    "gslqc1     $f22, $f20, 0x80(%[tmp])                  \n\t"
    "gslqc1     $f10, $f8, 0x90(%[tmp])                   \n\t"
    "psubh      $f20, $f20, $f8                           \n\t"
    "psubh      $f22, $f22, $f10                          \n\t"
    "and        $f24, $f24, $f4                           \n\t"
    "and        $f26, $f26, $f6                           \n\t"
    /* Combine with the (tc > 0) mask and gate the Cb delta; the masked
     * Cb delta replaces the Cb p1 spill at tmp+0x30. */
    "gslqc1     $f10, $f8, 0x40(%[tmp])                   \n\t"
    "and        $f24, $f24, $f8                           \n\t"
    "and        $f26, $f26, $f10                          \n\t"
    "gslqc1     $f6, $f4, 0x10(%[tmp])                    \n\t"
    "and        $f4, $f4, $f24                            \n\t"
    "and        $f6, $f6, $f26                            \n\t"
    "gslqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
    "gssqc1     $f6, $f4, 0x30(%[tmp])                    \n\t"
    /* Cr delta: (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, clamped to
     * [-tc, tc]; result lands in $f0/$f2. */
    "gslqc1     $f6, $f4, 0xa0(%[tmp])                    \n\t"
    "psubh      $f24, $f24, $f4                           \n\t"
    "psubh      $f26, $f26, $f6                           \n\t"
    "dli        $10, 0x2                                  \n\t"
    "dmtc1      $10, $f8                                  \n\t"
    "psllh      $f24, $f24, $f8                           \n\t"
    "psllh      $f26, $f26, $f8                           \n\t"
    "paddh      $f24, $f24, $f20                          \n\t"
    "paddh      $f26, $f26, $f22                          \n\t"
    "dli        $10, 0x3                                  \n\t"
    "gslqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
    "paddh      $f24, $f24, $f8                           \n\t"
    "paddh      $f26, $f26, $f10                          \n\t"
    "dmtc1      $10, $f8                                  \n\t"
    "gslqc1     $f22, $f20, 0x60(%[tmp])                  \n\t"
    "psrah      $f24, $f24, $f8                           \n\t"
    "psrah      $f26, $f26, $f8                           \n\t"
    "pmaxsh     $f20, $f20, $f24                          \n\t"
    "pmaxsh     $f22, $f22, $f26                          \n\t"
    "pminsh     $f0, $f0, $f20                            \n\t"
    "pminsh     $f2, $f2, $f22                            \n\t"
    /* Cr mask: alpha > |p0 - q0| */
    "gslqc1     $f22, $f20, 0x70(%[tmp])                  \n\t"
    "psubh      $f24, $f4, $f20                           \n\t"
    "psubh      $f26, $f6, $f22                           \n\t"
    WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
    "pcmpgth    $f16, $f16, $f24                          \n\t"
    "pcmpgth    $f18, $f18, $f26                          \n\t"
    /* Cr mask: beta > |p1 - p0| */
    "gslqc1     $f26, $f24, 0x80(%[tmp])                  \n\t"
    "psubh      $f24, $f24, $f4                           \n\t"
    "psubh      $f26, $f26, $f6                           \n\t"
    WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
    "pcmpgth    $f28, $f28, $f24                          \n\t"
    "pcmpgth    $f30, $f30, $f26                          \n\t"
    "gslqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
    "and        $f16, $f16, $f28                          \n\t"
    "and        $f18, $f18, $f30                          \n\t"
    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
    /* Cr mask: beta > |q1 - q0| */
    "psubh      $f24, $f24, $f20                          \n\t"
    "psubh      $f26, $f26, $f22                          \n\t"
    WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
    /* Bring Cb p0 back from %[iAlpha]/%[iBeta]. */
    "dmtc1      %[iAlpha], $f8                            \n\t"
    "dmtc1      %[iBeta], $f10                            \n\t"
    "pcmpgth    $f28, $f28, $f24                          \n\t"
    "pcmpgth    $f30, $f30, $f26                          \n\t"
    "and        $f16, $f16, $f28                          \n\t"
    "and        $f18, $f18, $f30                          \n\t"
    "gslqc1     $f26, $f24, 0x40(%[tmp])                  \n\t"
    "and        $f16, $f16, $f24                          \n\t"
    "and        $f18, $f18, $f26                          \n\t"
    "and        $f0, $f0, $f16                            \n\t"
    "and        $f2, $f2, $f18                            \n\t"
    /* p0 += delta for both planes, pack to bytes, store Cb p0. */
    "gslqc1     $f18, $f16, 0x30(%[tmp])                  \n\t"
    "paddh      $f8, $f8, $f16                            \n\t"
    "paddh      $f10, $f10, $f18                          \n\t"
    "paddh      $f4, $f4, $f0                             \n\t"
    "paddh      $f6, $f6, $f2                             \n\t"
    "packushb   $f8, $f8, $f10                            \n\t"
    "packushb   $f10, $f4, $f6                            \n\t"
    "gssdxc1    $f8, 0x0(%[pTC], $0)                      \n\t"
    /* q0 -= delta for both planes, pack, then store Cb q0, Cr p0, Cr q0. */
    "psubh      $f12, $f12, $f16                          \n\t"
    "psubh      $f14, $f14, $f18                          \n\t"
    "psubh      $f20, $f20, $f0                           \n\t"
    "psubh      $f22, $f22, $f2                           \n\t"
    "packushb   $f12, $f12, $f14                          \n\t"
    "packushb   $f14, $f20, $f22                          \n\t"
    "gssdxc1    $f12, 0x0(%[pPixCb], $0)                  \n\t"
    "gssdxc1    $f10, 0x0($9, $0)                         \n\t"
    "gssdxc1    $f14, 0x0(%[pPixCr], $0)                  \n\t"
    /* iAlpha, iBeta and pTC are written by the asm, so they must be
     * in/out operands rather than plain inputs. */
    : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr),
      [iAlpha]"+&r"(iAlpha), [iBeta]"+&r"(iBeta), [pTC]"+&r"(pTC)
    : [iStride]"r"((int)iStride), [tmp]"r"((unsigned char *)tmp)
    : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
      "$f10", "$f12",  "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
      "$f28", "$f30"
  );
  RECOVER_REG;
}

/*
 * Chroma deblocking across a horizontal edge, averaging variant (no tc
 * clamp), using Loongson MMI instructions. Eight pixels of the Cb plane and
 * eight pixels of the Cr plane are processed in one pass.
 *
 * Row naming used in the comments below (per plane, pPix = pPixCb/pPixCr):
 *   p1 = pPix - 2*iStride, p0 = pPix - iStride, q0 = pPix, q1 = pPix + iStride
 *
 * In 16-bit lanes where
 *   iAlpha > |p0 - q0|, iBeta > |p1 - p0| and iBeta > |q1 - q0|
 * the code writes
 *   p0' = (2*p1 + p0 + q1 + 2) >> 2
 *   q0' = (2*q1 + q0 + p1 + 2) >> 2
 * and keeps the original p0/q0 in all other lanes. Only rows p0 and q0 are
 * written back (8 bytes each, both planes).
 *
 * pPixCb, pPixCr  address of row q0 in each chroma plane
 * iStride         byte distance between consecutive rows
 * iAlpha, iBeta   thresholds; only the low 16 bits are broadcast and used
 *
 * BACKUP_REG / RECOVER_REG and WELS_AbsH come from asmdefs_mmi.h; they are
 * presumably an FP-register save/restore pair and a per-halfword absolute
 * value helper respectively -- confirm against that header.
 *
 * Operand note: the asm uses %[iAlpha] and %[iBeta] as scratch
 * general-purpose registers (spilled FP register contents and the constant
 * 2). They are therefore declared as early-clobber in/out operands so the
 * compiler neither assumes they keep their incoming values nor shares their
 * registers with another input.
 */
void DeblockChromaEq4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
                           int32_t iAlpha, int32_t iBeta) {
  unsigned char tmp[128] __attribute__((aligned(32)));
  BACKUP_REG;
  __asm__ volatile (
    ".set       arch=loongson3a                          \n\t"
    /* Load the four rows of both planes. After this group $8 holds
     * pPixCb - iStride and $9 holds pPixCr - iStride; both are reused as
     * store addresses at the end. */
    "daddu      $8, %[iStride], %[iStride]               \n\t"
    "dsubu      $9, %[pPixCb], $8                        \n\t"
    "gsldxc1    $f16, 0x0(%[pPixCr], $0)                 \n\t"
    "gsldxc1    $f20, 0x0(%[iStride], %[pPixCr])         \n\t"
    "gsldxc1    $f4, 0x0($9, $0)                         \n\t"
    "dsubu      $9, %[pPixCr], $8                        \n\t"
    "gsldxc1    $f8, 0x0($9, $0)                         \n\t"
    "mov.d      $f6, $f8                                 \n\t"
    "dsubu      $8, %[pPixCb], %[iStride]                \n\t"
    "gsldxc1    $f8, 0x0($8, $0)                         \n\t"
    "dsubu      $9, %[pPixCr], %[iStride]                \n\t"
    "gsldxc1    $f12, 0x0($9, $0)                        \n\t"
    "mov.d      $f10, $f12                               \n\t"
    "gsldxc1    $f12, 0x0(%[pPixCb], $0)                 \n\t"
    "mov.d      $f14, $f16                               \n\t"
    "gsldxc1    $f16, 0x0(%[iStride], %[pPixCb])         \n\t"
    "mov.d      $f18, $f20                               \n\t"
    /* Broadcast the low halfword of iAlpha into $f20/$f22 and of iBeta
     * into $f24/$f26; $f0/$f2 are zeroed for the unpacks. */
    "dmtc1      %[iAlpha], $f20                          \n\t"
    "xor        $f0, $f0, $f0                            \n\t"
    "xor        $f2, $f2, $f2                            \n\t"
    "punpcklhw  $f24, $f20, $f20                         \n\t"
    "punpcklwd  $f20, $f24, $f24                         \n\t"
    "mov.d      $f22, $f20                               \n\t"
    "dmtc1      %[iBeta], $f24                           \n\t"
    "punpcklhw  $f28, $f24, $f24                         \n\t"
    "punpcklwd  $f24, $f28, $f28                         \n\t"
    "mov.d      $f26, $f24                               \n\t"
    /* Widen every row from bytes to halfwords (interleave with zero).
     * Spill slots: 0x40 = Cr p1, 0x60 = Cb p1, 0x10 = Cb p0,
     * 0x50 = Cb q0, 0x30 = Cr q0, 0x20 = Cr q1. Cr p0 stays in
     * $f8/$f10 and Cb q1 in $f28/$f30. */
    "mov.d      $f28, $f4                                \n\t"
    "punpcklbh  $f4, $f6, $f2                            \n\t"
    "punpckhbh  $f6, $f6, $f2                            \n\t"
    "punpckhbh  $f30, $f28, $f0                          \n\t"
    "punpcklbh  $f28, $f28, $f0                          \n\t"
    "gssqc1     $f6, $f4, 0x40(%[tmp])                   \n\t"
    "gssqc1     $f30, $f28, 0x60(%[tmp])                 \n\t"
    "punpckhbh  $f30, $f8, $f0                           \n\t"
    "punpcklbh  $f28, $f8, $f0                           \n\t"
    "gssqc1     $f30, $f28, 0x10(%[tmp])                 \n\t"
    "punpckhbh  $f30, $f12, $f0                          \n\t"
    "punpcklbh  $f28, $f12, $f0                          \n\t"
    "punpcklbh  $f12, $f14, $f2                          \n\t"
    "punpckhbh  $f14, $f14, $f2                          \n\t"
    "gssqc1     $f30, $f28, 0x50(%[tmp])                 \n\t"
    "mov.d      $f28, $f16                               \n\t"
    "punpcklbh  $f16, $f18, $f2                          \n\t"
    "punpckhbh  $f18, $f18, $f2                          \n\t"
    "punpcklbh  $f8, $f10, $f2                           \n\t"
    "punpckhbh  $f10, $f10, $f2                          \n\t"
    "punpckhbh  $f30, $f28, $f0                          \n\t"
    "punpcklbh  $f28, $f28, $f0                          \n\t"
    "gssqc1     $f14, $f12, 0x30(%[tmp])                 \n\t"
    /* Cb mask (built in $f0/$f2): alpha > |p0 - q0| */
    "gslqc1     $f14, $f12, 0x10(%[tmp])                 \n\t"
    "gslqc1     $f2, $f0, 0x50(%[tmp])                   \n\t"
    "psubh      $f4, $f12, $f0                           \n\t"
    "psubh      $f6, $f14, $f2                           \n\t"
    WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
    "gssqc1     $f18, $f16, 0x20(%[tmp])                 \n\t"
    "pcmpgth    $f0, $f20, $f4                           \n\t"
    "pcmpgth    $f2, $f22, $f6                           \n\t"
    /* Cb mask: beta > |p1 - p0| */
    "gslqc1     $f6, $f4, 0x60(%[tmp])                   \n\t"
    "psubh      $f4, $f4, $f12                           \n\t"
    "psubh      $f6, $f6, $f14                           \n\t"
    WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
    "pcmpgth    $f16, $f24, $f4                          \n\t"
    "pcmpgth    $f18, $f26, $f6                          \n\t"
    "and        $f0, $f0, $f16                           \n\t"
    "and        $f2, $f2, $f18                           \n\t"
    /* Cb mask: beta > |q1 - q0| (ANDed into $f0/$f2 further down) */
    "gslqc1     $f18, $f16, 0x50(%[tmp])                 \n\t"
    "psubh      $f4, $f28, $f16                          \n\t"
    "psubh      $f6, $f30, $f18                          \n\t"
    WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
    "pcmpgth    $f16, $f24, $f4                          \n\t"
    "pcmpgth    $f18, $f26, $f6                          \n\t"
    /* Cr mask (built in $f20/$f22): alpha > |p0 - q0|. $f28/$f30
     * (Cb q1) are parked in %[iAlpha]/%[iBeta] while they serve as
     * WELS_AbsH scratch. */
    "gslqc1     $f6, $f4, 0x30(%[tmp])                   \n\t"
    "psubh      $f4, $f8, $f4                            \n\t"
    "psubh      $f6, $f10, $f6                           \n\t"
    "dmfc1      %[iAlpha], $f28                          \n\t"
    "dmfc1      %[iBeta], $f30                           \n\t"
    WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
    "pcmpgth    $f20, $f20, $f4                          \n\t"
    "pcmpgth    $f22, $f22, $f6                          \n\t"
    "gslqc1     $f6, $f4, 0x40(%[tmp])                   \n\t"
    "and        $f0, $f0, $f16                           \n\t"
    "and        $f2, $f2, $f18                           \n\t"
    /* Cr mask: beta > |p1 - p0| */
    "psubh      $f4, $f4, $f8                            \n\t"
    "psubh      $f6, $f6, $f10                           \n\t"
    WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
    "pcmpgth    $f16, $f24, $f4                          \n\t"
    "pcmpgth    $f18, $f26, $f6                          \n\t"
    /* Cr mask: beta > |q1 - q0| */
    "gslqc1     $f6, $f4, 0x20(%[tmp])                   \n\t"
    "gslqc1     $f30, $f28, 0x30(%[tmp])                 \n\t"
    "psubh      $f4, $f4, $f28                           \n\t"
    "psubh      $f6, $f6, $f30                           \n\t"
    "and        $f20, $f20, $f16                         \n\t"
    "and        $f22, $f22, $f18                         \n\t"
    WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
    /* Bring Cb q1 back from %[iAlpha]/%[iBeta]. */
    "dmtc1      %[iAlpha], $f28                          \n\t"
    "dmtc1      %[iBeta], $f30                           \n\t"
    "pcmpgth    $f24, $f24, $f4                          \n\t"
    "pcmpgth    $f26, $f26, $f6                          \n\t"
    "and        $f20, $f20, $f24                         \n\t"
    "and        $f22, $f22, $f26                         \n\t"
    /* %[iBeta] is overwritten with 2: it is broadcast as the rounding
     * constant (stored at tmp+0x10) and also used as the shift count. */
    "dli        %[iBeta], 0x2                            \n\t"
    "dmtc1      %[iBeta], $f4                            \n\t"
    "punpcklhw  $f16, $f4, $f4                           \n\t"
    "punpcklwd  $f4, $f16, $f16                          \n\t"
    "mov.d      $f6, $f4                                 \n\t"
    /* Cb p0' = (2*p1 + p0 + q1 + 2) >> 2, selected by the Cb mask */
    "gslqc1     $f18, $f16, 0x60(%[tmp])                 \n\t"
    "paddh      $f24, $f16, $f16                         \n\t"
    "paddh      $f26, $f18, $f18                         \n\t"
    "paddh      $f24, $f24, $f12                         \n\t"
    "paddh      $f26, $f26, $f14                         \n\t"
    "paddh      $f24, $f24, $f28                         \n\t"
    "paddh      $f26, $f26, $f30                         \n\t"
    "gssqc1     $f6, $f4, 0x10(%[tmp])                   \n\t"
    "gslqc1     $f18, $f16, 0x10(%[tmp])                 \n\t"
    "paddh      $f24, $f24, $f16                         \n\t"
    "paddh      $f26, $f26, $f18                         \n\t"
    "dmtc1      %[iBeta], $f16                           \n\t"
    "psrah      $f24, $f24, $f16                         \n\t"
    "psrah      $f26, $f26, $f16                         \n\t"
    "pandn      $f16, $f0, $f12                          \n\t"
    "pandn      $f18, $f2, $f14                          \n\t"
    "gslqc1     $f14, $f12, 0x40(%[tmp])                 \n\t"
    "and        $f4, $f0, $f24                           \n\t"
    "and        $f6, $f2, $f26                           \n\t"
    "or         $f4, $f4, $f16                           \n\t"
    "or         $f6, $f6, $f18                           \n\t"
    /* Cr p0' = (2*p1 + p0 + q1 + 2) >> 2, selected by the Cr mask */
    "paddh      $f24, $f12, $f12                         \n\t"
    "paddh      $f26, $f14, $f14                         \n\t"
    "gslqc1     $f14, $f12, 0x10(%[tmp])                 \n\t"
    "paddh      $f24, $f24, $f8                          \n\t"
    "paddh      $f26, $f26, $f10                         \n\t"
    "gslqc1     $f18, $f16, 0x20(%[tmp])                 \n\t"
    "paddh      $f24, $f24, $f16                         \n\t"
    "paddh      $f26, $f26, $f18                         \n\t"
    "dmtc1      %[iBeta], $f16                           \n\t"
    "paddh      $f24, $f24, $f12                         \n\t"
    "paddh      $f26, $f26, $f14                         \n\t"
    "psrah      $f24, $f24, $f16                         \n\t"
    "psrah      $f26, $f26, $f16                         \n\t"
    "and        $f16, $f20, $f24                         \n\t"
    "and        $f18, $f22, $f26                         \n\t"
    "pandn      $f24, $f20, $f8                          \n\t"
    "pandn      $f26, $f22, $f10                         \n\t"
    "or         $f16, $f16, $f24                         \n\t"
    "or         $f18, $f18, $f26                         \n\t"
    /* Pack the new p0 rows to bytes: $f4 = Cb, $f6 = Cr. */
    "packushb   $f4, $f4, $f6                            \n\t"
    "packushb   $f6, $f16, $f18                          \n\t"
    /* Cb q0' = (2*q1 + q0 + p1 + 2) >> 2, selected by the Cb mask */
    "gslqc1     $f18, $f16, 0x50(%[tmp])                 \n\t"
    "paddh      $f24, $f28, $f28                         \n\t"
    "paddh      $f26, $f30, $f30                         \n\t"
    "paddh      $f24, $f24, $f16                         \n\t"
    "paddh      $f26, $f26, $f18                         \n\t"
    "gslqc1     $f10, $f8, 0x60(%[tmp])                  \n\t"
    "paddh      $f24, $f24, $f8                          \n\t"
    "paddh      $f26, $f26, $f10                         \n\t"
    "dmtc1      %[iBeta], $f28                           \n\t"
    "paddh      $f24, $f24, $f12                         \n\t"
    "paddh      $f26, $f26, $f14                         \n\t"
    "psrah      $f24, $f24, $f28                         \n\t"
    "psrah      $f26, $f26, $f28                         \n\t"
    "and        $f8, $f0, $f24                           \n\t"
    "and        $f10, $f2, $f26                          \n\t"
    "pandn      $f0, $f0, $f16                           \n\t"
    "pandn      $f2, $f2, $f18                           \n\t"
    "or         $f8, $f8, $f0                            \n\t"
    "or         $f10, $f10, $f2                          \n\t"
    /* Cr q0' = (2*q1 + q0 + p1 + 2) >> 2, selected by the Cr mask */
    "gslqc1     $f2, $f0, 0x20(%[tmp])                   \n\t"
    "paddh      $f24, $f0, $f0                           \n\t"
    "paddh      $f26, $f2, $f2                           \n\t"
    "gslqc1     $f2, $f0, 0x30(%[tmp])                   \n\t"
    "paddh      $f24, $f24, $f0                          \n\t"
    "paddh      $f26, $f26, $f2                          \n\t"
    "gslqc1     $f18, $f16, 0x40(%[tmp])                 \n\t"
    "paddh      $f24, $f24, $f16                         \n\t"
    "paddh      $f26, $f26, $f18                         \n\t"
    "paddh      $f24, $f24, $f12                         \n\t"
    "paddh      $f26, $f26, $f14                         \n\t"
    /* Store Cb p0 at pPixCb - iStride ($8). */
    "gssdxc1    $f4, 0x0($8, $0)                         \n\t"
    "psrah      $f24, $f24, $f28                         \n\t"
    "psrah      $f26, $f26, $f28                         \n\t"
    "and        $f16, $f20, $f24                         \n\t"
    "and        $f18, $f22, $f26                         \n\t"
    "pandn      $f20, $f20, $f0                          \n\t"
    "pandn      $f22, $f22, $f2                          \n\t"
    "or         $f16, $f16, $f20                         \n\t"
    "or         $f18, $f18, $f22                         \n\t"
    /* Pack the new q0 rows, then store Cb q0, Cr p0 ($9) and Cr q0. */
    "packushb   $f8, $f8, $f10                           \n\t"
    "packushb   $f10, $f16, $f18                         \n\t"
    "gssdxc1    $f8, 0x0(%[pPixCb], $0)                  \n\t"
    "gssdxc1    $f6, 0x0($9, $0)                         \n\t"
    "gssdxc1    $f10, 0x0(%[pPixCr], $0)                 \n\t"
    /* iAlpha and iBeta are written by the asm, so they must be in/out
     * operands rather than plain inputs. */
    : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr),
      [iAlpha]"+&r"(iAlpha), [iBeta]"+&r"(iBeta)
    : [iStride]"r"((int)iStride), [tmp]"r"((unsigned char *)tmp)
    : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
      "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
      "$f28", "$f30"
  );
  RECOVER_REG;
}

/*
 * DeblockChromaEq4H_mmi
 *
 * Chroma deblocking across a vertical edge ("H" = filtering runs horizontally)
 * for the "Eq4" case, processing 8 rows of Cb and 8 rows of Cr at once.
 *
 * For every row the four pixels p1, p0, q0, q1 are read from
 * pPix[-2], pPix[-1], pPix[0], pPix[1].  The rows are transposed so that each
 * 16-byte vector holds one pixel position for all rows (Cb in the low half,
 * Cr in the high half), then
 *
 *     mask = (|p0 - q0| < iAlpha) && (|p1 - p0| < iBeta) && (|q1 - q0| < iBeta)
 *     p0'  = mask ? (2 * p1 + p0 + q1 + 2) >> 2 : p0
 *     q0'  = mask ? (2 * q1 + q0 + p1 + 2) >> 2 : q0
 *
 * is evaluated on 16-bit lanes, and the result is transposed back and written
 * as 4 bytes per row (p1 and q1 are written back unchanged).
 *
 * \param pPixCb   pointer to the q0 pixel of the first Cb row
 * \param pPixCr   pointer to the q0 pixel of the first Cr row
 * \param iStride  byte distance between consecutive rows (same for Cb and Cr)
 * \param iAlpha   threshold compared against |p0 - q0|
 * \param iBeta    threshold compared against |p1 - p0| and |q1 - q0|
 *
 * Scratch layout of tmp[] as used below:
 *   0x20        rounding constant 2 (8 halfwords)
 *   0x30 / 0x40 p0 / q0 of Cr, widened to halfwords
 *   0x50        q1 of Cb, widened to halfwords
 *   0x60 / 0x70 p1 / q1 of Cr, widened to halfwords
 *   0x80..0xbf  transposed byte columns p1, p0, q0, q1
 *
 * NOTE(review): every row is fetched with an unaligned 8-byte load starting at
 * pPix - 2 although only the first 4 bytes are used; callers must tolerate the
 * 4-byte over-read.
 *
 * BACKUP_REG / RECOVER_REG and WELS_AbsH come from asmdefs_mmi.h; presumably
 * the former save/restore callee-saved FP registers and the latter computes a
 * per-halfword absolute value using its last two arguments as scratch -
 * confirm against that header.
 *
 * The asm body overwrites the general registers holding iAlpha and iBeta
 * (they are reused as spill slots and as a shift-count temporary), so both
 * are declared as early-clobber read-write operands rather than plain inputs.
 */
void DeblockChromaEq4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
                           int32_t iAlpha, int32_t iBeta) {
  unsigned char tmp[256] __attribute__((aligned(32)));
  BACKUP_REG;
  __asm__ volatile (
    ".set       arch=loongson3a                           \n\t"
    /* Step back to p1; $9/$10 address rows 0-3, pPixCb/pPixCr rows 4-7. */
    "daddiu     %[pPixCb], %[pPixCb], -0x2                \n\t"
    "daddiu     %[pPixCr], %[pPixCr], -0x2                \n\t"
    "move       $9, %[pPixCb]                             \n\t"
    "move       $10, %[pPixCr]                            \n\t"
    "dsll       $11, %[iStride], 0x2                      \n\t"
    "daddu      %[pPixCb], %[pPixCb], $11                 \n\t"
    "daddu      %[pPixCr], %[pPixCr], $11                 \n\t"
    /* $11 = base of the transposed-column area inside tmp. */
    "daddiu     $11, %[tmp], 0x80                         \n\t"
    /* Rows 0-3: unaligned loads of Cb into f0/f4/f8/f12, Cr into f16..f28. */
    "gsldlc1    $f0, 0x7($9)                              \n\t"
    "gsldrc1    $f0, 0x0($9)                              \n\t"
    "daddu      $12, $9, %[iStride]                       \n\t"
    "gsldlc1    $f4, 0x7($12)                             \n\t"
    "gsldrc1    $f4, 0x0($12)                             \n\t"
    "daddu      $12, $12, %[iStride]                      \n\t"
    "gsldlc1    $f8, 0x7($12)                             \n\t"
    "gsldrc1    $f8, 0x0($12)                             \n\t"
    "daddu      $12, $12, %[iStride]                      \n\t"
    "gsldlc1    $f12, 0x7($12)                            \n\t"
    "gsldlc1    $f16, 0x7($10)                            \n\t"
    "gsldrc1    $f12, 0x0($12)                            \n\t"
    "gsldrc1    $f16, 0x0($10)                            \n\t"
    "daddu      $12, $10, %[iStride]                      \n\t"
    "gsldlc1    $f20, 0x7($12)                            \n\t"
    "gsldrc1    $f20, 0x0($12)                            \n\t"
    "daddu      $12, $12, %[iStride]                      \n\t"
    "gsldlc1    $f24, 0x7($12)                            \n\t"
    "gsldrc1    $f24, 0x0($12)                            \n\t"
    "daddu      $12, $12, %[iStride]                      \n\t"
    "gsldlc1    $f28, 0x7($12)                            \n\t"
    "gsldrc1    $f28, 0x0($12)                            \n\t"
    /* Pack each row as [4 Cb bytes | 4 Cr bytes]. */
    "punpcklwd  $f0, $f0, $f16                            \n\t"
    "punpcklwd  $f4, $f4, $f20                            \n\t"
    "punpcklwd  $f8, $f8, $f24                            \n\t"
    "punpcklwd  $f12, $f12, $f28                          \n\t"
    /* Rows 4-7, packed the same way into f2/f6/f10/f14. */
    "gsldlc1    $f16, 0x7(%[pPixCb])                      \n\t"
    "gsldlc1    $f20, 0x7(%[pPixCr])                      \n\t"
    "gsldrc1    $f16, 0x0(%[pPixCb])                      \n\t"
    "gsldrc1    $f20, 0x0(%[pPixCr])                      \n\t"
    "punpcklwd  $f16, $f16, $f20                          \n\t"
    "mov.d      $f2, $f16                                 \n\t"
    "daddu      $12, %[pPixCb], %[iStride]                \n\t"
    "daddu      $13, %[pPixCr], %[iStride]                \n\t"
    "gsldlc1    $f16, 0x7($12)                            \n\t"
    "gsldlc1    $f20, 0x7($13)                            \n\t"
    "gsldrc1    $f16, 0x0($12)                            \n\t"
    "gsldrc1    $f20, 0x0($13)                            \n\t"
    "punpcklwd  $f16, $f16, $f20                          \n\t"
    "mov.d      $f6, $f16                                 \n\t"
    "daddu      $12, $12, %[iStride]                      \n\t"
    "daddu      $13, $13, %[iStride]                      \n\t"
    "gsldlc1    $f16, 0x7($12)                            \n\t"
    "gsldlc1    $f20, 0x7($13)                            \n\t"
    "gsldrc1    $f16, 0x0($12)                            \n\t"
    "gsldrc1    $f20, 0x0($13)                            \n\t"
    "punpcklwd  $f16, $f16, $f20                          \n\t"
    "mov.d      $f10, $f16                                \n\t"
    "daddu      $12, $12, %[iStride]                      \n\t"
    "daddu      $13, $13, %[iStride]                      \n\t"
    "gsldlc1    $f16, 0x7($12)                            \n\t"
    "gsldlc1    $f20, 0x7($13)                            \n\t"
    "gsldrc1    $f16, 0x0($12)                            \n\t"
    "gsldrc1    $f20, 0x0($13)                            \n\t"
    "punpcklwd  $f16, $f16, $f20                          \n\t"
    "mov.d      $f14, $f16                                \n\t"
    /* Transpose rows -> columns: interleave bytes, then halfwords, then words. */
    "punpcklbh  $f24, $f2, $f6                            \n\t"
    "punpckhbh  $f26, $f2, $f6                            \n\t"
    "punpckhbh  $f2, $f0, $f4                             \n\t"
    "punpcklbh  $f0, $f0, $f4                             \n\t"
    "punpcklbh  $f28, $f10, $f14                          \n\t"
    "punpckhbh  $f30, $f10, $f14                          \n\t"
    "punpckhbh  $f10, $f8, $f12                           \n\t"
    "punpcklbh  $f8, $f8, $f12                            \n\t"
    "punpcklhw  $f16, $f2, $f10                           \n\t"
    "punpckhhw  $f18, $f2, $f10                           \n\t"
    "punpckhhw  $f2, $f0, $f8                             \n\t"
    "punpcklhw  $f0, $f0, $f8                             \n\t"
    "punpcklhw  $f20, $f26, $f30                          \n\t"
    "punpckhhw  $f22, $f26, $f30                          \n\t"
    "punpckhhw  $f26, $f24, $f28                          \n\t"
    "punpcklhw  $f24, $f24, $f28                          \n\t"
    "punpcklwd  $f4, $f2, $f26                            \n\t"
    "punpckhwd  $f6, $f2, $f26                            \n\t"
    "punpckhwd  $f2, $f0, $f24                            \n\t"
    "punpcklwd  $f0, $f0, $f24                            \n\t"
    "punpcklwd  $f8, $f18, $f22                           \n\t"
    "punpckhwd  $f10, $f18, $f22                          \n\t"
    "punpckhwd  $f18, $f16, $f20                          \n\t"
    "punpcklwd  $f16, $f16, $f20                          \n\t"
    /* Regroup into (Cb, Cr) register pairs, one pair per column. */
    "mov.d      $f20, $f2                                 \n\t"
    "mov.d      $f22, $f18                                \n\t"
    "mov.d      $f2, $f16                                 \n\t"
    "mov.d      $f24, $f6                                 \n\t"
    "mov.d      $f26, $f10                                \n\t"
    "mov.d      $f6, $f8                                  \n\t"
    /* Save columns p1, p0, q0, q1 at tmp+0x80, +0x90, +0xa0, +0xb0. */
    "gssqc1     $f2, $f0, 0x0($11)                        \n\t"
    "gssqc1     $f22, $f20, 0x10($11)                     \n\t"
    "gssqc1     $f6, $f4, 0x20($11)                       \n\t"
    "gssqc1     $f26, $f24, 0x30($11)                     \n\t"
    /* Reload: (f24,f26)=p1, (f16,f18)=p0, (f20,f22)=q0, (f28,f30)=q1. */
    "gslqc1     $f26, $f24, 0x80(%[tmp])                  \n\t"
    "gslqc1     $f18, $f16, 0x90(%[tmp])                  \n\t"
    "gslqc1     $f22, $f20, 0xa0(%[tmp])                  \n\t"
    "gslqc1     $f30, $f28, 0xb0(%[tmp])                  \n\t"
    /* f0 = 0 (for byte->halfword widening); broadcast alpha to f4/f6 and
       beta to f8/f10 as halfwords. */
    "xor        $f0, $f0, $f0                             \n\t"
    "dmtc1      %[iAlpha], $f4                            \n\t"
    "punpcklhw  $f8, $f4, $f4                             \n\t"
    "punpcklwd  $f4, $f8, $f8                             \n\t"
    "mov.d      $f6, $f4                                  \n\t"
    "dmtc1      %[iBeta], $f8                             \n\t"
    "punpcklhw  $f12, $f8, $f8                            \n\t"
    "punpcklwd  $f8, $f12, $f12                           \n\t"
    "mov.d      $f10, $f8                                 \n\t"
    /* Widen the Cr halves and park them in tmp: p1->0x60, p0->0x30,
       q0->0x40, q1->0x70.  f12 keeps the Cb half of p1. */
    "mov.d      $f12, $f24                                \n\t"
    "punpcklbh  $f24, $f26, $f0                           \n\t"
    "punpckhbh  $f26, $f26, $f0                           \n\t"
    "gssqc1     $f26, $f24, 0x60(%[tmp])                  \n\t"
    "gslqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
    "punpcklbh  $f24, $f26, $f0                           \n\t"
    "punpckhbh  $f26, $f26, $f0                           \n\t"
    "gssqc1     $f26, $f24, 0x30(%[tmp])                  \n\t"
    "gslqc1     $f26, $f24, 0xa0(%[tmp])                  \n\t"
    "punpcklbh  $f24, $f26, $f0                           \n\t"
    "punpckhbh  $f26, $f26, $f0                           \n\t"
    "gssqc1     $f26, $f24, 0x40(%[tmp])                  \n\t"
    "gslqc1     $f26, $f24, 0xb0(%[tmp])                  \n\t"
    "punpcklbh  $f24, $f26, $f0                           \n\t"
    "punpckhbh  $f26, $f26, $f0                           \n\t"
    "gssqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
    /* Widen the Cb halves in registers: q1->f28/f30 (also saved at 0x50),
       p0->f16/f18, q0->f20/f22, p1->f12/f14. */
    "punpckhbh  $f30, $f28, $f0                           \n\t"
    "punpcklbh  $f28, $f28, $f0                           \n\t"
    "punpckhbh  $f18, $f16, $f0                           \n\t"
    "punpcklbh  $f16, $f16, $f0                           \n\t"
    "punpckhbh  $f22, $f20, $f0                           \n\t"
    "punpcklbh  $f20, $f20, $f0                           \n\t"
    "punpckhbh  $f14, $f12, $f0                           \n\t"
    "punpcklbh  $f12, $f12, $f0                           \n\t"
    "gssqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
    /* Cb mask (f0/f2): alpha > |p0 - q0| ... */
    "psubh      $f24, $f16, $f20                          \n\t"
    "psubh      $f26, $f18, $f22                          \n\t"
    WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
    "pcmpgth    $f0, $f4, $f24                            \n\t"
    "pcmpgth    $f2, $f6, $f26                            \n\t"
    /* ... beta > |p1 - p0| ... */
    "psubh      $f24, $f12, $f16                          \n\t"
    "psubh      $f26, $f14, $f18                          \n\t"
    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
    "pcmpgth    $f28, $f8, $f24                           \n\t"
    "pcmpgth    $f30, $f10, $f26                          \n\t"
    /* ... beta > |q1 - q0|. */
    "gslqc1     $f26, $f24, 0x50(%[tmp])                  \n\t"
    "psubh      $f24, $f24, $f20                          \n\t"
    "psubh      $f26, $f26, $f22                          \n\t"
    "and        $f0, $f0, $f28                            \n\t"
    "and        $f2, $f2, $f30                            \n\t"
    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
    /* Spill widened q0 (Cb) into the iAlpha/iBeta GPRs; the scalar alpha and
       beta are no longer needed.  Restored further down with dmtc1. */
    "dmfc1      %[iAlpha], $f20                           \n\t"
    "dmfc1      %[iBeta], $f22                            \n\t"
    "pcmpgth    $f28, $f8, $f24                           \n\t"
    "pcmpgth    $f30, $f10, $f26                          \n\t"
    /* Cr mask (f4/f6): alpha > |p0 - q0| (overwrites the alpha vector) ... */
    "gslqc1     $f26, $f24, 0x30(%[tmp])                  \n\t"
    "gslqc1     $f22, $f20, 0x40(%[tmp])                  \n\t"
    "psubh      $f24, $f24, $f20                          \n\t"
    "psubh      $f26, $f26, $f22                          \n\t"
    WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
    "pcmpgth    $f4, $f4, $f24                            \n\t"
    "pcmpgth    $f6, $f6, $f26                            \n\t"
    /* ... beta > |p1 - p0| (Cr); also finish the Cb mask. */
    "gslqc1     $f26, $f24, 0x60(%[tmp])                  \n\t"
    "gslqc1     $f22, $f20, 0x30(%[tmp])                  \n\t"
    "psubh      $f24, $f24, $f20                          \n\t"
    "psubh      $f26, $f26, $f22                          \n\t"
    WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
    "and        $f0, $f0, $f28                            \n\t"
    "and        $f2, $f2, $f30                            \n\t"
    "pcmpgth    $f28, $f8, $f24                           \n\t"
    "pcmpgth    $f30, $f10, $f26                          \n\t"
    /* ... beta > |q1 - q0| (Cr). */
    "gslqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
    "gslqc1     $f22, $f20, 0x40(%[tmp])                  \n\t"
    "psubh      $f24, $f24, $f20                          \n\t"
    "psubh      $f26, $f26, $f22                          \n\t"
    WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
    /* $8 = 2: used both as the rounding term and as the shift count. */
    "dli        $8, 0x2                                   \n\t"
    "and        $f4, $f4, $f28                            \n\t"
    "and        $f6, $f6, $f30                            \n\t"
    "pcmpgth    $f8, $f8, $f24                            \n\t"
    "pcmpgth    $f10, $f10, $f26                          \n\t"
    "and        $f4, $f4, $f8                             \n\t"
    "and        $f6, $f6, $f10                            \n\t"
    /* Broadcast the rounding constant 2 and keep it at tmp+0x20. */
    "dmtc1      $8, $f8                                   \n\t"
    "punpcklhw  $f24, $f8, $f8                            \n\t"
    "punpcklwd  $f8, $f24, $f24                           \n\t"
    "mov.d      $f10, $f8                                 \n\t"
    "gssqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
    /* Cb p0' = (2*p1 + p0 + q1 + 2) >> 2, blended with p0 through the mask. */
    "paddh      $f8, $f12, $f12                           \n\t"
    "paddh      $f10, $f14, $f14                          \n\t"
    "paddh      $f8, $f8, $f16                            \n\t"
    "paddh      $f10, $f10, $f18                          \n\t"
    "gslqc1     $f22, $f20, 0x50(%[tmp])                  \n\t"
    "paddh      $f8, $f8, $f20                            \n\t"
    "paddh      $f10, $f10, $f22                          \n\t"
    "gslqc1     $f26, $f24, 0x20(%[tmp])                  \n\t"
    "paddh      $f8, $f8, $f24                            \n\t"
    "paddh      $f10, $f10, $f26                          \n\t"
    "dmtc1      $8, $f20                                  \n\t"
    "psrah      $f8, $f8, $f20                            \n\t"
    "psrah      $f10, $f10, $f20                          \n\t"
    "and        $f24, $f0, $f8                            \n\t"
    "and        $f26, $f2, $f10                           \n\t"
    "pandn      $f8, $f0, $f16                            \n\t"
    "pandn      $f10, $f2, $f18                           \n\t"
    "or         $f24, $f24, $f8                           \n\t"
    "or         $f26, $f26, $f10                          \n\t"
    /* Cr p0' computed the same way from the values parked in tmp. */
    "gslqc1     $f10, $f8, 0x60(%[tmp])                   \n\t"
    "paddh      $f28, $f8, $f8                            \n\t"
    "paddh      $f30, $f10, $f10                          \n\t"
    "gslqc1     $f22, $f20, 0x30(%[tmp])                  \n\t"
    "paddh      $f28, $f28, $f20                          \n\t"
    "paddh      $f30, $f30, $f22                          \n\t"
    "gslqc1     $f18, $f16, 0x70(%[tmp])                  \n\t"
    "paddh      $f28, $f28, $f16                          \n\t"
    "paddh      $f30, $f30, $f18                          \n\t"
    "gslqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
    "paddh      $f28, $f28, $f8                           \n\t"
    "paddh      $f30, $f30, $f10                          \n\t"
    "pandn      $f8, $f4, $f20                            \n\t"
    "pandn      $f10, $f6, $f22                           \n\t"
    "dmtc1      $8, $f20                                  \n\t"
    "psrah      $f28, $f28, $f20                          \n\t"
    "psrah      $f30, $f30, $f20                          \n\t"
    "and        $f16, $f4, $f28                           \n\t"
    "and        $f18, $f6, $f30                           \n\t"
    "or         $f16, $f16, $f8                           \n\t"
    "or         $f18, $f18, $f10                          \n\t"
    /* Narrow p0' back to bytes and overwrite the p0 column at tmp+0x90. */
    "gslqc1     $f10, $f8, 0x50(%[tmp])                   \n\t"
    "packushb   $f24, $f24, $f26                          \n\t"
    "packushb   $f26, $f16, $f18                          \n\t"
    "gssqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
    /* Cb q0' = (2*q1 + q0 + p1 + 2) >> 2; q0 is restored from the GPR spill. */
    "paddh      $f24, $f8, $f8                            \n\t"
    "paddh      $f26, $f10, $f10                          \n\t"
    "dmtc1      %[iAlpha], $f20                           \n\t"
    "dmtc1      %[iBeta], $f22                            \n\t"
    "gslqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
    "paddh      $f24, $f24, $f20                          \n\t"
    "paddh      $f26, $f26, $f22                          \n\t"
    "paddh      $f24, $f24, $f12                          \n\t"
    "paddh      $f26, $f26, $f14                          \n\t"
    "mov.d      $f16, $f0                                 \n\t"
    "mov.d      $f18, $f2                                 \n\t"
    "pandn      $f0, $f0, $f20                            \n\t"
    "pandn      $f2, $f2, $f22                            \n\t"
    "dmtc1      $8, $f20                                  \n\t"
    "paddh      $f24, $f24, $f8                           \n\t"
    "paddh      $f26, $f26, $f10                          \n\t"
    "psrah      $f24, $f24, $f20                          \n\t"
    "psrah      $f26, $f26, $f20                          \n\t"
    "and        $f16, $f16, $f24                          \n\t"
    "and        $f18, $f18, $f26                          \n\t"
    "or         $f16, $f16, $f0                           \n\t"
    "or         $f18, $f18, $f2                           \n\t"
    /* Cr q0' computed the same way. */
    "gslqc1     $f2, $f0, 0x70(%[tmp])                    \n\t"
    "paddh      $f20, $f0, $f0                            \n\t"
    "paddh      $f22, $f2, $f2                            \n\t"
    "gslqc1     $f2, $f0, 0x40(%[tmp])                    \n\t"
    "paddh      $f20, $f20, $f0                           \n\t"
    "paddh      $f22, $f22, $f2                           \n\t"
    "gslqc1     $f14, $f12, 0x60(%[tmp])                  \n\t"
    "paddh      $f20, $f20, $f12                          \n\t"
    "paddh      $f22, $f22, $f14                          \n\t"
    "paddh      $f20, $f20, $f8                           \n\t"
    "paddh      $f22, $f22, $f10                          \n\t"
    "dmtc1      $8, $f8                                   \n\t"
    "psrah      $f20, $f20, $f8                           \n\t"
    "psrah      $f22, $f22, $f8                           \n\t"
    "and        $f12, $f4, $f20                           \n\t"
    "and        $f14, $f6, $f22                           \n\t"
    "pandn      $f4, $f4, $f0                             \n\t"
    "pandn      $f6, $f6, $f2                             \n\t"
    "or         $f12, $f12, $f4                           \n\t"
    "or         $f14, $f14, $f6                           \n\t"
    /* Narrow q0' back to bytes and overwrite the q0 column at tmp+0xa0. */
    "packushb   $f16, $f16, $f18                          \n\t"
    "packushb   $f18, $f12, $f14                          \n\t"
    "gssqc1     $f18, $f16, 0xa0(%[tmp])                  \n\t"
    /* Reload the four columns (p1, p0', q0', q1) and transpose back to rows. */
    "gslqc1     $f2, $f0, 0x0($11)                        \n\t"
    "gslqc1     $f6, $f4, 0x10($11)                       \n\t"
    "gslqc1     $f10, $f8, 0x20($11)                      \n\t"
    "gslqc1     $f14, $f12, 0x30($11)                     \n\t"
    "mov.d      $f26, $f2                                 \n\t"
    "punpckhbh  $f2, $f0, $f4                             \n\t"
    "punpcklbh  $f0, $f0, $f4                             \n\t"
    "punpcklbh  $f24, $f26, $f6                           \n\t"
    "punpckhbh  $f26, $f26, $f6                           \n\t"
    "mov.d      $f30, $f10                                \n\t"
    "punpckhbh  $f10, $f8, $f12                           \n\t"
    "punpcklbh  $f8, $f8, $f12                            \n\t"
    "punpcklbh  $f28, $f30, $f14                          \n\t"
    "punpckhbh  $f30, $f30, $f14                          \n\t"
    "punpcklhw  $f16, $f2, $f10                           \n\t"
    "punpckhhw  $f18, $f2, $f10                           \n\t"
    "punpcklhw  $f20, $f26, $f30                          \n\t"
    "punpckhhw  $f22, $f26, $f30                          \n\t"
    "punpckhhw  $f2, $f0, $f8                             \n\t"
    "punpcklhw  $f0, $f0, $f8                             \n\t"
    "punpckhhw  $f26, $f24, $f28                          \n\t"
    "punpcklhw  $f24, $f24, $f28                          \n\t"
    "punpcklwd  $f4, $f2, $f26                            \n\t"
    "punpckhwd  $f6, $f2, $f26                            \n\t"
    "punpcklwd  $f8, $f18, $f22                           \n\t"
    "punpckhwd  $f10, $f18, $f22                          \n\t"
    "punpckhwd  $f2, $f0, $f24                            \n\t"
    "punpcklwd  $f0, $f0, $f24                            \n\t"
    "punpckhwd  $f18, $f16, $f20                          \n\t"
    "punpcklwd  $f16, $f16, $f20                          \n\t"
    /* Rows 0..7 end up in f0, f20, f4, f24, f2, f22, f6, f26, each laid out
       as [4 Cb bytes | 4 Cr bytes]. */
    "mov.d      $f20, $f2                                 \n\t"
    "mov.d      $f24, $f6                                 \n\t"
    "mov.d      $f2, $f16                                 \n\t"
    "mov.d      $f22, $f18                                \n\t"
    "mov.d      $f6, $f8                                  \n\t"
    "mov.d      $f26, $f10                                \n\t"
    /* f8 = 32: shift count that moves the Cr word into the low half.
       The iAlpha GPR is reused as a temporary here. */
    "dli        %[iAlpha], 0x20                           \n\t"
    "dmtc1      %[iAlpha], $f8                            \n\t"
    /* Rows 0-3: store the low word to Cb ... */
    "gsswlc1    $f0, 0x3($9)                              \n\t"
    "gsswrc1    $f0, 0x0($9)                              \n\t"
    "daddu      $12, $9, %[iStride]                       \n\t"
    "gsswlc1    $f20, 0x3($12)                            \n\t"
    "gsswrc1    $f20, 0x0($12)                            \n\t"
    "daddu      $12, $12, %[iStride]                      \n\t"
    "gsswlc1    $f4, 0x3($12)                             \n\t"
    "gsswrc1    $f4, 0x0($12)                             \n\t"
    "daddu      $12, $12, %[iStride]                      \n\t"
    "gsswlc1    $f24, 0x3($12)                            \n\t"
    "gsswrc1    $f24, 0x0($12)                            \n\t"
    /* ... then shift down and store the high word to Cr. */
    "dsrl       $f0, $f0, $f8                             \n\t"
    "dsrl       $f20, $f20, $f8                           \n\t"
    "dsrl       $f4, $f4, $f8                             \n\t"
    "dsrl       $f24, $f24, $f8                           \n\t"
    "gsswlc1    $f0, 0x3($10)                             \n\t"
    "gsswrc1    $f0, 0x0($10)                             \n\t"
    "daddu      $13, $10, %[iStride]                      \n\t"
    "daddu      $8, $13, %[iStride]                       \n\t"
    "gsswlc1    $f20, 0x3($13)                            \n\t"
    "gsswrc1    $f20, 0x0($13)                            \n\t"
    "daddu      $13, $8, %[iStride]                       \n\t"
    "gsswlc1    $f4, 0x3($8)                              \n\t"
    "gsswrc1    $f4, 0x0($8)                              \n\t"
    "gsswlc1    $f24, 0x3($13)                            \n\t"
    "gsswrc1    $f24, 0x0($13)                            \n\t"
    /* Rows 4-7: same pattern via the advanced pPixCb / pPixCr. */
    "gsswlc1    $f2, 0x3(%[pPixCb])                       \n\t"
    "gsswrc1    $f2, 0x0(%[pPixCb])                       \n\t"
    "daddu      $12, %[pPixCb], %[iStride]                \n\t"
    "gsswlc1    $f22, 0x3($12)                            \n\t"
    "gsswrc1    $f22, 0x0($12)                            \n\t"
    "daddu      $12, $12, %[iStride]                      \n\t"
    "gsswlc1    $f6, 0x3($12)                             \n\t"
    "gsswrc1    $f6, 0x0($12)                             \n\t"
    "daddu      $12, $12, %[iStride]                      \n\t"
    "gsswlc1    $f26, 0x3($12)                            \n\t"
    "gsswrc1    $f26, 0x0($12)                            \n\t"
    "dsrl       $f2, $f2, $f8                             \n\t"
    "dsrl       $f22, $f22, $f8                           \n\t"
    "dsrl       $f6, $f6, $f8                             \n\t"
    "dsrl       $f26, $f26, $f8                           \n\t"
    "gsswlc1    $f2, 0x3(%[pPixCr])                       \n\t"
    "gsswrc1    $f2, 0x0(%[pPixCr])                       \n\t"
    "daddu      $13, %[pPixCr], %[iStride]                \n\t"
    "daddu      $8, $13, %[iStride]                       \n\t"
    "gsswlc1    $f22, 0x3($13)                            \n\t"
    "gsswrc1    $f22, 0x0($13)                            \n\t"
    "daddu      $13, $8, %[iStride]                       \n\t"
    "gsswlc1    $f6, 0x3($8)                              \n\t"
    "gsswrc1    $f6, 0x0($8)                              \n\t"
    "gsswlc1    $f26, 0x3($13)                            \n\t"
    "gsswrc1    $f26, 0x0($13)                            \n\t"
    /* iAlpha and iBeta are written inside the asm (dmfc1 / dli above), so they
       must be read-write, early-clobber operands: declaring them input-only
       would let the compiler assume their registers are unchanged on exit or
       share them with another input. */
    : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr),
      [iAlpha]"+&r"(iAlpha), [iBeta]"+&r"(iBeta)
    : [iStride]"r"((int)iStride), [tmp]"r"((unsigned char *)tmp)
    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
      "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
      "$f24", "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}

void DeblockChromaLt4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
                           int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
  unsigned char tmp[320] __attribute__((aligned(32)));
  BACKUP_REG;
  __asm__ volatile (
    ".set       arch=loongson3a                           \n\t"
    "daddiu     %[pPixCb], %[pPixCb], -0x2                \n\t"
    "daddiu     %[pPixCr], %[pPixCr], -0x2                \n\t"
    "daddu      $8, %[pPixCb], %[iStride]                 \n\t"
    "gsldlc1    $f0, 0x7(%[pPixCb])                       \n\t"
    "gsldlc1    $f4, 0x7($8)                              \n\t"
    "gsldrc1    $f0, 0x0(%[pPixCb])                       \n\t"
    "gsldrc1    $f4, 0x0($8)                              \n\t"
    "daddu      $9, $8, %[iStride]                        \n\t"
    "daddu      $8, $9, %[iStride]                        \n\t"
    "gsldlc1    $f8, 0x7($9)                              \n\t"
    "gsldlc1    $f12, 0x7($8)                             \n\t"
    "gsldrc1    $f8, 0x0($9)                              \n\t"
    "gsldrc1    $f12, 0x0($8)                             \n\t"
    "daddu      $9, $8, %[iStride]                        \n\t"

    "daddu      $10, %[pPixCr], %[iStride]                \n\t"
    "gsldlc1    $f16, 0x7(%[pPixCr])                      \n\t"
    "gsldlc1    $f20, 0x7($10)                            \n\t"
    "gsldrc1    $f16, 0x0(%[pPixCr])                      \n\t"
    "gsldrc1    $f20, 0x0($10)                            \n\t"
    "daddu      $11, $10, %[iStride]                      \n\t"
    "daddu      $10, $11, %[iStride]                      \n\t"
    "gsldlc1    $f24, 0x7($11)                            \n\t"
    "gsldlc1    $f28, 0x7($10)                            \n\t"
    "gsldrc1    $f24, 0x0($11)                            \n\t"
    "gsldrc1    $f28, 0x0($10)                            \n\t"
    "daddu      $11, $10, %[iStride]                      \n\t"

    "punpcklwd  $f0, $f0, $f16                            \n\t"
    "punpcklwd  $f4, $f4, $f20                            \n\t"
    "punpcklwd  $f8, $f8, $f24                            \n\t"
    "punpcklwd  $f12, $f12, $f28                          \n\t"
    "gsldlc1    $f16, 0x7($9)                             \n\t"
    "gsldlc1    $f20, 0x7($11)                            \n\t"
    "gsldrc1    $f16, 0x0($9)                             \n\t"
    "gsldrc1    $f20, 0x0($11)                            \n\t"
    "punpcklwd  $f16, $f16, $f20                          \n\t"
    "mov.d      $f2, $f16                                 \n\t"
    "daddu      $8, $9, %[iStride]                        \n\t"
    "daddu      $10, $11, %[iStride]                      \n\t"
    "gsldlc1    $f16, 0x7($8)                             \n\t"
    "gsldlc1    $f20, 0x7($10)                            \n\t"
    "gsldrc1    $f16, 0x0($8)                             \n\t"
    "gsldrc1    $f20, 0x0($10)                            \n\t"
    "punpcklwd  $f16, $f16, $f20                          \n\t"
    "mov.d      $f6, $f16                                 \n\t"
    "daddu      $9, $8, %[iStride]                        \n\t"
    "daddu      $11, $10, %[iStride]                      \n\t"

    "gsldlc1    $f16, 0x7($9)                             \n\t"
    "gsldlc1    $f20, 0x7($11)                            \n\t"
    "gsldrc1    $f16, 0x0($9)                             \n\t"
    "gsldrc1    $f20, 0x0($11)                            \n\t"
    "punpcklwd  $f16, $f16, $f20                          \n\t"
    "mov.d      $f10, $f16                                \n\t"
    "daddu      $8, $9, %[iStride]                        \n\t"
    "daddu      $10, $11, %[iStride]                      \n\t"

    "gsldlc1    $f16, 0x7($8)                             \n\t"
    "gsldlc1    $f20, 0x7($10)                            \n\t"
    "gsldrc1    $f16, 0x0($8)                             \n\t"
    "gsldrc1    $f20, 0x0($10)                            \n\t"
    "punpcklwd  $f16, $f16, $f20                          \n\t"
    "mov.d      $f14, $f16                                \n\t"

    "punpcklbh  $f24, $f2, $f6                            \n\t"
    "punpckhbh  $f26, $f2, $f6                            \n\t"
    "punpckhbh  $f2, $f0, $f4                             \n\t"
    "punpcklbh  $f0, $f0, $f4                             \n\t"
    "punpcklbh  $f28, $f10, $f14                          \n\t"
    "punpckhbh  $f30, $f10, $f14                          \n\t"
    "punpckhbh  $f10, $f8, $f12                           \n\t"
    "punpcklbh  $f8, $f8, $f12                            \n\t"

    "punpcklhw  $f16, $f2, $f10                           \n\t"
    "punpckhhw  $f18, $f2, $f10                           \n\t"
    "punpckhhw  $f2, $f0, $f8                             \n\t"
    "punpcklhw  $f0, $f0, $f8                             \n\t"
    "punpcklhw  $f20, $f26, $f30                          \n\t"
    "punpckhhw  $f22, $f26, $f30                          \n\t"
    "punpckhhw  $f26, $f24, $f28                          \n\t"
    "punpcklhw  $f24, $f24, $f28                          \n\t"

    "punpcklwd  $f4, $f2, $f26                            \n\t"
    "punpckhwd  $f6, $f2, $f26                            \n\t"
    "punpckhwd  $f2, $f0, $f24                            \n\t"
    "punpcklwd  $f0, $f0, $f24                            \n\t"
    "punpcklwd  $f8, $f18, $f22                           \n\t"
    "punpckhwd  $f10, $f18, $f22                          \n\t"
    "punpckhwd  $f18, $f16, $f20                          \n\t"
    "punpcklwd  $f16, $f16, $f20                          \n\t"

    "mov.d      $f20, $f2                                 \n\t"
    "mov.d      $f22, $f18                                \n\t"
    "mov.d      $f2, $f16                                 \n\t"
    "mov.d      $f24, $f6                                 \n\t"
    "mov.d      $f26, $f10                                \n\t"
    "mov.d      $f6, $f8                                  \n\t"
    "daddiu     $11, %[tmp], 0x70                         \n\t"

    "gssqc1     $f2, $f0, 0x0($11)                        \n\t"
    "gssqc1     $f22, $f20, 0x10($11)                     \n\t"
    "gssqc1     $f6, $f4, 0x20($11)                       \n\t"
    "gssqc1     $f26, $f24, 0x30($11)                     \n\t"

    "lb         $8, 0x3(%[pTC])                           \n\t"
    "lb         $9, 0x2(%[pTC])                           \n\t"
    "lb         $10, 0x1(%[pTC])                          \n\t"
    "lb         $11, 0x0(%[pTC])                          \n\t"

    "and        $12, $8, 0xFFFF                           \n\t"
    "dmtc1      $12, $f8                                  \n\t"

    "and        $9, $9, 0xFFFF                            \n\t"
    "dmtc1      $9, $f12                                  \n\t"
    "mov.d      $f16, $f12                                \n\t"

    "and        $9, $10, 0xFFFF                           \n\t"
    "dmtc1      $9, $f20                                  \n\t"
    "xor        $f0, $f0, $f0                             \n\t"
    "mov.d      $f24, $f20                                \n\t"
    "and        $9, $11, 0xFFFF                           \n\t"
    "punpcklhw  $f24, $f24, $f8                           \n\t"

    "mov.d      $f4, $f8                                  \n\t"
    "dmtc1      $9, $f28                                  \n\t"
    "mov.d      $f0, $f28                                 \n\t"

    "punpcklhw  $f28, $f28, $f12                          \n\t"
    "punpcklhw  $f20, $f20, $f4                           \n\t"
    "xor        $f4, $f4, $f4                             \n\t"
    "xor        $f6, $f6, $f6                             \n\t"
    "punpcklhw  $f28, $f28, $f20                          \n\t"
    "gslqc1     $f22, $f20, 0xA0(%[tmp])                  \n\t"
    "punpcklhw  $f0, $f0, $f16                            \n\t"
    "punpcklhw  $f0, $f0, $f24                            \n\t"

    "gslqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
    "punpckhhw  $f2, $f0, $f28                            \n\t"
    "punpcklhw  $f0, $f0, $f28                            \n\t"
    "gslqc1     $f30, $f28, 0x80(%[tmp])                  \n\t"
    "psubh      $f8, $f4, $f0                             \n\t"
    "psubh      $f10, $f6, $f2                            \n\t"
    "gssqc1     $f10, $f8, 0xD0(%[tmp])                   \n\t"
    "dmtc1      %[iAlpha], $f8                            \n\t"
    "punpcklhw  $f12, $f8, $f8                            \n\t"
    "punpcklwd  $f16, $f12, $f12                          \n\t"
    "mov.d      $f18, $f16                                \n\t"

    "dmtc1      %[iBeta], $f8                             \n\t"
    "punpcklhw  $f12, $f8, $f8                            \n\t"
    "punpcklwd  $f8, $f12, $f12                           \n\t"
    "mov.d      $f10, $f8                                 \n\t"

    "gslqc1     $f14, $f12, 0x90(%[tmp])                  \n\t"
    "gssqc1     $f10, $f8, 0x50(%[tmp])                   \n\t"
    "punpckhbh  $f10, $f24, $f4                           \n\t"
    "punpcklbh  $f8, $f24, $f4                            \n\t"
    "punpcklbh  $f24, $f26, $f6                           \n\t"
    "punpckhbh  $f26, $f26, $f6                           \n\t"

    "gssqc1     $f10, $f8, 0x40(%[tmp])                   \n\t"
    "gssqc1     $f26, $f24, 0xB0(%[tmp])                  \n\t"
    "gslqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
    "punpcklbh  $f8, $f28, $f4                            \n\t"
    "punpckhbh  $f10, $f28, $f4                           \n\t"
    "punpcklbh  $f28, $f30, $f6                           \n\t"
    "punpckhbh  $f30, $f30, $f6                           \n\t"
    "punpcklbh  $f24, $f26, $f6                           \n\t"
    "punpckhbh  $f26, $f26, $f6                           \n\t"
    "punpckhbh  $f14, $f12, $f4                           \n\t"
    "punpcklbh  $f12, $f12, $f4                           \n\t"
    "punpckhbh  $f22, $f20, $f4                           \n\t"
    "punpcklbh  $f20, $f20, $f4                           \n\t"
    "gssqc1     $f30, $f28, 0xF0(%[tmp])                  \n\t"
    "gssqc1     $f26, $f24, 0xC0(%[tmp])                  \n\t"
    "gslqc1     $f26, $f24, 0xA0(%[tmp])                  \n\t"
    "punpcklbh  $f24, $f26, $f6                           \n\t"
    "punpckhbh  $f26, $f26, $f6                           \n\t"

    "dli        $13, 0x4                                  \n\t"
    "gssqc1     $f26, $f24, 0xE0(%[tmp])                  \n\t"
    "dmtc1      $13, $f24                                 \n\t"
    "punpcklhw  $f28, $f24, $f24                          \n\t"
    "punpcklwd  $f24, $f28, $f28                          \n\t"
    "mov.d      $f26, $f24                                \n\t"
    "dli        $12, 0x2                                  \n\t"
    "dli        $13, 0x3                                  \n\t"

    "gssqc1     $f2, $f0, 0x20(%[tmp])                    \n\t"
    "dmfc1      %[iAlpha], $f0                            \n\t"
    "dmfc1      %[iBeta], $f2                             \n\t"
    "gssqc1     $f26, $f24, 0x30(%[tmp])                  \n\t"
    "gslqc1     $f30, $f28, 0x40(%[tmp])                  \n\t"
    "psubh      $f28, $f28, $f20                          \n\t"
    "psubh      $f30, $f30, $f22                          \n\t"
    "pcmpgth    $f24, $f0, $f4                            \n\t"
    "pcmpgth    $f26, $f2, $f6                            \n\t"

    "dmtc1      $12, $f0                                  \n\t"
    "dmtc1      $13, $f2                                  \n\t"
    "gssqc1     $f26, $f24, 0x60(%[tmp])                  \n\t"
    "gslqc1     $f6, $f4, 0xD0(%[tmp])                    \n\t"
    "psubh      $f24, $f12, $f8                           \n\t"
    "psubh      $f26, $f14, $f10                          \n\t"
    "psllh      $f24, $f24, $f0                           \n\t"
    "psllh      $f26, $f26, $f0                           \n\t"
    "paddh      $f24, $f24, $f28                          \n\t"
    "paddh      $f26, $f26, $f30                          \n\t"
    "gslqc1     $f30, $f28, 0x30(%[tmp])                  \n\t"
    "paddh      $f24, $f24, $f28                          \n\t"
    "paddh      $f26, $f26, $f30                          \n\t"
    "psrah      $f24, $f24, $f2                           \n\t"
    "psrah      $f26, $f26, $f2                           \n\t"
    "pmaxsh     $f4, $f4, $f24                            \n\t"
    "pmaxsh     $f6, $f6, $f26                            \n\t"

    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
    "gslqc1     $f26, $f24, 0x20(%[tmp])                  \n\t"
    "pminsh     $f24, $f24, $f4                           \n\t"
    "pminsh     $f26, $f26, $f6                           \n\t"

    "gssqc1     $f26, $f24, 0x20(%[tmp])                  \n\t"
    "psubh      $f4, $f8, $f12                            \n\t"
    "psubh      $f6, $f10, $f14                           \n\t"
    WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
    "pcmpgth    $f24, $f16, $f4                           \n\t"
    "pcmpgth    $f26, $f18, $f6                           \n\t"
    "gslqc1     $f6, $f4, 0x40(%[tmp])                    \n\t"
    "psubh      $f4, $f4, $f8                             \n\t"
    "psubh      $f6, $f6, $f10                            \n\t"
    WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
    "pcmpgth    $f28, $f28, $f4                           \n\t"
    "pcmpgth    $f30, $f30, $f6                           \n\t"

    "gslqc1     $f6, $f4, 0x50(%[tmp])                    \n\t"
    "and        $f24, $f24, $f28                          \n\t"
    "and        $f26, $f26, $f30                          \n\t"
    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
    "psubh      $f20, $f20, $f12                          \n\t"
    "psubh      $f22, $f22, $f14                          \n\t"
    WELS_AbsH($f20, $f22, $f20, $f22, $f0, $f2)
    "pcmpgth    $f4, $f4, $f20                            \n\t"
    "pcmpgth    $f6, $f6, $f22                            \n\t"

    "gslqc1     $f22, $f20, 0xB0(%[tmp])                  \n\t"
    "gslqc1     $f2, $f0, 0xE0(%[tmp])                    \n\t"
    "psubh      $f20, $f20, $f0                           \n\t"
    "psubh      $f22, $f22, $f2                           \n\t"
    "and        $f24, $f24, $f4                           \n\t"
    "and        $f26, $f26, $f6                           \n\t"
    "gslqc1     $f2, $f0, 0x60(%[tmp])                    \n\t"
    "and        $f24, $f24, $f0                           \n\t"
    "and        $f26, $f26, $f2                           \n\t"

    "gslqc1     $f6, $f4, 0x20(%[tmp])                    \n\t"
    "and        $f4, $f4, $f24                            \n\t"
    "and        $f6, $f6, $f26                            \n\t"
    "gslqc1     $f26, $f24, 0xC0(%[tmp])                  \n\t"
    "gssqc1     $f6, $f4, 0x40(%[tmp])                    \n\t"
    "gslqc1     $f6, $f4, 0xF0(%[tmp])                    \n\t"

    "dmtc1      $12, $f0                                  \n\t"
    "psubh      $f24, $f24, $f4                           \n\t"
    "psubh      $f26, $f26, $f6                           \n\t"
    "psllh      $f24, $f24, $f0                           \n\t"
    "psllh      $f26, $f26, $f0                           \n\t"
    "paddh      $f24, $f24, $f20                          \n\t"
    "paddh      $f26, $f26, $f22                          \n\t"
    "gslqc1     $f2, $f0, 0x30(%[tmp])                    \n\t"
    "paddh      $f24, $f24, $f0                           \n\t"
    "paddh      $f26, $f26, $f2                           \n\t"
    "dmtc1      %[iBeta], $f2                             \n\t"

    "dmtc1      $13, $f0                                  \n\t"
    "gslqc1     $f22, $f20, 0xD0(%[tmp])                  \n\t"
    "psrah      $f24, $f24, $f0                           \n\t"
    "psrah      $f26, $f26, $f0                           \n\t"
    "dmtc1      %[iAlpha], $f0                            \n\t"
    "pmaxsh     $f20, $f20, $f24                          \n\t"
    "pmaxsh     $f22, $f22, $f26                          \n\t"
    "pminsh     $f0, $f0, $f20                            \n\t"
    "pminsh     $f2, $f2, $f22                            \n\t"

    "dmfc1      %[iAlpha], $f0                            \n\t"
    "dmfc1      %[iBeta], $f2                             \n\t"
    "gslqc1     $f22, $f20, 0xC0(%[tmp])                  \n\t"
    "psubh      $f24, $f4, $f20                           \n\t"
    "psubh      $f26, $f6, $f22                           \n\t"
    WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
    "pcmpgth    $f16, $f16, $f24                          \n\t"
    "pcmpgth    $f18, $f18, $f26                          \n\t"

    "gslqc1     $f26, $f24, 0xB0(%[tmp])                  \n\t"
    "psubh      $f24, $f24, $f4                           \n\t"
    "psubh      $f26, $f26, $f6                           \n\t"
    WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
    "pcmpgth    $f28, $f28, $f24                          \n\t"
    "pcmpgth    $f30, $f30, $f26                          \n\t"

    "gslqc1     $f26, $f24, 0xE0(%[tmp])                  \n\t"
    "and        $f16, $f16, $f28                          \n\t"
    "and        $f18, $f18, $f30                          \n\t"

    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
    "psubh      $f24, $f24, $f20                          \n\t"
    "psubh      $f26, $f26, $f22                          \n\t"
    WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
    "pcmpgth    $f28, $f28, $f24                          \n\t"
    "pcmpgth    $f30, $f30, $f26                          \n\t"
    "and        $f16, $f16, $f28                          \n\t"
    "and        $f18, $f18, $f30                          \n\t"
    "gslqc1     $f30, $f28, 0x60(%[tmp])                  \n\t"
    "dmtc1      %[iAlpha], $f0                            \n\t"
    "dmtc1      %[iBeta], $f2                             \n\t"
    "and        $f16, $f16, $f28                          \n\t"
    "and        $f18, $f18, $f30                          \n\t"
    "and        $f0, $f0, $f16                            \n\t"
    "and        $f2, $f2, $f18                            \n\t"

    "gslqc1     $f18, $f16, 0x40(%[tmp])                  \n\t"
    "paddh      $f8, $f8, $f16                            \n\t"
    "paddh      $f10, $f10, $f18                          \n\t"
    "paddh      $f4, $f4, $f0                             \n\t"
    "paddh      $f6, $f6, $f2                             \n\t"
    "psubh      $f12, $f12, $f16                          \n\t"
    "psubh      $f14, $f14, $f18                          \n\t"
    "psubh      $f20, $f20, $f0                           \n\t"
    "psubh      $f22, $f22, $f2                           \n\t"
    "packushb   $f8, $f8, $f10                            \n\t"
    "packushb   $f10, $f4, $f6                            \n\t"
    "packushb   $f12, $f12, $f14                          \n\t"
    "packushb   $f14, $f20, $f22                          \n\t"

    "gssqc1     $f10, $f8, 0x80(%[tmp])                   \n\t"
    "gssqc1     $f14, $f12, 0x90(%[tmp])                  \n\t"
    "daddiu     $11, %[tmp], 0x70                         \n\t"

    "gslqc1     $f2, $f0, 0x0($11)                        \n\t"
    "gslqc1     $f6, $f4, 0x10($11)                       \n\t"
    "gslqc1     $f10, $f8, 0x20($11)                      \n\t"
    "gslqc1     $f14, $f12, 0x30($11)                     \n\t"

    "punpcklbh  $f24, $f2, $f6                            \n\t"
    "punpckhbh  $f26, $f2, $f6                            \n\t"
    "punpckhbh  $f2, $f0, $f4                             \n\t"
    "punpcklbh  $f0, $f0, $f4                             \n\t"

    "punpcklbh  $f28, $f10, $f14                          \n\t"
    "punpckhbh  $f30, $f10, $f14                          \n\t"
    "punpckhbh  $f10, $f8, $f12                           \n\t"
    "punpcklbh  $f8, $f8, $f12                            \n\t"

    "punpcklhw  $f16, $f2, $f10                           \n\t"
    "punpckhhw  $f18, $f2, $f10                           \n\t"
    "punpckhhw  $f2, $f0, $f8                             \n\t"
    "punpcklhw  $f0, $f0, $f8                             \n\t"
    "punpcklhw  $f20, $f26, $f30                          \n\t"
    "punpckhhw  $f22, $f26, $f30                          \n\t"
    "punpckhhw  $f26, $f24, $f28                          \n\t"
    "punpcklhw  $f24, $f24, $f28                          \n\t"

    "punpcklwd  $f4, $f2, $f26                            \n\t"
    "punpckhwd  $f6, $f2, $f26                            \n\t"
    "punpckhwd  $f2, $f0, $f24                            \n\t"
    "punpcklwd  $f0, $f0, $f24                            \n\t"
    "punpcklwd  $f8, $f18, $f22                           \n\t"
    "punpckhwd  $f10, $f18, $f22                          \n\t"
    "punpckhwd  $f18, $f16, $f20                          \n\t"
    "punpcklwd  $f16, $f16, $f20                          \n\t"

    "mov.d      $f20, $f2                                 \n\t"
    "mov.d      $f22, $f18                                \n\t"
    "mov.d      $f2, $f16                                 \n\t"
    "mov.d      $f24, $f6                                 \n\t"
    "mov.d      $f26, $f10                                \n\t"
    "mov.d      $f6, $f8                                  \n\t"

    "dli        %[iAlpha], 0x20                           \n\t"
    "daddu      $8, %[pPixCb], %[iStride]                 \n\t"
    "gsswlc1    $f0, 0x3(%[pPixCb])                       \n\t"
    "gsswlc1    $f20, 0x3($8)                             \n\t"
    "gsswrc1    $f0, 0x0(%[pPixCb])                       \n\t"
    "gsswrc1    $f20, 0x0($8)                             \n\t"
    "daddu      $9, $8, %[iStride]                        \n\t"
    "daddu      $8, $9, %[iStride]                        \n\t"
    "gsswlc1    $f4, 0x3($9)                              \n\t"
    "gsswlc1    $f24, 0x3($8)                             \n\t"
    "gsswrc1    $f4, 0x0($9)                              \n\t"
    "gsswrc1    $f24, 0x0($8)                             \n\t"
    "daddu      $9, $8, %[iStride]                        \n\t"
    "dmtc1      %[iAlpha], $f8                            \n\t"

    "dsrl       $f0, $f0, $f8                             \n\t"
    "dsrl       $f20, $f20, $f8                           \n\t"
    "dsrl       $f4, $f4, $f8                             \n\t"
    "dsrl       $f24, $f24, $f8                           \n\t"
    "daddu      $10, %[pPixCr], %[iStride]                \n\t"
    "gsswlc1    $f0, 0x3(%[pPixCr])                       \n\t"
    "gsswlc1    $f20, 0x3($10)                            \n\t"
    "gsswrc1    $f0, 0x0(%[pPixCr])                       \n\t"
    "gsswrc1    $f20, 0x0($10)                            \n\t"
    "daddu      $11, $10, %[iStride]                      \n\t"
    "daddu      $10, $11, %[iStride]                      \n\t"
    "gsswlc1    $f4, 0x3($11)                             \n\t"
    "gsswlc1    $f24, 0x3($10)                            \n\t"
    "gsswrc1    $f4, 0x0($11)                             \n\t"
    "gsswrc1    $f24, 0x0($10)                            \n\t"
    "daddu      $11, $10, %[iStride]                      \n\t"

    "daddu      $8, $9, %[iStride]                        \n\t"
    "gsswlc1    $f2, 0x3($9)                              \n\t"
    "gsswlc1    $f22, 0x3($8)                             \n\t"
    "gsswrc1    $f2, 0x0($9)                              \n\t"
    "gsswrc1    $f22, 0x0($8)                             \n\t"
    "daddu      $9, $8, %[iStride]                        \n\t"
    "daddu      $8, $9, %[iStride]                        \n\t"
    "gsswlc1    $f6, 0x3($9)                              \n\t"
    "gsswlc1    $f26, 0x3($8)                             \n\t"
    "gsswrc1    $f6, 0x0($9)                              \n\t"
    "gsswrc1    $f26, 0x0($8)                             \n\t"

    "dsrl       $f2, $f2, $f8                             \n\t"
    "dsrl       $f22, $f22, $f8                           \n\t"
    "dsrl       $f6, $f6, $f8                             \n\t"
    "dsrl       $f26, $f26, $f8                           \n\t"
    "daddu      $10, $11, %[iStride]                      \n\t"
    "gsswlc1    $f2, 0x3($11)                             \n\t"
    "gsswlc1    $f22, 0x3($10)                            \n\t"
    "gsswrc1    $f2, 0x0($11)                             \n\t"
    "gsswrc1    $f22, 0x0($10)                            \n\t"
    "daddu      $11, $10, %[iStride]                      \n\t"
    "daddu      $10, $11, %[iStride]                      \n\t"
    "gsswlc1    $f6, 0x3($11)                             \n\t"
    "gsswlc1    $f26, 0x3($10)                            \n\t"
    "gsswrc1    $f6, 0x0($11)                             \n\t"
    "gsswrc1    $f26, 0x0($10)                            \n\t"
    : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
    : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
      [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp), [pTC]"r"((char *)pTC)
    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
      "$f6", "$f8", "$f10", "$f12","$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
      "$f26", "$f28", "$f30"
  );
  RECOVER_REG;
}

/*!
 * \brief   Clamp a table of non-zero counts to 0/1 flags, in place.
 *
 * Reads 24 consecutive bytes starting at pNonZeroCount, replaces every byte
 * with the unsigned minimum of that byte and 1 (so 0 stays 0 and any other
 * bit pattern becomes 1), and writes the 24 bytes back to the same location.
 *
 * \param   pNonZeroCount  buffer of at least 24 bytes; read and overwritten.
 *                         Accessed with left/right (unaligned) load/store
 *                         pairs, so presumably no alignment is required --
 *                         TODO confirm against callers.
 *
 * Scratch registers $8 and $f0-$f8 are used and listed as clobbers.
 */
void WelsNonZeroCount_mmi(int8_t *pNonZeroCount) {
  __asm__ volatile(
    ".set       arch=loongson3a                 \n\t"
    /* Load bytes 0..7 -> $f0, 8..15 -> $f2, 16..23 -> $f4. Each 64-bit value
     * is assembled from a "left" (high offset) and a "right" (low offset)
     * partial load. */
    "gsldlc1    $f0, 0x7(%[pNonZeroCount])      \n\t"
    "gsldlc1    $f2, 0xF(%[pNonZeroCount])      \n\t"
    "gsldlc1    $f4, 0x17(%[pNonZeroCount])     \n\t"
    "gsldrc1    $f4, 0x10(%[pNonZeroCount])     \n\t"
    "gsldrc1    $f0, 0x0(%[pNonZeroCount])      \n\t"
    "gsldrc1    $f2, 0x8(%[pNonZeroCount])      \n\t"
    /* Build the constant 0x01 in every byte of $f8:
     *   compare $f8 with itself  -> every halfword is 0xFFFF
     *   logical shift right by 15 -> every halfword is 0x0001
     *   pack halfwords to bytes   -> every byte is 0x01 */
    "pcmpeqh    $f8, $f8, $f8                   \n\t"
    "dli        $8, 0xF                         \n\t"
    "dmtc1      $8, $f6                         \n\t"
    "psrlh      $f8, $f8, $f6                   \n\t"
    "packushb   $f8, $f8, $f8                   \n\t"

    /* Unsigned per-byte minimum against 0x01: byte = (byte != 0) ? 1 : 0. */
    "pminub     $f0, $f0, $f8                   \n\t"
    "pminub     $f2, $f2, $f8                   \n\t"
    "pminub     $f4, $f4, $f8                   \n\t"
    /* Store the three clamped 8-byte groups back over the input, again as
     * left/right partial-store pairs. */
    "gssdlc1    $f0, 0x7(%[pNonZeroCount])      \n\t"
    "gssdlc1    $f2, 0xF(%[pNonZeroCount])      \n\t"
    "gssdlc1    $f4, 0x17(%[pNonZeroCount])     \n\t"
    "gssdrc1    $f0, 0x0(%[pNonZeroCount])      \n\t"
    "gssdrc1    $f2, 0x8(%[pNonZeroCount])      \n\t"
    "gssdrc1    $f4, 0x10(%[pNonZeroCount])     \n\t"
    :
    /* The pointer itself is input-only; the buffer it addresses is modified,
     * which is covered by the "memory" clobber below. */
    : [pNonZeroCount] "r"((unsigned char *)pNonZeroCount)
    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8"
  );
}