ref: aaa5bcc1579d6c7177afb6cf09e178df2cb6f9b8
parent: 127749c454c1fb4b23a9e391000e255bd0521dd8
parent: 720f8dcc525c2fef52518080e6e26b353a535abf
author: Ethan Hugg <ethanhugg@gmail.com>
date: Tue Jun 17 04:33:46 EDT 2014
Merge pull request #975 from mstorsjo/cleanup-asm

Clean up the aarch64 deblocking assembly
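
Most of the diff is whitespace normalization: macro bodies and instruction
operands are re-indented consistently, with no change to the instructions
themselves. The functional part of the cleanup is confined to the non-Apple
branch, where every macro-argument reference of the form \argN.16b (and
likewise .8h, .8b, .d[0], ...) becomes \argN\().16b. In GNU as, \() marks
the end of a macro argument name, so the assembler substitutes the argument
and then appends the suffix, instead of leaving the reference ambiguous or
unexpanded on assemblers that tokenize the '.' differently. A minimal
sketch of the pattern, using a hypothetical ZERO16 macro rather than one
from this file:

    // Illustrative macro, not part of this patch. Apple's assembler
    // numbers macro arguments ($0, $1, ...), which is why this file keeps
    // two copies of every macro behind #ifdef __APPLE__; GNU as names the
    // arguments and needs \() between the name and a register suffix.
    #ifdef __APPLE__
    .macro ZERO16                      // $0: a vN vector register
        eor $0.16b, $0.16b, $0.16b     // clear all 16 bytes of $0
    .endm
    #else
    .macro ZERO16 arg0
        eor \arg0\().16b, \arg0\().16b, \arg0\().16b
    .endm
    #endif

    ZERO16 v0    // expands to: eor v0.16b, v0.16b, v0.16b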
--- a/codec/common/arm64/deblocking_aarch64_neon.S
+++ b/codec/common/arm64/deblocking_aarch64_neon.S
@@ -1,35 +1,35 @@
/*!
-* \copy
-* Copyright (c) 2013, Cisco Systems
-* All rights reserved.
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
-* Redistribution and use in source and binary forms, with or without
-* modification, are permitted provided that the following conditions
-* are met:
-
-* * Redistributions of source code must retain the above copyright
-* notice, this list of conditions and the following disclaimer.
-
-* * Redistributions in binary form must reproduce the above copyright
-* notice, this list of conditions and the following disclaimer in
-* the documentation and/or other materials provided with the
-* distribution.
-
-* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-* POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
#ifdef HAVE_NEON_AARCH64
.text
@@ -36,1080 +36,1080 @@
#include "arm_arch64_common_macro.S"
#ifdef __APPLE__
-.macro MASK_MATRIX
- uabd $6.16b, $1.16b, $2.16b
- cmhi $6.16b, $4.16b, $6.16b
+.macro MASK_MATRIX
+ uabd $6.16b, $1.16b, $2.16b
+ cmhi $6.16b, $4.16b, $6.16b
- uabd $4.16b, $0.16b, $1.16b
- cmhi $4.16b, $5.16b, $4.16b
- and $6.16b, $6.16b, $4.16b
+ uabd $4.16b, $0.16b, $1.16b
+ cmhi $4.16b, $5.16b, $4.16b
+ and $6.16b, $6.16b, $4.16b
- uabd $4.16b, $3.16b, $2.16b
- cmhi $4.16b, $5.16b, $4.16b
- and $6.16b, $6.16b, $4.16b
+ uabd $4.16b, $3.16b, $2.16b
+ cmhi $4.16b, $5.16b, $4.16b
+ and $6.16b, $6.16b, $4.16b
.endm
-.macro DIFF_LUMA_LT4_P1_Q1 //(Use Tmp v23, v24)
- //v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
- urhadd $8.16b, $2.16b, $3.16b
- uhadd $8.16b, $0.16b, $8.16b
- usubl $9.8h, $8.8b, $1.8b
- sqxtn $9.8b, $9.8h
- usubl2 $8.8h, $8.16b, $1.16b
- sqxtn2 $9.16b, $8.8h
- smax $8.16b, $9.16b, $5.16b
+.macro DIFF_LUMA_LT4_P1_Q1 //(Use Tmp v23, v24)
+ //v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
+ urhadd $8.16b, $2.16b, $3.16b
+ uhadd $8.16b, $0.16b, $8.16b
+ usubl $9.8h, $8.8b, $1.8b
+ sqxtn $9.8b, $9.8h
+ usubl2 $8.8h, $8.16b, $1.16b
+ sqxtn2 $9.16b, $8.8h
+ smax $8.16b, $9.16b, $5.16b
//
- smin $8.16b, $8.16b, $6.16b
- uabd $9.16b, $0.16b, $2.16b
- cmhi $9.16b, $4.16b, $9.16b
- and $8.16b, $8.16b, $9.16b
- and $8.16b, $8.16b, $7.16b
- add $8.16b, $1.16b, $8.16b
- abs $9.16b, $9.16b
+ smin $8.16b, $8.16b, $6.16b
+ uabd $9.16b, $0.16b, $2.16b
+ cmhi $9.16b, $4.16b, $9.16b
+ and $8.16b, $8.16b, $9.16b
+ and $8.16b, $8.16b, $7.16b
+ add $8.16b, $1.16b, $8.16b
+ abs $9.16b, $9.16b
.endm
-.macro DIFF_LUMA_LT4_P0_Q0_1
- usubl $5.8h, $0.8b, $3.8b
- usubl $6.8h, $2.8b, $1.8b
- shl $6.8h, $6.8h, #2
- add $5.8h, $5.8h, $6.8h
- sqrshrn $4.8b, $5.8h, #3
+.macro DIFF_LUMA_LT4_P0_Q0_1
+ usubl $5.8h, $0.8b, $3.8b
+ usubl $6.8h, $2.8b, $1.8b
+ shl $6.8h, $6.8h, #2
+ add $5.8h, $5.8h, $6.8h
+ sqrshrn $4.8b, $5.8h, #3
.endm
-.macro DIFF_LUMA_LT4_P0_Q0_2
- usubl2 $5.8h, $0.16b, $3.16b
- usubl2 $6.8h, $2.16b, $1.16b
- shl $6.8h, $6.8h, #2
- add $5.8h, $5.8h, $6.8h
- sqrshrn2 $4.16b, $5.8h, #3
+.macro DIFF_LUMA_LT4_P0_Q0_2
+ usubl2 $5.8h, $0.16b, $3.16b
+ usubl2 $6.8h, $2.16b, $1.16b
+ shl $6.8h, $6.8h, #2
+ add $5.8h, $5.8h, $6.8h
+ sqrshrn2 $4.16b, $5.8h, #3
.endm
-.macro EXTRACT_DELTA_INTO_TWO_PART
- cmge $1.16b, $0.16b, #0
- and $1.16b, $0.16b, $1.16b
- sub $0.16b, $1.16b, $0.16b
+.macro EXTRACT_DELTA_INTO_TWO_PART
+ cmge $1.16b, $0.16b, #0
+ and $1.16b, $0.16b, $1.16b
+ sub $0.16b, $1.16b, $0.16b
.endm
-.macro DIFF_LUMA_EQ4_P2P1P0_1
- uaddl $8.8h, $1.8b, $2.8b
- uaddl $9.8h, $3.8b, $4.8b
- add $9.8h, $9.8h, $8.8h
+.macro DIFF_LUMA_EQ4_P2P1P0_1
+ uaddl $8.8h, $1.8b, $2.8b
+ uaddl $9.8h, $3.8b, $4.8b
+ add $9.8h, $9.8h, $8.8h
- uaddl $8.8h, $0.8b, $1.8b
- shl $8.8h, $8.8h, #1
- add $8.8h, $9.8h, $8.8h
+ uaddl $8.8h, $0.8b, $1.8b
+ shl $8.8h, $8.8h, #1
+ add $8.8h, $9.8h, $8.8h
- rshrn $0.8b, $9.8h, #2
- rshrn $7.8b, $8.8h, #3
- shl $9.8h, $9.8h, #1
- usubl $8.8h, $5.8b, $1.8b
- add $9.8h, $8.8h, $9.8h
+ rshrn $0.8b, $9.8h, #2
+ rshrn $7.8b, $8.8h, #3
+ shl $9.8h, $9.8h, #1
+ usubl $8.8h, $5.8b, $1.8b
+ add $9.8h, $8.8h, $9.8h
- uaddl $8.8h, $2.8b, $5.8b
- uaddw $8.8h, $8.8h, $2.8b
- uaddw $8.8h, $8.8h, $3.8b
+ uaddl $8.8h, $2.8b, $5.8b
+ uaddw $8.8h, $8.8h, $2.8b
+ uaddw $8.8h, $8.8h, $3.8b
- rshrn $9.8b, $9.8h, #3
- rshrn $8.8b, $8.8h, #2
- bsl $6.8b, $9.8b, $8.8b
+ rshrn $9.8b, $9.8h, #3
+ rshrn $8.8b, $8.8h, #2
+ bsl $6.8b, $9.8b, $8.8b
.endm
-.macro DIFF_LUMA_EQ4_P2P1P0_2
- uaddl2 $8.8h, $1.16b, $2.16b
- uaddl2 $9.8h, $3.16b, $4.16b
- add $9.8h, $9.8h, $8.8h
+.macro DIFF_LUMA_EQ4_P2P1P0_2
+ uaddl2 $8.8h, $1.16b, $2.16b
+ uaddl2 $9.8h, $3.16b, $4.16b
+ add $9.8h, $9.8h, $8.8h
- uaddl2 $8.8h, $0.16b, $1.16b
- shl $8.8h, $8.8h, #1
- add $8.8h, $9.8h, $8.8h
+ uaddl2 $8.8h, $0.16b, $1.16b
+ shl $8.8h, $8.8h, #1
+ add $8.8h, $9.8h, $8.8h
- rshrn2 $0.16b, $9.8h, #2
- rshrn2 $7.16b, $8.8h, #3
- shl $9.8h, $9.8h, #1
- usubl2 $8.8h, $5.16b, $1.16b
- add $9.8h, $8.8h, $9.8h
+ rshrn2 $0.16b, $9.8h, #2
+ rshrn2 $7.16b, $8.8h, #3
+ shl $9.8h, $9.8h, #1
+ usubl2 $8.8h, $5.16b, $1.16b
+ add $9.8h, $8.8h, $9.8h
- uaddl2 $8.8h, $2.16b, $5.16b
- uaddw2 $8.8h, $8.8h, $2.16b
- uaddw2 $8.8h, $8.8h, $3.16b
+ uaddl2 $8.8h, $2.16b, $5.16b
+ uaddw2 $8.8h, $8.8h, $2.16b
+ uaddw2 $8.8h, $8.8h, $3.16b
- rshrn2 $9.16b, $9.8h, #3
- rshrn2 $8.16b, $8.8h, #2
- bsl $6.16b, $9.16b, $8.16b
+ rshrn2 $9.16b, $9.8h, #3
+ rshrn2 $8.16b, $8.8h, #2
+ bsl $6.16b, $9.16b, $8.16b
.endm
-.macro DIFF_CHROMA_EQ4_P0Q0_1
- uaddl $4.8h, $0.8b, $3.8b
- shl $4.8h, $4.8h, #1
- usubl $5.8h, $1.8b, $3.8b
- add $5.8h, $5.8h, $4.8h
- rshrn $6.8b, $5.8h, #2
- usubl $5.8h, $2.8b, $0.8b
- add $5.8h, $5.8h, $4.8h
- rshrn $7.8b, $5.8h, #2
+.macro DIFF_CHROMA_EQ4_P0Q0_1
+ uaddl $4.8h, $0.8b, $3.8b
+ shl $4.8h, $4.8h, #1
+ usubl $5.8h, $1.8b, $3.8b
+ add $5.8h, $5.8h, $4.8h
+ rshrn $6.8b, $5.8h, #2
+ usubl $5.8h, $2.8b, $0.8b
+ add $5.8h, $5.8h, $4.8h
+ rshrn $7.8b, $5.8h, #2
.endm
-.macro DIFF_CHROMA_EQ4_P0Q0_2
- uaddl2 $4.8h, $0.16b, $3.16b
- shl $4.8h, $4.8h, #1
- usubl2 $5.8h, $1.16b, $3.16b
- add $5.8h, $5.8h, $4.8h
- rshrn2 $6.16b, $5.8h, #2
- usubl2 $5.8h, $2.16b, $0.16b
- add $5.8h, $5.8h, $4.8h
- rshrn2 $7.16b, $5.8h, #2
+.macro DIFF_CHROMA_EQ4_P0Q0_2
+ uaddl2 $4.8h, $0.16b, $3.16b
+ shl $4.8h, $4.8h, #1
+ usubl2 $5.8h, $1.16b, $3.16b
+ add $5.8h, $5.8h, $4.8h
+ rshrn2 $6.16b, $5.8h, #2
+ usubl2 $5.8h, $2.16b, $0.16b
+ add $5.8h, $5.8h, $4.8h
+ rshrn2 $7.16b, $5.8h, #2
.endm
-.macro DIFF_LUMA_EQ4_MASK
- mov.16b $3, $2
- bsl $3.16b, $0.16b, $1.16b
+.macro DIFF_LUMA_EQ4_MASK
+ mov.16b $3, $2
+ bsl $3.16b, $0.16b, $1.16b
.endm
-.macro LOAD_LUMA_DATA_3
- ld3 {$0.b, $1.b, $2.b} [$6], [x2], x1
- ld3 {$3.b, $4.b, $5.b} [$6], [x0], x1
+.macro LOAD_LUMA_DATA_3
+ ld3 {$0.b, $1.b, $2.b} [$6], [x2], x1
+ ld3 {$3.b, $4.b, $5.b} [$6], [x0], x1
.endm
-.macro LOAD_LUMA_DATA_4
- ld4 {$0.b, $1.b, $2.b, $3.b} [$8], [x3], x1
- ld4 {$4.b, $5.b, $6.b, $7.b} [$8], [x0], x1
+.macro LOAD_LUMA_DATA_4
+ ld4 {$0.b, $1.b, $2.b, $3.b} [$8], [x3], x1
+ ld4 {$4.b, $5.b, $6.b, $7.b} [$8], [x0], x1
.endm
-.macro STORE_LUMA_DATA_4
- st4 {$0.b, $1.b, $2.b, $3.b} [$4], [x0], x1
- st4 {$0.b, $1.b, $2.b, $3.b} [$5], [x2], x1
+.macro STORE_LUMA_DATA_4
+ st4 {$0.b, $1.b, $2.b, $3.b} [$4], [x0], x1
+ st4 {$0.b, $1.b, $2.b, $3.b} [$5], [x2], x1
.endm
-.macro STORE_LUMA_DATA_3
- st3 {$0.b, $1.b, $2.b} [$6], [x3], x1
- st3 {$3.b, $4.b, $5.b} [$6], [x0], x1
+.macro STORE_LUMA_DATA_3
+ st3 {$0.b, $1.b, $2.b} [$6], [x3], x1
+ st3 {$3.b, $4.b, $5.b} [$6], [x0], x1
.endm
-.macro LOAD_CHROMA_DATA_4
- ld4 {$0.b, $1.b, $2.b, $3.b} [$5], [$4], x2
+.macro LOAD_CHROMA_DATA_4
+ ld4 {$0.b, $1.b, $2.b, $3.b} [$5], [$4], x2
.endm
-.macro STORE_CHROMA_DATA_2
- st2 {$0.b, $1.b} [$3], [$2], x2
+.macro STORE_CHROMA_DATA_2
+ st2 {$0.b, $1.b} [$3], [$2], x2
.endm
-.macro ZERO_JUMP_END
- mov $1, $0.d[0]
- mov $2, $0.d[1]
- orr $1, $1, $2
- cbz $1, $3
+.macro ZERO_JUMP_END
+ mov $1, $0.d[0]
+ mov $2, $0.d[1]
+ orr $1, $1, $2
+ cbz $1, $3
.endm
.macro BS_NZC_CHECK
- ld1 {v0.16b}, [$0]
- //Arrange the input data --- TOP
- ands x6, $1, #2
- cbz x6, bs_nzc_check_jump0
- sub x6, $0, $2, lsl #4
- sub x6, x6, $2, lsl #3
- add x6, x6, #12
- ld1 {v1.s} [3], [x6]
+ ld1 {v0.16b}, [$0]
+ //Arrange the input data --- TOP
+ ands x6, $1, #2
+ cbz x6, bs_nzc_check_jump0
+ sub x6, $0, $2, lsl #4
+ sub x6, x6, $2, lsl #3
+ add x6, x6, #12
+ ld1 {v1.s} [3], [x6]
- bs_nzc_check_jump0:
- ext.16b v1, v1, v0, #12
- add $3.16b, v0.16b, v1.16b
+ bs_nzc_check_jump0:
+ ext.16b v1, v1, v0, #12
+ add $3.16b, v0.16b, v1.16b
- // Arrange the input data --- LEFT
- ands x6, $1, #1
- cbz x6, bs_nzc_check_jump1
+ // Arrange the input data --- LEFT
+ ands x6, $1, #1
+ cbz x6, bs_nzc_check_jump1
- sub x6, $0, #21
- add x7, x6, #4
- ld1 {v1.b} [12], [x6]
- add x6, x7, #4
- ld1 {v1.b} [13], [x7]
- add x7, x6, #4
- ld1 {v1.b} [14], [x6]
- ld1 {v1.b} [15], [x7]
+ sub x6, $0, #21
+ add x7, x6, #4
+ ld1 {v1.b} [12], [x6]
+ add x6, x7, #4
+ ld1 {v1.b} [13], [x7]
+ add x7, x6, #4
+ ld1 {v1.b} [14], [x6]
+ ld1 {v1.b} [15], [x7]
bs_nzc_check_jump1:
- ins v2.d[0], v0.d[1]
- zip1 v0.16b, v0.16b, v2.16b
- ins v2.d[0], v0.d[1]
- zip1 v0.16b, v0.16b, v2.16b
- ext.16b v1, v1, v0, #12
- add $4.16b, v0.16b, v1.16b
+ ins v2.d[0], v0.d[1]
+ zip1 v0.16b, v0.16b, v2.16b
+ ins v2.d[0], v0.d[1]
+ zip1 v0.16b, v0.16b, v2.16b
+ ext.16b v1, v1, v0, #12
+ add $4.16b, v0.16b, v1.16b
.endm
.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5
- mov w6, #4
- sabd v20.8h, $0.8h, $1.8h
- sabd v21.8h, $1.8h, $2.8h
- dup $0.8h, w6
- sabd v22.8h, $2.8h, $3.8h
- sabd v23.8h, $3.8h, $4.8h
+ mov w6, #4
+ sabd v20.8h, $0.8h, $1.8h
+ sabd v21.8h, $1.8h, $2.8h
+ dup $0.8h, w6
+ sabd v22.8h, $2.8h, $3.8h
+ sabd v23.8h, $3.8h, $4.8h
- cmge v20.8h, v20.8h, $0.8h
- cmge v21.8h, v21.8h, $0.8h
- cmge v22.8h, v22.8h, $0.8h
- cmge v23.8h, v23.8h, $0.8h
+ cmge v20.8h, v20.8h, $0.8h
+ cmge v21.8h, v21.8h, $0.8h
+ cmge v22.8h, v22.8h, $0.8h
+ cmge v23.8h, v23.8h, $0.8h
- addp v20.8h, v20.8h, v21.8h
- addp v21.8h, v22.8h, v23.8h
+ addp v20.8h, v20.8h, v21.8h
+ addp v21.8h, v22.8h, v23.8h
- addhn $5.8b, v20.8h, v20.8h
- addhn2 $5.16b, v21.8h, v21.8h
+ addhn $5.8b, v20.8h, v20.8h
+ addhn2 $5.16b, v21.8h, v21.8h
.endm
.macro BS_MV_CHECK
- ldp q0, q1, [$0], #32
- ldp q2, q3, [$0]
- sub $0, $0, #32
- // Arrange the input data --- TOP
- ands x6, $1, #2
- cbz x6, bs_mv_check_jump0
- sub x6, $0, $2, lsl #6
- add x6, x6, #48
- ld1 {v4.16b}, [x6]
+ ldp q0, q1, [$0], #32
+ ldp q2, q3, [$0]
+ sub $0, $0, #32
+    // Arrange the input data --- TOP
+ ands x6, $1, #2
+ cbz x6, bs_mv_check_jump0
+ sub x6, $0, $2, lsl #6
+ add x6, x6, #48
+ ld1 {v4.16b}, [x6]
bs_mv_check_jump0:
- BS_COMPARE_MV v4, v0, v1, v2, v3, $3
- // Arrange the input data --- LEFT
- ands x6, $1, #1
- cbz x6, bs_mv_check_jump1
- sub x6, $0, #52
- add x7, x6, #16
- ld1 {v4.s} [0], [x6]
- add x6, x7, #16
- ld1 {v4.s} [1], [x7]
- add x7, x6, #16
- ld1 {v4.s} [2], [x6]
- ld1 {v4.s} [3], [x7]
+ BS_COMPARE_MV v4, v0, v1, v2, v3, $3
+ // Arrange the input data --- LEFT
+ ands x6, $1, #1
+ cbz x6, bs_mv_check_jump1
+ sub x6, $0, #52
+ add x7, x6, #16
+ ld1 {v4.s} [0], [x6]
+ add x6, x7, #16
+ ld1 {v4.s} [1], [x7]
+ add x7, x6, #16
+ ld1 {v4.s} [2], [x6]
+ ld1 {v4.s} [3], [x7]
bs_mv_check_jump1:
- zip1 $5.4s, v0.4s, v2.4s
- zip2 $6.4s, v0.4s, v2.4s
- zip1 v0.4s, v1.4s, v3.4s
- zip2 v2.4s, v1.4s, v3.4s
- zip2 v1.4s, $5.4s, v0.4s
- zip1 v0.4s, $5.4s, v0.4s
- zip2 v3.4s, $6.4s, v2.4s
- zip1 v2.4s, $6.4s, v2.4s
- BS_COMPARE_MV v4, v0, v1, v2, v3, $4
+ zip1 $5.4s, v0.4s, v2.4s
+ zip2 $6.4s, v0.4s, v2.4s
+ zip1 v0.4s, v1.4s, v3.4s
+ zip2 v2.4s, v1.4s, v3.4s
+ zip2 v1.4s, $5.4s, v0.4s
+ zip1 v0.4s, $5.4s, v0.4s
+ zip2 v3.4s, $6.4s, v2.4s
+ zip1 v2.4s, $6.4s, v2.4s
+ BS_COMPARE_MV v4, v0, v1, v2, v3, $4
.endm
#else
-.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
- uabd \arg6.16b, \arg1.16b, \arg2.16b
- cmhi \arg6.16b, \arg4.16b, \arg6.16b
+.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
+ uabd \arg6\().16b, \arg1\().16b, \arg2\().16b
+ cmhi \arg6\().16b, \arg4\().16b, \arg6\().16b
- uabd \arg4.16b, \arg0.16b, \arg1.16b
- cmhi \arg4.16b, \arg5.16b, \arg4.16b
- and \arg6.16b, \arg6.16b, \arg4.16b
+ uabd \arg4\().16b, \arg0\().16b, \arg1\().16b
+ cmhi \arg4\().16b, \arg5\().16b, \arg4\().16b
+ and \arg6\().16b, \arg6\().16b, \arg4\().16b
- uabd \arg4.16b, \arg3.16b, \arg2.16b
- cmhi \arg4.16b, \arg5.16b, \arg4.16b
- and \arg6.16b, \arg6.16b, \arg4.16b
+ uabd \arg4\().16b, \arg3\().16b, \arg2\().16b
+ cmhi \arg4\().16b, \arg5\().16b, \arg4\().16b
+ and \arg6\().16b, \arg6\().16b, \arg4\().16b
.endm
-.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
- //v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
- urhadd \arg8.16b, \arg2.16b, \arg3.16b
- uhadd \arg8.16b, \arg0.16b, \arg8.16b
- usubl \arg9.8h, \arg8.8b, \arg1.8b
- sqxtn \arg9.8b, \arg9.8h
- usubl2 \arg8.8h, \arg8.16b, \arg1.16b
- sqxtn2 \arg9.16b, \arg8.8h
- smax \arg8.16b, \arg9.16b, \arg5.16b
- //
- smin \arg8.16b, \arg8.16b, \arg6.16b
- uabd \arg9.16b, \arg0.16b, \arg2.16b
- cmhi \arg9.16b, \arg4.16b, \arg9.16b
- and \arg8.16b, \arg8.16b, \arg9.16b
- and \arg8.16b, \arg8.16b, \arg7.16b
- add \arg8.16b, \arg1.16b, \arg8.16b
- abs \arg9.16b, \arg9.16b
+.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+ //v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
+ urhadd \arg8\().16b, \arg2\().16b, \arg3\().16b
+ uhadd \arg8\().16b, \arg0\().16b, \arg8\().16b
+ usubl \arg9\().8h, \arg8\().8b, \arg1\().8b
+ sqxtn \arg9\().8b, \arg9\().8h
+ usubl2 \arg8\().8h, \arg8\().16b, \arg1\().16b
+ sqxtn2 \arg9\().16b, \arg8\().8h
+ smax \arg8\().16b, \arg9\().16b, \arg5\().16b
+ //
+ smin \arg8\().16b, \arg8\().16b, \arg6\().16b
+ uabd \arg9\().16b, \arg0\().16b, \arg2\().16b
+ cmhi \arg9\().16b, \arg4\().16b, \arg9\().16b
+ and \arg8\().16b, \arg8\().16b, \arg9\().16b
+ and \arg8\().16b, \arg8\().16b, \arg7\().16b
+ add \arg8\().16b, \arg1\().16b, \arg8\().16b
+ abs \arg9\().16b, \arg9\().16b
.endm
-.macro DIFF_LUMA_LT4_P0_Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6
- usubl \arg5.8h, \arg0.8b, \arg3.8b
- usubl \arg6.8h, \arg2.8b, \arg1.8b
- shl \arg6.8h, \arg6.8h, #2
- add \arg5.8h, \arg5.8h, \arg6.8h
- sqrshrn \arg4.8b, \arg5.8h, #3
+.macro DIFF_LUMA_LT4_P0_Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+ usubl \arg5\().8h, \arg0\().8b, \arg3\().8b
+ usubl \arg6\().8h, \arg2\().8b, \arg1\().8b
+ shl \arg6\().8h, \arg6\().8h, #2
+ add \arg5\().8h, \arg5\().8h, \arg6\().8h
+ sqrshrn \arg4\().8b, \arg5\().8h, #3
.endm
-.macro DIFF_LUMA_LT4_P0_Q0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6
- usubl2 \arg5.8h, \arg0.16b, \arg3.16b
- usubl2 \arg6.8h, \arg2.16b, \arg1.16b
- shl \arg6.8h, \arg6.8h, #2
- add \arg5.8h, \arg5.8h, \arg6.8h
- sqrshrn2 \arg4.16b, \arg5.8h, #3
+.macro DIFF_LUMA_LT4_P0_Q0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+ usubl2 \arg5\().8h, \arg0\().16b, \arg3\().16b
+ usubl2 \arg6\().8h, \arg2\().16b, \arg1\().16b
+ shl \arg6\().8h, \arg6\().8h, #2
+ add \arg5\().8h, \arg5\().8h, \arg6\().8h
+ sqrshrn2 \arg4\().16b, \arg5\().8h, #3
.endm
-.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
- cmge \arg1.16b, \arg0.16b, #0
- and \arg1.16b, \arg0.16b, \arg1.16b
- sub \arg0.16b, \arg1.16b, \arg0.16b
+.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
+ cmge \arg1\().16b, \arg0\().16b, #0
+ and \arg1\().16b, \arg0\().16b, \arg1\().16b
+ sub \arg0\().16b, \arg1\().16b, \arg0\().16b
.endm
-.macro DIFF_LUMA_EQ4_P2P1P0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
- uaddl \arg8.8h, \arg1.8b, \arg2.8b
- uaddl \arg9.8h, \arg3.8b, \arg4.8b
- add \arg9.8h, \arg9.8h, \arg8.8h
+.macro DIFF_LUMA_EQ4_P2P1P0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+ uaddl \arg8\().8h, \arg1\().8b, \arg2\().8b
+ uaddl \arg9\().8h, \arg3\().8b, \arg4\().8b
+ add \arg9\().8h, \arg9\().8h, \arg8\().8h
- uaddl \arg8.8h, \arg0.8b, \arg1.8b
- shl \arg8.8h, \arg8.8h, #1
- add \arg8.8h, \arg9.8h, \arg8.8h
+ uaddl \arg8\().8h, \arg0\().8b, \arg1\().8b
+ shl \arg8\().8h, \arg8\().8h, #1
+ add \arg8\().8h, \arg9\().8h, \arg8\().8h
- rshrn \arg0.8b, \arg9.8h, #2
- rshrn \arg7.8b, \arg8.8h, #3
- shl \arg9.8h, \arg9.8h, #1
- usubl \arg8.8h, \arg5.8b, \arg1.8b
- add \arg9.8h, \arg8.8h, \arg9.8h
+ rshrn \arg0\().8b, \arg9\().8h, #2
+ rshrn \arg7\().8b, \arg8\().8h, #3
+ shl \arg9\().8h, \arg9\().8h, #1
+ usubl \arg8\().8h, \arg5\().8b, \arg1\().8b
+ add \arg9\().8h, \arg8\().8h, \arg9\().8h
- uaddl \arg8.8h, \arg2.8b, \arg5.8b
- uaddw \arg8.8h, \arg8.8h, \arg2.8b
- uaddw \arg8.8h, \arg8.8h, \arg3.8b
+ uaddl \arg8\().8h, \arg2\().8b, \arg5\().8b
+ uaddw \arg8\().8h, \arg8\().8h, \arg2\().8b
+ uaddw \arg8\().8h, \arg8\().8h, \arg3\().8b
- rshrn \arg9.8b, \arg9.8h, #3
- rshrn \arg8.8b, \arg8.8h, #2
- bsl \arg6.8b, \arg9.8b, \arg8.8b
+ rshrn \arg9\().8b, \arg9\().8h, #3
+ rshrn \arg8\().8b, \arg8\().8h, #2
+ bsl \arg6\().8b, \arg9\().8b, \arg8\().8b
.endm
-.macro DIFF_LUMA_EQ4_P2P1P0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
- uaddl2 \arg8.8h, \arg1.16b, \arg2.16b
- uaddl2 \arg9.8h, \arg3.16b, \arg4.16b
- add \arg9.8h, \arg9.8h, \arg8.8h
+.macro DIFF_LUMA_EQ4_P2P1P0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+ uaddl2 \arg8\().8h, \arg1\().16b, \arg2\().16b
+ uaddl2 \arg9\().8h, \arg3\().16b, \arg4\().16b
+ add \arg9\().8h, \arg9\().8h, \arg8\().8h
- uaddl2 \arg8.8h, \arg0.16b, \arg1.16b
- shl \arg8.8h, \arg8.8h, #1
- add \arg8.8h, \arg9.8h, \arg8.8h
+ uaddl2 \arg8\().8h, \arg0\().16b, \arg1\().16b
+ shl \arg8\().8h, \arg8\().8h, #1
+ add \arg8\().8h, \arg9\().8h, \arg8\().8h
- rshrn2 \arg0.16b, \arg9.8h, #2
- rshrn2 \arg7.16b, \arg8.8h, #3
- shl \arg9.8h, \arg9.8h, #1
- usubl2 \arg8.8h, \arg5.16b, \arg1.16b
- add \arg9.8h, \arg8.8h, \arg9.8h
+ rshrn2 \arg0\().16b, \arg9\().8h, #2
+ rshrn2 \arg7\().16b, \arg8\().8h, #3
+ shl \arg9\().8h, \arg9\().8h, #1
+ usubl2 \arg8\().8h, \arg5\().16b, \arg1\().16b
+ add \arg9\().8h, \arg8\().8h, \arg9\().8h
- uaddl2 \arg8.8h, \arg2.16b, \arg5.16b
- uaddw2 \arg8.8h, \arg8.8h, \arg2.16b
- uaddw2 \arg8.8h, \arg8.8h, \arg3.16b
+ uaddl2 \arg8\().8h, \arg2\().16b, \arg5\().16b
+ uaddw2 \arg8\().8h, \arg8\().8h, \arg2\().16b
+ uaddw2 \arg8\().8h, \arg8\().8h, \arg3\().16b
- rshrn2 \arg9.16b, \arg9.8h, #3
- rshrn2 \arg8.16b, \arg8.8h, #2
- bsl \arg6.16b, \arg9.16b, \arg8.16b
+ rshrn2 \arg9\().16b, \arg9\().8h, #3
+ rshrn2 \arg8\().16b, \arg8\().8h, #2
+ bsl \arg6\().16b, \arg9\().16b, \arg8\().16b
.endm
-.macro DIFF_CHROMA_EQ4_P0Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
- uaddl \arg4.8h, \arg0.8b, \arg3.8b
- shl \arg4.8h, \arg4.8h, #1
- usubl \arg5.8h, \arg1.8b, \arg3.8b
- add \arg5.8h, \arg5.8h, \arg4.8h
- rshrn \arg6.8b, \arg5.8h, #2
- usubl \arg5.8h, \arg2.8b, \arg0.8b
- add \arg5.8h, \arg5.8h, \arg4.8h
- rshrn \arg7.8b, \arg5.8h, #2
+.macro DIFF_CHROMA_EQ4_P0Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+ uaddl \arg4\().8h, \arg0\().8b, \arg3\().8b
+ shl \arg4\().8h, \arg4\().8h, #1
+ usubl \arg5\().8h, \arg1\().8b, \arg3\().8b
+ add \arg5\().8h, \arg5\().8h, \arg4\().8h
+ rshrn \arg6\().8b, \arg5\().8h, #2
+ usubl \arg5\().8h, \arg2\().8b, \arg0\().8b
+ add \arg5\().8h, \arg5\().8h, \arg4\().8h
+ rshrn \arg7\().8b, \arg5\().8h, #2
.endm
-.macro DIFF_CHROMA_EQ4_P0Q0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
- uaddl2 \arg4.8h, \arg0.16b, \arg3.16b
- shl \arg4.8h, \arg4.8h, #1
- usubl2 \arg5.8h, \arg1.16b, \arg3.16b
- add \arg5.8h, \arg5.8h, \arg4.8h
- rshrn2 \arg6.16b, \arg5.8h, #2
- usubl2 \arg5.8h, \arg2.16b, \arg0.16b
- add \arg5.8h, \arg5.8h, \arg4.8h
- rshrn2 \arg7.16b, \arg5.8h, #2
+.macro DIFF_CHROMA_EQ4_P0Q0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+ uaddl2 \arg4\().8h, \arg0\().16b, \arg3\().16b
+ shl \arg4\().8h, \arg4\().8h, #1
+ usubl2 \arg5\().8h, \arg1\().16b, \arg3\().16b
+ add \arg5\().8h, \arg5\().8h, \arg4\().8h
+ rshrn2 \arg6\().16b, \arg5\().8h, #2
+ usubl2 \arg5\().8h, \arg2\().16b, \arg0\().16b
+ add \arg5\().8h, \arg5\().8h, \arg4\().8h
+ rshrn2 \arg7\().16b, \arg5\().8h, #2
.endm
-.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
- mov.16b \arg3, \arg2
- bsl \arg3.16b, \arg0.16b, \arg1.16b
+.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
+ mov.16b \arg3, \arg2
+ bsl \arg3\().16b, \arg0\().16b, \arg1\().16b
.endm
-.macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
- ld3 {\arg0.b, \arg1.b, \arg2.b} [\arg6], [x2], x1
- ld3 {\arg3.b, \arg4.b, \arg5.b} [\arg6], [x0], x1
+.macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+ ld3 {\arg0\().b, \arg1\().b, \arg2\().b} [\arg6], [x2], x1
+ ld3 {\arg3\().b, \arg4\().b, \arg5\().b} [\arg6], [x0], x1
.endm
-.macro LOAD_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
- ld4 {\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg8], [x3], x1
- ld4 {\arg4.b, \arg5.b, \arg6.b, \arg7.b} [\arg8], [x0], x1
+.macro LOAD_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+ ld4 {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg8], [x3], x1
+ ld4 {\arg4\().b, \arg5\().b, \arg6\().b, \arg7\().b} [\arg8], [x0], x1
.endm
-.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
- st4 {\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg4], [x0], x1
- st4 {\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg5], [x2], x1
+.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
+ st4 {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg4], [x0], x1
+ st4 {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg5], [x2], x1
.endm
-.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
- st3 {\arg0.b, \arg1.b, \arg2.b} [\arg6], [x3], x1
- st3 {\arg3.b, \arg4.b, \arg5.b} [\arg6], [x0], x1
+.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+ st3 {\arg0\().b, \arg1\().b, \arg2\().b} [\arg6], [x3], x1
+ st3 {\arg3\().b, \arg4\().b, \arg5\().b} [\arg6], [x0], x1
.endm
-.macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
- ld4 {\arg0.b, \arg1.b, \arg2.b, \arg3.b} [\arg5], [\arg4], x2
+.macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
+ ld4 {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg5], [\arg4], x2
.endm
-.macro STORE_CHROMA_DATA_2 arg0, arg1, arg2, arg3
- st2 {\arg0.b, \arg1.b} [\arg3], [\arg2], x2
+.macro STORE_CHROMA_DATA_2 arg0, arg1, arg2, arg3
+ st2 {\arg0\().b, \arg1\().b} [\arg3], [\arg2], x2
.endm
-.macro ZERO_JUMP_END arg0, arg1, arg2, arg3
- mov \arg1, \arg0.d[0]
- mov \arg2, \arg0.d[1]
- orr \arg1, \arg1, \arg2
- cbz \arg1, \arg3
+.macro ZERO_JUMP_END arg0, arg1, arg2, arg3
+ mov \arg1, \arg0\().d[0]
+ mov \arg2, \arg0\().d[1]
+ orr \arg1, \arg1, \arg2
+ cbz \arg1, \arg3
.endm
.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
- ld1 {v0.16b}, [\arg0]
- //Arrange the input data --- TOP
- ands x6, \arg1, #2
- cbz x6, bs_nzc_check_jump0
- sub x6, \arg0, \arg2, lsl #4
- sub x6, x6, \arg2, lsl #3
- add x6, x6, #12
- ld1 {v1.s} [3], [x6]
+ ld1 {v0.16b}, [\arg0]
+ //Arrange the input data --- TOP
+ ands x6, \arg1, #2
+ cbz x6, bs_nzc_check_jump0
+ sub x6, \arg0, \arg2, lsl #4
+ sub x6, x6, \arg2, lsl #3
+ add x6, x6, #12
+ ld1 {v1.s} [3], [x6]
bs_nzc_check_jump0:
- ext.16b v1, v1, v0, #12
- add \arg3.16b, v0.16b, v1.16b
+ ext.16b v1, v1, v0, #12
+ add \arg3\().16b, v0.16b, v1.16b
- // Arrange the input data --- LEFT
- ands x6, \arg1, #1
- cbz x6, bs_nzc_check_jump1
+ // Arrange the input data --- LEFT
+ ands x6, \arg1, #1
+ cbz x6, bs_nzc_check_jump1
- sub x6, \arg0, #21
- add x7, x6, #4
- ld1 {v1.b} [12], [x6]
- add x6, x7, #4
- ld1 {v1.b} [13], [x7]
- add x7, x6, #4
- ld1 {v1.b} [14], [x6]
- ld1 {v1.b} [15], [x7]
+ sub x6, \arg0, #21
+ add x7, x6, #4
+ ld1 {v1.b} [12], [x6]
+ add x6, x7, #4
+ ld1 {v1.b} [13], [x7]
+ add x7, x6, #4
+ ld1 {v1.b} [14], [x6]
+ ld1 {v1.b} [15], [x7]
bs_nzc_check_jump1:
- ins v2.d[0], v0.d[1]
- zip1 v0.16b, v0.16b, v2.16b
- ins v2.d[0], v0.d[1]
- zip1 v0.16b, v0.16b, v2.16b
- ext.16b v1, v1, v0, #12
- add \arg4.16b, v0.16b, v1.16b
+ ins v2.d[0], v0.d[1]
+ zip1 v0.16b, v0.16b, v2.16b
+ ins v2.d[0], v0.d[1]
+ zip1 v0.16b, v0.16b, v2.16b
+ ext.16b v1, v1, v0, #12
+ add \arg4\().16b, v0.16b, v1.16b
.endm
.macro BS_COMPARE_MV arg0, arg1, arg2, arg3, arg4, arg5
- //in: \arg0,\arg1(const),\arg2(const),\arg3(const),\arg4(const); out:\arg5
- mov w6, #4
- sabd v20.8h, \arg0.8h, \arg1.8h
- sabd v21.8h, \arg1.8h, \arg2.8h
- dup \arg0.8h, w6
- sabd v22.8h, \arg2.8h, \arg3.8h
- sabd v23.8h, \arg3.8h, \arg4.8h
+ //in: \arg0,\arg1(const),\arg2(const),\arg3(const),\arg4(const); out:\arg5
+ mov w6, #4
+ sabd v20.8h, \arg0\().8h, \arg1\().8h
+ sabd v21.8h, \arg1\().8h, \arg2\().8h
+ dup \arg0\().8h, w6
+ sabd v22.8h, \arg2\().8h, \arg3\().8h
+ sabd v23.8h, \arg3\().8h, \arg4\().8h
- cmge v20.8h, v20.8h, \arg0.8h
- cmge v21.8h, v21.8h, \arg0.8h
- cmge v22.8h, v22.8h, \arg0.8h
- cmge v23.8h, v23.8h, \arg0.8h
+ cmge v20.8h, v20.8h, \arg0\().8h
+ cmge v21.8h, v21.8h, \arg0\().8h
+ cmge v22.8h, v22.8h, \arg0\().8h
+ cmge v23.8h, v23.8h, \arg0\().8h
- addp v20.8h, v20.8h, v21.8h
- addp v21.8h, v22.8h, v23.8h
+ addp v20.8h, v20.8h, v21.8h
+ addp v21.8h, v22.8h, v23.8h
- addhn \arg5.8b, v20.8h, v20.8h
- addhn2 \arg5.16b, v21.8h, v21.8h
+ addhn \arg5\().8b, v20.8h, v20.8h
+ addhn2 \arg5\().16b, v21.8h, v21.8h
.endm
.macro BS_MV_CHECK arg0, arg1, arg2, arg3, arg4, arg5, arg6
- ldp q0, q1, [\arg0], #32
- ldp q2, q3, [\arg0]
- sub \arg0, \arg0, #32
- // Arrange the input data --- TOP
- ands x6, \arg1, #2
- cbz x6, bs_mv_check_jump0
- sub x6, \arg0, \arg2, lsl #6
- add x6, x6, #48
- ld1 {v4.16b}, [x6]
+ ldp q0, q1, [\arg0], #32
+ ldp q2, q3, [\arg0]
+ sub \arg0, \arg0, #32
+    // Arrange the input data --- TOP
+ ands x6, \arg1, #2
+ cbz x6, bs_mv_check_jump0
+ sub x6, \arg0, \arg2, lsl #6
+ add x6, x6, #48
+ ld1 {v4.16b}, [x6]
bs_mv_check_jump0:
- BS_COMPARE_MV v4, v0, v1, v2, v3, \arg3
- // Arrange the input data --- LEFT
- ands x6, \arg1, #1
- cbz x6, bs_mv_check_jump1
- sub x6, \arg0, #52
- add x7, x6, #16
- ld1 {v4.s} [0], [x6]
- add x6, x7, #16
- ld1 {v4.s} [1], [x7]
- add x7, x6, #16
- ld1 {v4.s} [2], [x6]
- ld1 {v4.s} [3], [x7]
+ BS_COMPARE_MV v4, v0, v1, v2, v3, \arg3
+ // Arrange the input data --- LEFT
+ ands x6, \arg1, #1
+ cbz x6, bs_mv_check_jump1
+ sub x6, \arg0, #52
+ add x7, x6, #16
+ ld1 {v4.s} [0], [x6]
+ add x6, x7, #16
+ ld1 {v4.s} [1], [x7]
+ add x7, x6, #16
+ ld1 {v4.s} [2], [x6]
+ ld1 {v4.s} [3], [x7]
bs_mv_check_jump1:
- zip1 \arg5.4s, v0.4s, v2.4s
- zip2 \arg6.4s, v0.4s, v2.4s
- zip1 v0.4s, v1.4s, v3.4s
- zip2 v2.4s, v1.4s, v3.4s
- zip2 v1.4s, \arg5.4s, v0.4s
- zip1 v0.4s, \arg5.4s, v0.4s
- zip2 v3.4s, \arg6.4s, v2.4s
- zip1 v2.4s, \arg6.4s, v2.4s
- BS_COMPARE_MV v4, v0, v1, v2, v3, \arg4
+ zip1 \arg5\().4s, v0.4s, v2.4s
+ zip2 \arg6\().4s, v0.4s, v2.4s
+ zip1 v0.4s, v1.4s, v3.4s
+ zip2 v2.4s, v1.4s, v3.4s
+ zip2 v1.4s, \arg5\().4s, v0.4s
+ zip1 v0.4s, \arg5\().4s, v0.4s
+ zip2 v3.4s, \arg6\().4s, v2.4s
+ zip1 v2.4s, \arg6\().4s, v2.4s
+ BS_COMPARE_MV v4, v0, v1, v2, v3, \arg4
.endm
#endif
WELS_ASM_ARCH64_FUNC_BEGIN WelsNonZeroCount_AArch64_neon
- ld1 {v0.8b, v1.8b, v2.8b}, [x0]
- ins v0.d[1], v1.d[0]
- uzp1 v0.2d, v0.2d, v1.2d
- cmeq v0.16b, v0.16b, #0
- cmeq v2.8b, v2.8b, #0
- mvn v0.16b, v0.16b
- mvn v2.8b, v2.8b
- abs v0.16b, v0.16b
- abs v2.8b, v2.8b
- ins v1.d[0], v0.d[1]
- st1 {v0.8b, v1.8b, v2.8b}, [x0]
+ ld1 {v0.8b, v1.8b, v2.8b}, [x0]
+ ins v0.d[1], v1.d[0]
+ uzp1 v0.2d, v0.2d, v1.2d
+ cmeq v0.16b, v0.16b, #0
+ cmeq v2.8b, v2.8b, #0
+ mvn v0.16b, v0.16b
+ mvn v2.8b, v2.8b
+ abs v0.16b, v0.16b
+ abs v2.8b, v2.8b
+ ins v1.d[0], v0.d[1]
+ st1 {v0.8b, v1.8b, v2.8b}, [x0]
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaLt4V_AArch64_neon //uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc
- dup v16.16b, w2 //alpha
- dup v17.16b, w3 //beta
- add x2, x1, x1, lsl #1
- sub x2, x0, x2
- movi v23.16b, #128
- ld1 {v0.16b}, [x2], x1
- ld1 {v1.16b}, [x2], x1
- ld1 {v2.16b}, [x2]
- ld1 {v3.16b}, [x0], x1
- ld1 {v4.16b}, [x0], x1
- ld1 {v5.16b}, [x0]
- sub x2, x2, x1
- ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x4]
- trn1 v18.2s, v18.2s, v19.2s
- trn1 v20.2s, v20.2s, v21.2s
- trn1 v6.2d, v18.2d, v20.2d // iTc0: 0000, 1111, 2222, 3333
- cmge v7.16b, v6.16b, #0 // iTc0 Flag
+ dup v16.16b, w2 //alpha
+ dup v17.16b, w3 //beta
+ add x2, x1, x1, lsl #1
+ sub x2, x0, x2
+ movi v23.16b, #128
+ ld1 {v0.16b}, [x2], x1
+ ld1 {v1.16b}, [x2], x1
+ ld1 {v2.16b}, [x2]
+ ld1 {v3.16b}, [x0], x1
+ ld1 {v4.16b}, [x0], x1
+ ld1 {v5.16b}, [x0]
+ sub x2, x2, x1
+ ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x4]
+ trn1 v18.2s, v18.2s, v19.2s
+ trn1 v20.2s, v20.2s, v21.2s
+ trn1 v6.2d, v18.2d, v20.2d // iTc0: 0000, 1111, 2222, 3333
+ cmge v7.16b, v6.16b, #0 // iTc0 Flag
- MASK_MATRIX v1, v2, v3, v4, v16, v17, v18
- and v7.16b, v7.16b, v18.16b // need filter flag
+ MASK_MATRIX v1, v2, v3, v4, v16, v17, v18
+ and v7.16b, v7.16b, v18.16b // need filter flag
- ZERO_JUMP_END v7, x3, x4, DeblockLumaLt4V_AArch64_neon_end
+ ZERO_JUMP_END v7, x3, x4, DeblockLumaLt4V_AArch64_neon_end
- eor v18.16b, v18.16b, v18.16b
- sub v18.16b, v18.16b, v6.16b // -iTc0: 0000, 1111, 2222, 3333
+ eor v18.16b, v18.16b, v18.16b
+ sub v18.16b, v18.16b, v6.16b // -iTc0: 0000, 1111, 2222, 3333
- DIFF_LUMA_LT4_P1_Q1 v0, v1, v2, v3, v17, v18, v6, v7, v19, v20
- st1 {v19.16b}, [x2], x1
+ DIFF_LUMA_LT4_P1_Q1 v0, v1, v2, v3, v17, v18, v6, v7, v19, v20
+ st1 {v19.16b}, [x2], x1
- DIFF_LUMA_LT4_P1_Q1 v5, v4, v3, v2, v17, v18, v6, v7, v21, v22
+ DIFF_LUMA_LT4_P1_Q1 v5, v4, v3, v2, v17, v18, v6, v7, v21, v22
- abs v20.16b, v20.16b
- abs v22.16b, v22.16b
- add v6.16b, v6.16b, v20.16b
- add v6.16b, v6.16b, v22.16b
- eor v18.16b, v18.16b, v18.16b
- sub v18.16b, v18.16b, v6.16b
+ abs v20.16b, v20.16b
+ abs v22.16b, v22.16b
+ add v6.16b, v6.16b, v20.16b
+ add v6.16b, v6.16b, v22.16b
+ eor v18.16b, v18.16b, v18.16b
+ sub v18.16b, v18.16b, v6.16b
- DIFF_LUMA_LT4_P0_Q0_1 v1, v2, v3, v4, v19, v20, v22
- DIFF_LUMA_LT4_P0_Q0_2 v1, v2, v3, v4, v19, v20, v22
+ DIFF_LUMA_LT4_P0_Q0_1 v1, v2, v3, v4, v19, v20, v22
+ DIFF_LUMA_LT4_P0_Q0_2 v1, v2, v3, v4, v19, v20, v22
- smax v19.16b, v19.16b, v18.16b
- smin v19.16b, v19.16b, v6.16b
- and v19.16b, v19.16b, v7.16b
+ smax v19.16b, v19.16b, v18.16b
+ smin v19.16b, v19.16b, v6.16b
+ and v19.16b, v19.16b, v7.16b
- EXTRACT_DELTA_INTO_TWO_PART v19, v20
- uqadd v2.16b, v2.16b, v20.16b
- uqsub v2.16b, v2.16b, v19.16b
- st1 {v2.16b}, [x2], x1
- uqsub v3.16b, v3.16b, v20.16b
- uqadd v3.16b, v3.16b, v19.16b
- st1 {v3.16b}, [x2], x1
- st1 {v21.16b}, [x2]
+ EXTRACT_DELTA_INTO_TWO_PART v19, v20
+ uqadd v2.16b, v2.16b, v20.16b
+ uqsub v2.16b, v2.16b, v19.16b
+ st1 {v2.16b}, [x2], x1
+ uqsub v3.16b, v3.16b, v20.16b
+ uqadd v3.16b, v3.16b, v19.16b
+ st1 {v3.16b}, [x2], x1
+ st1 {v21.16b}, [x2]
DeblockLumaLt4V_AArch64_neon_end:
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaEq4V_AArch64_neon
- dup v16.16b, w2 //alpha
- dup v17.16b, w3 //beta
- sub x3, x0, x1, lsl #2
+ dup v16.16b, w2 //alpha
+ dup v17.16b, w3 //beta
+ sub x3, x0, x1, lsl #2
- ld1 {v0.16b}, [x3], x1
- ld1 {v4.16b}, [x0], x1
- ld1 {v1.16b}, [x3], x1
- ld1 {v5.16b}, [x0], x1
- ld1 {v2.16b}, [x3], x1
- ld1 {v6.16b}, [x0], x1
- ld1 {v3.16b}, [x3]
- ld1 {v7.16b}, [x0]
+ ld1 {v0.16b}, [x3], x1
+ ld1 {v4.16b}, [x0], x1
+ ld1 {v1.16b}, [x3], x1
+ ld1 {v5.16b}, [x0], x1
+ ld1 {v2.16b}, [x3], x1
+ ld1 {v6.16b}, [x0], x1
+ ld1 {v3.16b}, [x3]
+ ld1 {v7.16b}, [x0]
- sub x3, x3, x1, lsl #1
- MASK_MATRIX v2, v3, v4, v5, v16, v17, v18
- lsr w2, w2, #2
- add w2, w2, #2
- dup v16.16b, w2 //((alpha >> 2) + 2)
- uabd v19.16b, v3.16b, v4.16b
- cmhi v20.16b, v16.16b, v19.16b //iDetaP0Q0 < ((iAlpha >> 2) + 2)
+ sub x3, x3, x1, lsl #1
+ MASK_MATRIX v2, v3, v4, v5, v16, v17, v18
+ lsr w2, w2, #2
+ add w2, w2, #2
+ dup v16.16b, w2 //((alpha >> 2) + 2)
+ uabd v19.16b, v3.16b, v4.16b
+ cmhi v20.16b, v16.16b, v19.16b //iDetaP0Q0 < ((iAlpha >> 2) + 2)
- uabd v21.16b, v1.16b, v3.16b
- cmhi v21.16b, v17.16b, v21.16b //bDetaP2P0
- and v21.16b, v21.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaP2P0
+ uabd v21.16b, v1.16b, v3.16b
+ cmhi v21.16b, v17.16b, v21.16b //bDetaP2P0
+ and v21.16b, v21.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaP2P0
- uabd v22.16b, v6.16b, v4.16b
- cmhi v22.16b, v17.16b, v22.16b //bDetaQ2Q0
- and v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaQ2Q0
- and v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0&&(iDetaP0Q0 < ((iAlpha >> 2) + 2))
+ uabd v22.16b, v6.16b, v4.16b
+ cmhi v22.16b, v17.16b, v22.16b //bDetaQ2Q0
+ and v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaQ2Q0
+ and v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0&&(iDetaP0Q0 < ((iAlpha >> 2) + 2))
- mov.16b v23, v21
- mov.16b v24, v21
+ mov.16b v23, v21
+ mov.16b v24, v21
- mov.16b v25, v0
- DIFF_LUMA_EQ4_P2P1P0_1 v0, v1, v2, v3, v4, v5, v23, v19, v17, v16
- DIFF_LUMA_EQ4_P2P1P0_2 v25, v1, v2, v3, v4, v5, v24, v19, v17, v16
- ins v0.d[1], v25.d[1]
- ins v23.d[1], v24.d[1]
- and v21.16b, v20.16b, v21.16b
- DIFF_LUMA_EQ4_MASK v19, v1, v21, v17
- st1 {v17.16b}, [x3], x1
- DIFF_LUMA_EQ4_MASK v0, v2, v21, v17
- st1 {v17.16b}, [x3], x1
- DIFF_LUMA_EQ4_MASK v23, v3, v18, v17
- st1 {v17.16b}, [x3], x1
+ mov.16b v25, v0
+ DIFF_LUMA_EQ4_P2P1P0_1 v0, v1, v2, v3, v4, v5, v23, v19, v17, v16
+ DIFF_LUMA_EQ4_P2P1P0_2 v25, v1, v2, v3, v4, v5, v24, v19, v17, v16
+ ins v0.d[1], v25.d[1]
+ ins v23.d[1], v24.d[1]
+ and v21.16b, v20.16b, v21.16b
+ DIFF_LUMA_EQ4_MASK v19, v1, v21, v17
+ st1 {v17.16b}, [x3], x1
+ DIFF_LUMA_EQ4_MASK v0, v2, v21, v17
+ st1 {v17.16b}, [x3], x1
+ DIFF_LUMA_EQ4_MASK v23, v3, v18, v17
+ st1 {v17.16b}, [x3], x1
- mov.16b v23, v22
- mov.16b v24, v22
- mov.16b v25, v7
- DIFF_LUMA_EQ4_P2P1P0_1 v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
- DIFF_LUMA_EQ4_P2P1P0_2 v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
- ins v7.d[1], v25.d[1]
- ins v23.d[1], v24.d[1]
- and v22.16b, v20.16b, v22.16b
- DIFF_LUMA_EQ4_MASK v23, v4, v18, v17
- st1 {v17.16b}, [x3], x1
- DIFF_LUMA_EQ4_MASK v7, v5, v22, v17
- st1 {v17.16b}, [x3], x1
- DIFF_LUMA_EQ4_MASK v19, v6, v22, v17
- st1 {v17.16b}, [x3], x1
+ mov.16b v23, v22
+ mov.16b v24, v22
+ mov.16b v25, v7
+ DIFF_LUMA_EQ4_P2P1P0_1 v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
+ DIFF_LUMA_EQ4_P2P1P0_2 v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
+ ins v7.d[1], v25.d[1]
+ ins v23.d[1], v24.d[1]
+ and v22.16b, v20.16b, v22.16b
+ DIFF_LUMA_EQ4_MASK v23, v4, v18, v17
+ st1 {v17.16b}, [x3], x1
+ DIFF_LUMA_EQ4_MASK v7, v5, v22, v17
+ st1 {v17.16b}, [x3], x1
+ DIFF_LUMA_EQ4_MASK v19, v6, v22, v17
+ st1 {v17.16b}, [x3], x1
DeblockLumaEq4V_AArch64_neon_end:
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaLt4H_AArch64_neon //uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc
- dup v16.16b, w2 //alpha
- dup v17.16b, w3 //beta
- sub x2, x0, #3
- movi v23.16b, #128
+ dup v16.16b, w2 //alpha
+ dup v17.16b, w3 //beta
+ sub x2, x0, #3
+ movi v23.16b, #128
- LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 0
- LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 1
- LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 2
- LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 3
- LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 4
- LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 5
- LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 6
- LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 7
+ LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 0
+ LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 1
+ LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 2
+ LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 3
+ LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 4
+ LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 5
+ LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 6
+ LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 7
- LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 8
- LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 9
- LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 10
- LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 11
- LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 12
- LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 13
- LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 14
- LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 15
+ LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 8
+ LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 9
+ LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 10
+ LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 11
+ LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 12
+ LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 13
+ LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 14
+ LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 15
- sub x0, x0, x1, lsl #4
+ sub x0, x0, x1, lsl #4
- ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x4]
- trn1 v18.2s, v18.2s, v19.2s
- trn1 v20.2s, v20.2s, v21.2s
- trn1 v6.2d, v18.2d, v20.2d // iTc0: 0000, 1111, 2222, 3333
- cmge v7.16b, v6.16b, #0 // iTc0 Flag
+ ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x4]
+ trn1 v18.2s, v18.2s, v19.2s
+ trn1 v20.2s, v20.2s, v21.2s
+ trn1 v6.2d, v18.2d, v20.2d // iTc0: 0000, 1111, 2222, 3333
+ cmge v7.16b, v6.16b, #0 // iTc0 Flag
- MASK_MATRIX v1, v2, v3, v4, v16, v17, v18
- and v7.16b, v7.16b, v18.16b // need filter flag
+ MASK_MATRIX v1, v2, v3, v4, v16, v17, v18
+ and v7.16b, v7.16b, v18.16b // need filter flag
- ZERO_JUMP_END v7, x3, x4, DeblockLumaLt4H_AArch64_neon_end
+ ZERO_JUMP_END v7, x3, x4, DeblockLumaLt4H_AArch64_neon_end
- eor v18.16b, v18.16b, v18.16b
- sub v18.16b, v18.16b, v6.16b // -iTc0: 0000, 1111, 2222, 3333
+ eor v18.16b, v18.16b, v18.16b
+ sub v18.16b, v18.16b, v6.16b // -iTc0: 0000, 1111, 2222, 3333
- DIFF_LUMA_LT4_P1_Q1 v0, v1, v2, v3, v17, v18, v6, v7, v19, v20 //Use Tmp v23,v24
- mov.16b v25, v19
+ DIFF_LUMA_LT4_P1_Q1 v0, v1, v2, v3, v17, v18, v6, v7, v19, v20 //Use Tmp v23,v24
+ mov.16b v25, v19
- DIFF_LUMA_LT4_P1_Q1 v5, v4, v3, v2, v17, v18, v6, v7, v21, v22 //Use Tmp v23,v24
+ DIFF_LUMA_LT4_P1_Q1 v5, v4, v3, v2, v17, v18, v6, v7, v21, v22 //Use Tmp v23,v24
- abs v20.16b, v20.16b
- abs v22.16b, v22.16b
- add v6.16b, v6.16b, v20.16b
- add v6.16b, v6.16b, v22.16b
- eor v18.16b, v18.16b, v18.16b
- sub v18.16b, v18.16b, v6.16b
+ abs v20.16b, v20.16b
+ abs v22.16b, v22.16b
+ add v6.16b, v6.16b, v20.16b
+ add v6.16b, v6.16b, v22.16b
+ eor v18.16b, v18.16b, v18.16b
+ sub v18.16b, v18.16b, v6.16b
- DIFF_LUMA_LT4_P0_Q0_1 v1, v2, v3, v4, v19, v20, v22
- DIFF_LUMA_LT4_P0_Q0_2 v1, v2, v3, v4, v19, v20, v22
+ DIFF_LUMA_LT4_P0_Q0_1 v1, v2, v3, v4, v19, v20, v22
+ DIFF_LUMA_LT4_P0_Q0_2 v1, v2, v3, v4, v19, v20, v22
- smax v19.16b, v19.16b, v18.16b
- smin v19.16b, v19.16b, v6.16b
- and v19.16b, v19.16b, v7.16b
+ smax v19.16b, v19.16b, v18.16b
+ smin v19.16b, v19.16b, v6.16b
+ and v19.16b, v19.16b, v7.16b
- EXTRACT_DELTA_INTO_TWO_PART v19, v20
- uqadd v2.16b, v2.16b, v20.16b
- uqsub v2.16b, v2.16b, v19.16b
- mov.16b v26, v2
- uqsub v3.16b, v3.16b, v20.16b
- uqadd v3.16b, v3.16b, v19.16b
- mov.16b v27, v3
- mov.16b v28, v21
+ EXTRACT_DELTA_INTO_TWO_PART v19, v20
+ uqadd v2.16b, v2.16b, v20.16b
+ uqsub v2.16b, v2.16b, v19.16b
+ mov.16b v26, v2
+ uqsub v3.16b, v3.16b, v20.16b
+ uqadd v3.16b, v3.16b, v19.16b
+ mov.16b v27, v3
+ mov.16b v28, v21
- sub x0, x0, #2
- add x2, x0, x1
- lsl x1, x1, #1
+ sub x0, x0, #2
+ add x2, x0, x1
+ lsl x1, x1, #1
- STORE_LUMA_DATA_4 v25, v26, v27, v28, 0, 1
- STORE_LUMA_DATA_4 v25, v26, v27, v28, 2, 3
- STORE_LUMA_DATA_4 v25, v26, v27, v28, 4, 5
- STORE_LUMA_DATA_4 v25, v26, v27, v28, 6, 7
+ STORE_LUMA_DATA_4 v25, v26, v27, v28, 0, 1
+ STORE_LUMA_DATA_4 v25, v26, v27, v28, 2, 3
+ STORE_LUMA_DATA_4 v25, v26, v27, v28, 4, 5
+ STORE_LUMA_DATA_4 v25, v26, v27, v28, 6, 7
- STORE_LUMA_DATA_4 v25, v26, v27, v28, 8, 9
- STORE_LUMA_DATA_4 v25, v26, v27, v28, 10, 11
- STORE_LUMA_DATA_4 v25, v26, v27, v28, 12, 13
- STORE_LUMA_DATA_4 v25, v26, v27, v28, 14, 15
+ STORE_LUMA_DATA_4 v25, v26, v27, v28, 8, 9
+ STORE_LUMA_DATA_4 v25, v26, v27, v28, 10, 11
+ STORE_LUMA_DATA_4 v25, v26, v27, v28, 12, 13
+ STORE_LUMA_DATA_4 v25, v26, v27, v28, 14, 15
DeblockLumaLt4H_AArch64_neon_end:
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN DeblockLumaEq4H_AArch64_neon
- dup v16.16b, w2 //alpha
- dup v17.16b, w3 //beta
- sub x3, x0, #4
+ dup v16.16b, w2 //alpha
+ dup v17.16b, w3 //beta
+ sub x3, x0, #4
- LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 0
- LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 1
- LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 2
- LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 3
- LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 4
- LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 5
- LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 6
- LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 7
+ LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 0
+ LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 1
+ LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 2
+ LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 3
+ LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 4
+ LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 5
+ LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 6
+ LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 7
- LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 8
- LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 9
- LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 10
- LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 11
- LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 12
- LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 13
- LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 14
- LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 15
+ LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 8
+ LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 9
+ LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 10
+ LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 11
+ LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 12
+ LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 13
+ LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 14
+ LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 15
- sub x0, x0, x1, lsl #4
- sub x3, x0, #3
- MASK_MATRIX v2, v3, v4, v5, v16, v17, v18
+ sub x0, x0, x1, lsl #4
+ sub x3, x0, #3
+ MASK_MATRIX v2, v3, v4, v5, v16, v17, v18
- ZERO_JUMP_END v18, x4, x5, DeblockLumaEq4H_AArch64_neon_end
+ ZERO_JUMP_END v18, x4, x5, DeblockLumaEq4H_AArch64_neon_end
- lsr w2, w2, #2
- add w2, w2, #2
- dup v16.16b, w2 //((alpha >> 2) + 2)
- uabd v19.16b, v3.16b, v4.16b
- cmhi v20.16b, v16.16b, v19.16b //iDetaP0Q0 < ((iAlpha >> 2) + 2)
+ lsr w2, w2, #2
+ add w2, w2, #2
+ dup v16.16b, w2 //((alpha >> 2) + 2)
+ uabd v19.16b, v3.16b, v4.16b
+ cmhi v20.16b, v16.16b, v19.16b //iDetaP0Q0 < ((iAlpha >> 2) + 2)
- uabd v21.16b, v1.16b, v3.16b
- cmhi v21.16b, v17.16b, v21.16b //bDetaP2P0
- and v21.16b, v21.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaP2P0
+ uabd v21.16b, v1.16b, v3.16b
+ cmhi v21.16b, v17.16b, v21.16b //bDetaP2P0
+ and v21.16b, v21.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaP2P0
- uabd v22.16b, v6.16b, v4.16b
- cmhi v22.16b, v17.16b, v22.16b //bDetaQ2Q0
- and v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaQ2Q0
- and v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0&&(iDetaP0Q0 < ((iAlpha >> 2) + 2))
+ uabd v22.16b, v6.16b, v4.16b
+ cmhi v22.16b, v17.16b, v22.16b //bDetaQ2Q0
+ and v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaQ2Q0
+ and v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0&&(iDetaP0Q0 < ((iAlpha >> 2) + 2))
- mov.16b v23, v21
- mov.16b v24, v21
+ mov.16b v23, v21
+ mov.16b v24, v21
- mov.16b v25, v0
- DIFF_LUMA_EQ4_P2P1P0_1 v0, v1, v2, v3, v4, v5, v23, v19, v17, v16
- DIFF_LUMA_EQ4_P2P1P0_2 v25, v1, v2, v3, v4, v5, v24, v19, v17, v16
- ins v0.d[1], v25.d[1]
- ins v23.d[1], v24.d[1]
- and v21.16b, v20.16b, v21.16b
- DIFF_LUMA_EQ4_MASK v19, v1, v21, v17
- mov.16b v26, v17
- DIFF_LUMA_EQ4_MASK v0, v2, v21, v17
- mov.16b v27, v17
- DIFF_LUMA_EQ4_MASK v23, v3, v18, v17
- mov.16b v28, v17
+ mov.16b v25, v0
+ DIFF_LUMA_EQ4_P2P1P0_1 v0, v1, v2, v3, v4, v5, v23, v19, v17, v16
+ DIFF_LUMA_EQ4_P2P1P0_2 v25, v1, v2, v3, v4, v5, v24, v19, v17, v16
+ ins v0.d[1], v25.d[1]
+ ins v23.d[1], v24.d[1]
+ and v21.16b, v20.16b, v21.16b
+ DIFF_LUMA_EQ4_MASK v19, v1, v21, v17
+ mov.16b v26, v17
+ DIFF_LUMA_EQ4_MASK v0, v2, v21, v17
+ mov.16b v27, v17
+ DIFF_LUMA_EQ4_MASK v23, v3, v18, v17
+ mov.16b v28, v17
- mov.16b v23, v22
- mov.16b v24, v22
- mov.16b v25, v7
- DIFF_LUMA_EQ4_P2P1P0_1 v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
- DIFF_LUMA_EQ4_P2P1P0_2 v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
- ins v7.d[1], v25.d[1]
- ins v23.d[1], v24.d[1]
- and v22.16b, v20.16b, v22.16b
- DIFF_LUMA_EQ4_MASK v23, v4, v18, v17
- mov.16b v29, v17
- DIFF_LUMA_EQ4_MASK v7, v5, v22, v17
- mov.16b v30, v17
- DIFF_LUMA_EQ4_MASK v19, v6, v22, v17
- mov.16b v31, v17
+ mov.16b v23, v22
+ mov.16b v24, v22
+ mov.16b v25, v7
+ DIFF_LUMA_EQ4_P2P1P0_1 v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
+ DIFF_LUMA_EQ4_P2P1P0_2 v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
+ ins v7.d[1], v25.d[1]
+ ins v23.d[1], v24.d[1]
+ and v22.16b, v20.16b, v22.16b
+ DIFF_LUMA_EQ4_MASK v23, v4, v18, v17
+ mov.16b v29, v17
+ DIFF_LUMA_EQ4_MASK v7, v5, v22, v17
+ mov.16b v30, v17
+ DIFF_LUMA_EQ4_MASK v19, v6, v22, v17
+ mov.16b v31, v17
- STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 0
- STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 1
- STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 2
- STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 3
- STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 4
- STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 5
- STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 6
- STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 7
- STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 8
- STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 9
- STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 10
- STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 11
- STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 12
- STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 13
- STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 14
- STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 15
+ STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 0
+ STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 1
+ STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 2
+ STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 3
+ STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 4
+ STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 5
+ STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 6
+ STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 7
+ STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 8
+ STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 9
+ STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 10
+ STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 11
+ STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 12
+ STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 13
+ STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 14
+ STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 15
DeblockLumaEq4H_AArch64_neon_end:
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaLt4V_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc
- dup v16.16b, w3 //alpha
- dup v17.16b, w4 //beta
- lsl x3, x2, #1
- sub x6, x0, x3 //pPixCb-2*Stride
- sub x7, x1, x3 //pPixCr-2*Stride
+ dup v16.16b, w3 //alpha
+ dup v17.16b, w4 //beta
+ lsl x3, x2, #1
+ sub x6, x0, x3 //pPixCb-2*Stride
+ sub x7, x1, x3 //pPixCr-2*Stride
- ld1 {v0.d} [0], [x6], x2
- ld1 {v1.d} [0], [x6]
- ld1 {v2.d} [0], [x0], x2
- ld1 {v3.d} [0], [x0]
- ld1 {v0.d} [1], [x7], x2
- ld1 {v1.d} [1], [x7]
- ld1 {v2.d} [1], [x1], x2
- ld1 {v3.d} [1], [x1]
+ ld1 {v0.d} [0], [x6], x2
+ ld1 {v1.d} [0], [x6]
+ ld1 {v2.d} [0], [x0], x2
+ ld1 {v3.d} [0], [x0]
+ ld1 {v0.d} [1], [x7], x2
+ ld1 {v1.d} [1], [x7]
+ ld1 {v2.d} [1], [x1], x2
+ ld1 {v3.d} [1], [x1]
- ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x5]
- trn1 v18.4h, v18.4h, v19.4h //0011,0011,
- trn1 v20.4h, v20.4h, v21.4h //2233,2233
- zip1 v6.4s, v18.4s, v20.4s //iTc0: 0011,2233,0011,2233
- cmgt v7.16b, v6.16b, #0 // iTc0 Flag
+ ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x5]
+ trn1 v18.4h, v18.4h, v19.4h //0011,0011,
+ trn1 v20.4h, v20.4h, v21.4h //2233,2233
+ zip1 v6.4s, v18.4s, v20.4s //iTc0: 0011,2233,0011,2233
+ cmgt v7.16b, v6.16b, #0 // iTc0 Flag
- MASK_MATRIX v0, v1, v2, v3, v16, v17, v18
- and v7.16b, v7.16b, v18.16b // need filter flag
+ MASK_MATRIX v0, v1, v2, v3, v16, v17, v18
+ and v7.16b, v7.16b, v18.16b // need filter flag
- ZERO_JUMP_END v7, x4, x5, DeblockChromaLt4V_AArch64_neon_end
+ ZERO_JUMP_END v7, x4, x5, DeblockChromaLt4V_AArch64_neon_end
- eor v18.16b, v18.16b, v18.16b
- sub v18.16b, v18.16b, v6.16b //-iTc0: 0011,2233,0011,2233
+ eor v18.16b, v18.16b, v18.16b
+ sub v18.16b, v18.16b, v6.16b //-iTc0: 0011,2233,0011,2233
- DIFF_LUMA_LT4_P0_Q0_1 v0, v1, v2, v3, v19, v20, v22
- DIFF_LUMA_LT4_P0_Q0_2 v0, v1, v2, v3, v19, v20, v22
+ DIFF_LUMA_LT4_P0_Q0_1 v0, v1, v2, v3, v19, v20, v22
+ DIFF_LUMA_LT4_P0_Q0_2 v0, v1, v2, v3, v19, v20, v22
- smax v19.16b, v19.16b, v18.16b
- smin v19.16b, v19.16b, v6.16b
- and v19.16b, v19.16b, v7.16b
+ smax v19.16b, v19.16b, v18.16b
+ smin v19.16b, v19.16b, v6.16b
+ and v19.16b, v19.16b, v7.16b
- EXTRACT_DELTA_INTO_TWO_PART v19, v20
- uqadd v1.16b, v1.16b, v20.16b
- uqsub v1.16b, v1.16b, v19.16b
- st1 {v1.d} [0], [x6], x2
- st1 {v1.d} [1], [x7], x2
- uqsub v2.16b, v2.16b, v20.16b
- uqadd v2.16b, v2.16b, v19.16b
- st1 {v2.d} [0], [x6]
- st1 {v2.d} [1], [x7]
+ EXTRACT_DELTA_INTO_TWO_PART v19, v20
+ uqadd v1.16b, v1.16b, v20.16b
+ uqsub v1.16b, v1.16b, v19.16b
+ st1 {v1.d} [0], [x6], x2
+ st1 {v1.d} [1], [x7], x2
+ uqsub v2.16b, v2.16b, v20.16b
+ uqadd v2.16b, v2.16b, v19.16b
+ st1 {v2.d} [0], [x6]
+ st1 {v2.d} [1], [x7]
DeblockChromaLt4V_AArch64_neon_end:
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaLt4H_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc
- dup v16.16b, w3 //alpha
- dup v17.16b, w4 //beta
- sub x6, x0, #2 //pPixCb-2
- sub x7, x1, #2 //pPixCr-2
+ dup v16.16b, w3 //alpha
+ dup v17.16b, w4 //beta
+ sub x6, x0, #2 //pPixCb-2
+ sub x7, x1, #2 //pPixCr-2
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 0
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 1
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 2
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 3
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 4
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 5
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 6
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 7
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 0
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 1
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 2
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 3
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 4
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 5
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 6
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 7
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 8
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 9
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 10
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 11
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 12
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 13
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 14
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 15
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 8
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 9
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 10
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 11
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 12
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 13
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 14
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 15
- sub x0, x0, #1
- sub x1, x1, #1
+ sub x0, x0, #1
+ sub x1, x1, #1
- ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x5]
- trn1 v18.4h, v18.4h, v19.4h //0011,0011,
- trn1 v20.4h, v20.4h, v21.4h //2233,2233
- zip1 v6.4s, v18.4s, v20.4s //iTc0: 0011,2233,0011,2233
- cmgt v7.16b, v6.16b, #0 // iTc0 Flag
+ ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x5]
+ trn1 v18.4h, v18.4h, v19.4h //0011,0011,
+ trn1 v20.4h, v20.4h, v21.4h //2233,2233
+ zip1 v6.4s, v18.4s, v20.4s //iTc0: 0011,2233,0011,2233
+ cmgt v7.16b, v6.16b, #0 // iTc0 Flag
- MASK_MATRIX v0, v1, v2, v3, v16, v17, v18
- and v7.16b, v7.16b, v18.16b // need filter flag
+ MASK_MATRIX v0, v1, v2, v3, v16, v17, v18
+ and v7.16b, v7.16b, v18.16b // need filter flag
- ZERO_JUMP_END v7, x4, x5, DeblockChromaLt4H_AArch64_neon_end
- eor v18.16b, v18.16b, v18.16b
- sub v18.16b, v18.16b, v6.16b //-iTc0: 0011,2233,0011,2233
+ ZERO_JUMP_END v7, x4, x5, DeblockChromaLt4H_AArch64_neon_end
+ eor v18.16b, v18.16b, v18.16b
+ sub v18.16b, v18.16b, v6.16b //-iTc0: 0011,2233,0011,2233
- DIFF_LUMA_LT4_P0_Q0_1 v0, v1, v2, v3, v19, v20, v22
- DIFF_LUMA_LT4_P0_Q0_2 v0, v1, v2, v3, v19, v20, v22
+ DIFF_LUMA_LT4_P0_Q0_1 v0, v1, v2, v3, v19, v20, v22
+ DIFF_LUMA_LT4_P0_Q0_2 v0, v1, v2, v3, v19, v20, v22
- smax v19.16b, v19.16b, v18.16b
- smin v19.16b, v19.16b, v6.16b
- and v19.16b, v19.16b, v7.16b
+ smax v19.16b, v19.16b, v18.16b
+ smin v19.16b, v19.16b, v6.16b
+ and v19.16b, v19.16b, v7.16b
- EXTRACT_DELTA_INTO_TWO_PART v19, v20
- uqadd v1.16b, v1.16b, v20.16b
- uqsub v1.16b, v1.16b, v19.16b
- uqsub v2.16b, v2.16b, v20.16b
- uqadd v2.16b, v2.16b, v19.16b
+ EXTRACT_DELTA_INTO_TWO_PART v19, v20
+ uqadd v1.16b, v1.16b, v20.16b
+ uqsub v1.16b, v1.16b, v19.16b
+ uqsub v2.16b, v2.16b, v20.16b
+ uqadd v2.16b, v2.16b, v19.16b
- STORE_CHROMA_DATA_2 v1, v2, x0, 0
- STORE_CHROMA_DATA_2 v1, v2, x0, 1
- STORE_CHROMA_DATA_2 v1, v2, x0, 2
- STORE_CHROMA_DATA_2 v1, v2, x0, 3
- STORE_CHROMA_DATA_2 v1, v2, x0, 4
- STORE_CHROMA_DATA_2 v1, v2, x0, 5
- STORE_CHROMA_DATA_2 v1, v2, x0, 6
- STORE_CHROMA_DATA_2 v1, v2, x0, 7
+ STORE_CHROMA_DATA_2 v1, v2, x0, 0
+ STORE_CHROMA_DATA_2 v1, v2, x0, 1
+ STORE_CHROMA_DATA_2 v1, v2, x0, 2
+ STORE_CHROMA_DATA_2 v1, v2, x0, 3
+ STORE_CHROMA_DATA_2 v1, v2, x0, 4
+ STORE_CHROMA_DATA_2 v1, v2, x0, 5
+ STORE_CHROMA_DATA_2 v1, v2, x0, 6
+ STORE_CHROMA_DATA_2 v1, v2, x0, 7
- STORE_CHROMA_DATA_2 v1, v2, x1, 8
- STORE_CHROMA_DATA_2 v1, v2, x1, 9
- STORE_CHROMA_DATA_2 v1, v2, x1, 10
- STORE_CHROMA_DATA_2 v1, v2, x1, 11
- STORE_CHROMA_DATA_2 v1, v2, x1, 12
- STORE_CHROMA_DATA_2 v1, v2, x1, 13
- STORE_CHROMA_DATA_2 v1, v2, x1, 14
- STORE_CHROMA_DATA_2 v1, v2, x1, 15
+ STORE_CHROMA_DATA_2 v1, v2, x1, 8
+ STORE_CHROMA_DATA_2 v1, v2, x1, 9
+ STORE_CHROMA_DATA_2 v1, v2, x1, 10
+ STORE_CHROMA_DATA_2 v1, v2, x1, 11
+ STORE_CHROMA_DATA_2 v1, v2, x1, 12
+ STORE_CHROMA_DATA_2 v1, v2, x1, 13
+ STORE_CHROMA_DATA_2 v1, v2, x1, 14
+ STORE_CHROMA_DATA_2 v1, v2, x1, 15
DeblockChromaLt4H_AArch64_neon_end:
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaEq4V_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta
- dup v16.16b, w3 //alpha
- dup v17.16b, w4 //beta
- lsl x3, x2, #1
- sub x6, x0, x3 //pPixCb-2*Stride
- sub x7, x1, x3 //pPixCr-2*Stride
+ dup v16.16b, w3 //alpha
+ dup v17.16b, w4 //beta
+ lsl x3, x2, #1
+ sub x6, x0, x3 //pPixCb-2*Stride
+ sub x7, x1, x3 //pPixCr-2*Stride
- ld1 {v0.d} [0], [x6], x2
- ld1 {v1.d} [0], [x6]
- ld1 {v2.d} [0], [x0], x2
- ld1 {v3.d} [0], [x0]
- ld1 {v0.d} [1], [x7], x2
- ld1 {v1.d} [1], [x7]
- ld1 {v2.d} [1], [x1], x2
- ld1 {v3.d} [1], [x1]
+ ld1 {v0.d} [0], [x6], x2
+ ld1 {v1.d} [0], [x6]
+ ld1 {v2.d} [0], [x0], x2
+ ld1 {v3.d} [0], [x0]
+ ld1 {v0.d} [1], [x7], x2
+ ld1 {v1.d} [1], [x7]
+ ld1 {v2.d} [1], [x1], x2
+ ld1 {v3.d} [1], [x1]
- MASK_MATRIX v0, v1, v2, v3, v16, v17, v7
+ MASK_MATRIX v0, v1, v2, v3, v16, v17, v7
- ZERO_JUMP_END v7, x3, x4, DeblockChromaEq4V_AArch64_neon_end
+ ZERO_JUMP_END v7, x3, x4, DeblockChromaEq4V_AArch64_neon_end
- DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21
- DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21
+ DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21
+ DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21
- mov.16b v6, v7
- bsl v6.16b, v20.16b, v1.16b
- bsl v7.16b, v21.16b, v2.16b
+ mov.16b v6, v7
+ bsl v6.16b, v20.16b, v1.16b
+ bsl v7.16b, v21.16b, v2.16b
- st1 {v6.d} [0], [x6], x2
- st1 {v6.d} [1], [x7], x2
+ st1 {v6.d} [0], [x6], x2
+ st1 {v6.d} [1], [x7], x2
- st1 {v7.d} [0], [x6]
- st1 {v7.d} [1], [x7]
+ st1 {v7.d} [0], [x6]
+ st1 {v7.d} [1], [x7]
DeblockChromaEq4V_AArch64_neon_end:
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN DeblockChromaEq4H_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta
- dup v16.16b, w3 //alpha
- dup v17.16b, w4 //beta
+ dup v16.16b, w3 //alpha
+ dup v17.16b, w4 //beta
- sub x6, x0, #2 //pPixCb-2
- sub x7, x1, #2 //pPixCr-2
+ sub x6, x0, #2 //pPixCb-2
+ sub x7, x1, #2 //pPixCr-2
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 0
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 1
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 2
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 3
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 4
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 5
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 6
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 7
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 0
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 1
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 2
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 3
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 4
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 5
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 6
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 7
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 8
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 9
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 10
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 11
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 12
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 13
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 14
- LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 15
- sub x0, x0, #1
- sub x1, x1, #1
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 8
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 9
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 10
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 11
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 12
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 13
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 14
+ LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 15
+ sub x0, x0, #1
+ sub x1, x1, #1
- MASK_MATRIX v0, v1, v2, v3, v16, v17, v7
+ MASK_MATRIX v0, v1, v2, v3, v16, v17, v7
- ZERO_JUMP_END v7, x3, x4, DeblockChromaEq4H_AArch64_neon_end
+ ZERO_JUMP_END v7, x3, x4, DeblockChromaEq4H_AArch64_neon_end
- DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21
- DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21
+ DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21
+ DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21
- mov.16b v6, v7
- bsl v6.16b, v20.16b, v1.16b
- bsl v7.16b, v21.16b, v2.16b
+ mov.16b v6, v7
+ bsl v6.16b, v20.16b, v1.16b
+ bsl v7.16b, v21.16b, v2.16b
- STORE_CHROMA_DATA_2 v6, v7, x0, 0
- STORE_CHROMA_DATA_2 v6, v7, x0, 1
- STORE_CHROMA_DATA_2 v6, v7, x0, 2
- STORE_CHROMA_DATA_2 v6, v7, x0, 3
- STORE_CHROMA_DATA_2 v6, v7, x0, 4
- STORE_CHROMA_DATA_2 v6, v7, x0, 5
- STORE_CHROMA_DATA_2 v6, v7, x0, 6
- STORE_CHROMA_DATA_2 v6, v7, x0, 7
+ STORE_CHROMA_DATA_2 v6, v7, x0, 0
+ STORE_CHROMA_DATA_2 v6, v7, x0, 1
+ STORE_CHROMA_DATA_2 v6, v7, x0, 2
+ STORE_CHROMA_DATA_2 v6, v7, x0, 3
+ STORE_CHROMA_DATA_2 v6, v7, x0, 4
+ STORE_CHROMA_DATA_2 v6, v7, x0, 5
+ STORE_CHROMA_DATA_2 v6, v7, x0, 6
+ STORE_CHROMA_DATA_2 v6, v7, x0, 7
- STORE_CHROMA_DATA_2 v6, v7, x1, 8
- STORE_CHROMA_DATA_2 v6, v7, x1, 9
- STORE_CHROMA_DATA_2 v6, v7, x1, 10
- STORE_CHROMA_DATA_2 v6, v7, x1, 11
- STORE_CHROMA_DATA_2 v6, v7, x1, 12
- STORE_CHROMA_DATA_2 v6, v7, x1, 13
- STORE_CHROMA_DATA_2 v6, v7, x1, 14
- STORE_CHROMA_DATA_2 v6, v7, x1, 15
- DeblockChromaEq4H_AArch64_neon_end:
+ STORE_CHROMA_DATA_2 v6, v7, x1, 8
+ STORE_CHROMA_DATA_2 v6, v7, x1, 9
+ STORE_CHROMA_DATA_2 v6, v7, x1, 10
+ STORE_CHROMA_DATA_2 v6, v7, x1, 11
+ STORE_CHROMA_DATA_2 v6, v7, x1, 12
+ STORE_CHROMA_DATA_2 v6, v7, x1, 13
+ STORE_CHROMA_DATA_2 v6, v7, x1, 14
+ STORE_CHROMA_DATA_2 v6, v7, x1, 15
+ DeblockChromaEq4H_AArch64_neon_end:
WELS_ASM_ARCH64_FUNC_END
WELS_ASM_ARCH64_FUNC_BEGIN DeblockingBSCalcEnc_AArch64_neon
- // Checking the nzc status
- BS_NZC_CHECK x0, x2, x3, v16, v17 //v16,v17 save the nzc status
- // For checking bS[I] = 2
- movi v0.16b, #0
- cmgt v16.16b, v16.16b, v0.16b
- cmgt v17.16b, v17.16b, v0.16b
- movi v0.16b, #2
+ // Checking the nzc status
+ BS_NZC_CHECK x0, x2, x3, v16, v17 //v16,v17 save the nzc status
+ // For checking bS[I] = 2
+ movi v0.16b, #0
+ cmgt v16.16b, v16.16b, v0.16b
+ cmgt v17.16b, v17.16b, v0.16b
+ movi v0.16b, #2
- and v16.16b, v16.16b, v0.16b //v16 save the nzc check result all the time --- for dir is top
- and v17.16b, v17.16b, v0.16b //v17 save the nzc check result all the time --- for dir is left
+ and v16.16b, v16.16b, v0.16b //v16 save the nzc check result all the time --- for dir is top
+ and v17.16b, v17.16b, v0.16b //v17 save the nzc check result all the time --- for dir is left
- // Checking the mv status
- BS_MV_CHECK x1, x2, x3, v18, v19, v5 , v6 //v18, v19 save the mv status
- // For checking bS[I] = 1
- movi v0.16b, #1
- and v18.16b, v18.16b, v0.16b //v18 save the mv check result all the time --- for dir is top
- and v19.16b, v19.16b, v0.16b //v19 save the mv check result all the time --- for dir is left
- // Check bS[I] is '1' or '2'
- umax v1.16b, v18.16b, v16.16b
- umax v0.16b, v19.16b, v17.16b
- st1 {v0.16b, v1.16b}, [x4]
+ // Checking the mv status
+ BS_MV_CHECK x1, x2, x3, v18, v19, v5 , v6 //v18, v19 save the mv status
+ // For checking bS[I] = 1
+ movi v0.16b, #1
+    and v18.16b, v18.16b, v0.16b //v18 save the mv check result all the time --- for dir is top
+    and v19.16b, v19.16b, v0.16b //v19 save the mv check result all the time --- for dir is left
+ // Check bS[I] is '1' or '2'
+ umax v1.16b, v18.16b, v16.16b
+ umax v0.16b, v19.16b, v17.16b
+ st1 {v0.16b, v1.16b}, [x4]
WELS_ASM_ARCH64_FUNC_END