ref: d8202cf38f2145a39e22021f70e033cd5dc6401d
parent: cdce1b73ca1a39289e75f19e3ab97539e90e2436
author: Martin Storsjö <martin@martin.st>
date: Fri Mar 27 06:54:14 EDT 2015
Remove Apple-specific versions of arm64 macros with arguments. The Apple assembler for arm64 can handle the GNU binutils-style macros just fine, so there is no need to duplicate all of these macros in two syntaxes when the new one works fine in all cases. We already require a new enough assembler to support the GNU binutils-style features, since we use the .rept directive in a few places.
--- a/codec/common/arm64/copy_mb_aarch64_neon.S
+++ b/codec/common/arm64/copy_mb_aarch64_neon.S
@@ -33,80 +33,6 @@
#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"
-#ifdef __APPLE__
-.macro LOAD_ALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, src*, src_stride
- ld1 {$0.d}[0], [$4], $5
- ld1 {$1.d}[0], [$4], $5
- ld1 {$2.d}[0], [$4], $5
- ld1 {$3.d}[0], [$4], $5
-// }
-.endm
-
-.macro STORE_ALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, dst*, dst_stride
- st1 {$0.d}[0], [$4], $5
- st1 {$1.d}[0], [$4], $5
- st1 {$2.d}[0], [$4], $5
- st1 {$3.d}[0], [$4], $5
-// }
-.endm
-
-.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, src*, src_stride
- ld1 {$0.8b}, [$4], $5
- ld1 {$1.8b}, [$4], $5
- ld1 {$2.8b}, [$4], $5
- ld1 {$3.8b}, [$4], $5
-// }
-.endm
-
-.macro STORE_UNALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, dst*, dst_stride
- st1 {$0.8b}, [$4], $5
- st1 {$1.8b}, [$4], $5
- st1 {$2.8b}, [$4], $5
- st1 {$3.8b}, [$4], $5
-// }
-.endm
-
-.macro LOAD16_ALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, src*, src_stride
- ld1 {$0.2d}, [$4], $5
- ld1 {$1.2d}, [$4], $5
- ld1 {$2.2d}, [$4], $5
- ld1 {$3.2d}, [$4], $5
-// }
-.endm
-
-.macro STORE16_ALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, dst*, dst_stride
- st1 {$0.2d}, [$4], $5
- st1 {$1.2d}, [$4], $5
- st1 {$2.2d}, [$4], $5
- st1 {$3.2d}, [$4], $5
-// }
-.endm
-
-.macro LOAD16_UNALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, src*, src_stride
- ld1 {$0.16b}, [$4], $5
- ld1 {$1.16b}, [$4], $5
- ld1 {$2.16b}, [$4], $5
- ld1 {$3.16b}, [$4], $5
-// }
-.endm
-
-.macro STORE16_UNALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, dst*, dst_stride
- st1 {$0.16b}, [$4], $5
- st1 {$1.16b}, [$4], $5
- st1 {$2.16b}, [$4], $5
- st1 {$3.16b}, [$4], $5
-// }
-.endm
-
-#else
.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: $0~$3, src*, src_stride
ld1 {\arg0\().d}[0], [\arg4], \arg5
@@ -178,8 +104,6 @@
st1 {\arg3\().16b}, [\arg4], \arg5
// }
.endm
-
-#endif
WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy8x8_AArch64_neon
--- a/codec/common/arm64/deblocking_aarch64_neon.S
+++ b/codec/common/arm64/deblocking_aarch64_neon.S
@@ -33,266 +33,7 @@
#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"
-#ifdef __APPLE__
-.macro MASK_MATRIX
- uabd $6.16b, $1.16b, $2.16b
- cmhi $6.16b, $4.16b, $6.16b
-
- uabd $4.16b, $0.16b, $1.16b
- cmhi $4.16b, $5.16b, $4.16b
- and $6.16b, $6.16b, $4.16b
-
- uabd $4.16b, $3.16b, $2.16b
- cmhi $4.16b, $5.16b, $4.16b
- and $6.16b, $6.16b, $4.16b
-.endm
-
-.macro DIFF_LUMA_LT4_P1_Q1 //(Use Tmp v23, v24)
- //v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
- urhadd $8.16b, $2.16b, $3.16b
- uhadd $8.16b, $0.16b, $8.16b
- usubl $9.8h, $8.8b, $1.8b
- sqxtn $9.8b, $9.8h
- usubl2 $8.8h, $8.16b, $1.16b
- sqxtn2 $9.16b, $8.8h
- smax $8.16b, $9.16b, $5.16b
-//
- smin $8.16b, $8.16b, $6.16b
- uabd $9.16b, $0.16b, $2.16b
- cmhi $9.16b, $4.16b, $9.16b
- and $8.16b, $8.16b, $9.16b
- and $8.16b, $8.16b, $7.16b
- add $8.16b, $1.16b, $8.16b
- abs $9.16b, $9.16b
-.endm
-
-.macro DIFF_LUMA_LT4_P0_Q0_1
- usubl $5.8h, $0.8b, $3.8b
- usubl $6.8h, $2.8b, $1.8b
- shl $6.8h, $6.8h, #2
- add $5.8h, $5.8h, $6.8h
- sqrshrn $4.8b, $5.8h, #3
-.endm
-
-.macro DIFF_LUMA_LT4_P0_Q0_2
- usubl2 $5.8h, $0.16b, $3.16b
- usubl2 $6.8h, $2.16b, $1.16b
- shl $6.8h, $6.8h, #2
- add $5.8h, $5.8h, $6.8h
- sqrshrn2 $4.16b, $5.8h, #3
-.endm
-
-.macro EXTRACT_DELTA_INTO_TWO_PART
- cmge $1.16b, $0.16b, #0
- and $1.16b, $0.16b, $1.16b
- sub $0.16b, $1.16b, $0.16b
-.endm
-
-.macro DIFF_LUMA_EQ4_P2P1P0_1
- uaddl $8.8h, $1.8b, $2.8b
- uaddl $9.8h, $3.8b, $4.8b
- add $9.8h, $9.8h, $8.8h
-
- uaddl $8.8h, $0.8b, $1.8b
- shl $8.8h, $8.8h, #1
- add $8.8h, $9.8h, $8.8h
-
- rshrn $0.8b, $9.8h, #2
- rshrn $7.8b, $8.8h, #3
- shl $9.8h, $9.8h, #1
- usubl $8.8h, $5.8b, $1.8b
- add $9.8h, $8.8h, $9.8h
-
- uaddl $8.8h, $2.8b, $5.8b
- uaddw $8.8h, $8.8h, $2.8b
- uaddw $8.8h, $8.8h, $3.8b
-
- rshrn $9.8b, $9.8h, #3
- rshrn $8.8b, $8.8h, #2
- bsl $6.8b, $9.8b, $8.8b
-.endm
-
-.macro DIFF_LUMA_EQ4_P2P1P0_2
- uaddl2 $8.8h, $1.16b, $2.16b
- uaddl2 $9.8h, $3.16b, $4.16b
- add $9.8h, $9.8h, $8.8h
-
- uaddl2 $8.8h, $0.16b, $1.16b
- shl $8.8h, $8.8h, #1
- add $8.8h, $9.8h, $8.8h
-
- rshrn2 $0.16b, $9.8h, #2
- rshrn2 $7.16b, $8.8h, #3
- shl $9.8h, $9.8h, #1
- usubl2 $8.8h, $5.16b, $1.16b
- add $9.8h, $8.8h, $9.8h
-
- uaddl2 $8.8h, $2.16b, $5.16b
- uaddw2 $8.8h, $8.8h, $2.16b
- uaddw2 $8.8h, $8.8h, $3.16b
-
- rshrn2 $9.16b, $9.8h, #3
- rshrn2 $8.16b, $8.8h, #2
- bsl $6.16b, $9.16b, $8.16b
-.endm
-
-
-.macro DIFF_CHROMA_EQ4_P0Q0_1
- uaddl $4.8h, $0.8b, $3.8b
- shl $4.8h, $4.8h, #1
- usubl $5.8h, $1.8b, $3.8b
- add $5.8h, $5.8h, $4.8h
- rshrn $6.8b, $5.8h, #2
- usubl $5.8h, $2.8b, $0.8b
- add $5.8h, $5.8h, $4.8h
- rshrn $7.8b, $5.8h, #2
-.endm
-
-.macro DIFF_CHROMA_EQ4_P0Q0_2
- uaddl2 $4.8h, $0.16b, $3.16b
- shl $4.8h, $4.8h, #1
- usubl2 $5.8h, $1.16b, $3.16b
- add $5.8h, $5.8h, $4.8h
- rshrn2 $6.16b, $5.8h, #2
- usubl2 $5.8h, $2.16b, $0.16b
- add $5.8h, $5.8h, $4.8h
- rshrn2 $7.16b, $5.8h, #2
-.endm
-
-.macro DIFF_LUMA_EQ4_MASK
- mov $3.16b, $2.16b
- bsl $3.16b, $0.16b, $1.16b
-.endm
-
-.macro LOAD_LUMA_DATA_3
- ld3 {$0.b, $1.b, $2.b} [$6], [x2], x1
- ld3 {$3.b, $4.b, $5.b} [$6], [x0], x1
-.endm
-
-.macro LOAD_LUMA_DATA_4
- ld4 {$0.b, $1.b, $2.b, $3.b} [$8], [x3], x1
- ld4 {$4.b, $5.b, $6.b, $7.b} [$8], [x0], x1
-.endm
-
-.macro STORE_LUMA_DATA_4
- st4 {$0.b, $1.b, $2.b, $3.b} [$4], [x0], x1
- st4 {$0.b, $1.b, $2.b, $3.b} [$5], [x2], x1
-.endm
-
-.macro STORE_LUMA_DATA_3
- st3 {$0.b, $1.b, $2.b} [$6], [x3], x1
- st3 {$3.b, $4.b, $5.b} [$6], [x0], x1
-.endm
-
-.macro LOAD_CHROMA_DATA_4
- ld4 {$0.b, $1.b, $2.b, $3.b} [$5], [$4], x2
-.endm
-
-.macro STORE_CHROMA_DATA_2
- st2 {$0.b, $1.b} [$3], [$2], x2
-.endm
-
-.macro ZERO_JUMP_END
- mov $1, $0.d[0]
- mov $2, $0.d[1]
- orr $1, $1, $2
- cbz $1, $3
-.endm
-
-.macro BS_NZC_CHECK
- ld1 {v0.16b}, [$0]
- //Arrange the input data --- TOP
- ands x6, $1, #2
- cbz x6, bs_nzc_check_jump0
- sub x6, $0, $2, lsl #4
- sub x6, x6, $2, lsl #3
- add x6, x6, #12
- ld1 {v1.s} [3], [x6]
-
- bs_nzc_check_jump0:
- ext v1.16b, v1.16b, v0.16b, #12
- add $3.16b, v0.16b, v1.16b
-
- // Arrange the input data --- LEFT
- ands x6, $1, #1
- cbz x6, bs_nzc_check_jump1
-
- sub x6, $0, #21
- add x7, x6, #4
- ld1 {v1.b} [12], [x6]
- add x6, x7, #4
- ld1 {v1.b} [13], [x7]
- add x7, x6, #4
- ld1 {v1.b} [14], [x6]
- ld1 {v1.b} [15], [x7]
-
-bs_nzc_check_jump1:
- ins v2.d[0], v0.d[1]
- zip1 v0.16b, v0.16b, v2.16b
- ins v2.d[0], v0.d[1]
- zip1 v0.16b, v0.16b, v2.16b
- ext v1.16b, v1.16b, v0.16b, #12
- add $4.16b, v0.16b, v1.16b
-.endm
-
-.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5
- mov w6, #4
- sabd v20.8h, $0.8h, $1.8h
- sabd v21.8h, $1.8h, $2.8h
- dup $0.8h, w6
- sabd v22.8h, $2.8h, $3.8h
- sabd v23.8h, $3.8h, $4.8h
-
- cmge v20.8h, v20.8h, $0.8h
- cmge v21.8h, v21.8h, $0.8h
- cmge v22.8h, v22.8h, $0.8h
- cmge v23.8h, v23.8h, $0.8h
-
- addp v20.8h, v20.8h, v21.8h
- addp v21.8h, v22.8h, v23.8h
-
- addhn $5.8b, v20.8h, v20.8h
- addhn2 $5.16b, v21.8h, v21.8h
-.endm
-
-.macro BS_MV_CHECK
- ldp q0, q1, [$0], #32
- ldp q2, q3, [$0]
- sub $0, $0, #32
- // Arrenge the input data --- TOP
- ands x6, $1, #2
- cbz x6, bs_mv_check_jump0
- sub x6, $0, $2, lsl #6
- add x6, x6, #48
- ld1 {v4.16b}, [x6]
-bs_mv_check_jump0:
- BS_COMPARE_MV v4, v0, v1, v2, v3, $3
- // Arrange the input data --- LEFT
- ands x6, $1, #1
- cbz x6, bs_mv_check_jump1
- sub x6, $0, #52
- add x7, x6, #16
- ld1 {v4.s} [0], [x6]
- add x6, x7, #16
- ld1 {v4.s} [1], [x7]
- add x7, x6, #16
- ld1 {v4.s} [2], [x6]
- ld1 {v4.s} [3], [x7]
-bs_mv_check_jump1:
- zip1 $5.4s, v0.4s, v2.4s
- zip2 $6.4s, v0.4s, v2.4s
- zip1 v0.4s, v1.4s, v3.4s
- zip2 v2.4s, v1.4s, v3.4s
- zip2 v1.4s, $5.4s, v0.4s
- zip1 v0.4s, $5.4s, v0.4s
- zip2 v3.4s, $6.4s, v2.4s
- zip1 v2.4s, $6.4s, v2.4s
- BS_COMPARE_MV v4, v0, v1, v2, v3, $4
-.endm
-
-#else
-
.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
uabd \arg6\().16b, \arg1\().16b, \arg2\().16b
cmhi \arg6\().16b, \arg4\().16b, \arg6\().16b
@@ -549,7 +290,6 @@
zip1 v2.4s, \arg6\().4s, v2.4s
BS_COMPARE_MV v4, v0, v1, v2, v3, \arg4
.endm
-#endif
WELS_ASM_AARCH64_FUNC_BEGIN WelsNonZeroCount_AArch64_neon
mov w1, #1
--- a/codec/common/arm64/mc_aarch64_neon.S
+++ b/codec/common/arm64/mc_aarch64_neon.S
@@ -35,181 +35,6 @@
.align 4
filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
-#ifdef __APPLE__
-
-.macro FILTER_6TAG_8BITS1
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
- uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
- uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
- mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
- uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
- mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
- sqrshrun $6.8b, v18.8h, #5
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS2
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
- uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
- mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
- uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
- mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
- sqrshrun2 $6.16b, v18.8h, #5
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
- uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
- uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
- mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
- uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
- mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
- sqrshrun $6.8b, v18.8h, #5
- uaddl v19.8h, $2.8b, $6.8b
- rshrn $6.8b, v19.8h, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
- uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
- mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
- uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
- mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
- sqrshrun2 $6.16b, v18.8h, #5
- uaddl2 v19.8h, $2.16b, $6.16b
- rshrn2 $6.16b, v19.8h, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
- uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
- uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
- mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
- uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
- mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
- sqrshrun $6.8b, v18.8h, #5
- uaddl v19.8h, $3.8b, $6.8b
- rshrn $6.8b, v19.8h, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
- uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
- mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
- uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
- mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
- sqrshrun2 $6.16b, v18.8h, #5
- uaddl2 v19.8h, $3.16b, $6.16b
- rshrn2 $6.16b, v19.8h, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_TO_16BITS1
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
- uaddl $6.8h, $0.8b, $5.8b //dst_q=src[-2]+src[3]
- uaddl v31.8h, $2.8b, $3.8b //src[0]+src[1]
- mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
- uaddl v31.8h, $1.8b, $4.8b //src[-1]+src[2]
- mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_TO_16BITS2
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
- uaddl2 $6.8h, $0.16b, $5.16b //dst_q=src[-2]+src[3]
- uaddl2 v31.8h, $2.16b, $3.16b //src[0]+src[1]
- mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
- uaddl2 v31.8h, $1.16b, $4.16b //src[-1]+src[2]
- mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
-.endm
-
-.macro FILTER_3_IN_16BITS_TO_8BITS1
-// { // input:a, b, c, dst_d;
- sub $0.8h, $0.8h, $1.8h //a-b
- sshr $0.8h, $0.8h, #2 //(a-b)/4
- sub $0.8h, $0.8h, $1.8h //(a-b)/4-b
- add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c
- sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4
- add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- sqrshrun $3.8b, $0.8h, #6 //(+32)>>6
-// }
-.endm
-
-.macro FILTER_3_IN_16BITS_TO_8BITS2
-// { // input:a, b, c, dst_d;
- sub $0.8h, $0.8h, $1.8h //a-b
- sshr $0.8h, $0.8h, #2 //(a-b)/4
- sub $0.8h, $0.8h, $1.8h //(a-b)/4-b
- add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c
- sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4
- add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- sqrshrun2 $3.16b, $0.8h, #6 //(+32)>>6
-// }
-.endm
-
-.macro UNPACK_2_16BITS_TO_ABC
-// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
- ext $4.16b, $0.16b, $1.16b, #4 //src[0]
- ext $3.16b, $0.16b, $1.16b, #6 //src[1]
- add $4.8h, $4.8h, $3.8h //c=src[0]+src[1]
-
- ext $3.16b, $0.16b, $1.16b, #2 //src[-1]
- ext $2.16b, $0.16b, $1.16b, #8 //src[2]
- add $3.8h, $3.8h, $2.8h //b=src[-1]+src[2]
-
- ext $2.16b, $0.16b, $1.16b, #10 //src[3]
- add $2.8h, $2.8h, $0.8h //a=src[-2]+src[3]
-// }
-.endm
-
-.macro AVERAGE_TWO_8BITS1
-// { // input:dst_d, src_d A and B; working: v5
- uaddl v30.8h, $2.8b, $1.8b
- rshrn $0.8b, v30.8h, #1
-// }
-.endm
-
-.macro AVERAGE_TWO_8BITS2
-// { // input:dst_d, src_d A and B; working: v5
- uaddl2 v30.8h, $2.16b, $1.16b
- rshrn2 $0.16b, v30.8h, #1
-// }
-.endm
-
-.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
-// { // input: src_d{Y[0][1][2][3][4][5]X},
- rev64 $2.8b, $0.8b // X[5][4][3][2][1][0]O
- uaddl $2.8h, $0.8b, $2.8b // each 16bits, *[50][41][32][23][14][05]*
- mul $2.4h, $2.4h, $1.4h // 0+1*[50]-5*[41]+20[32]
- addv $3, $2.4h
- sqrshrun $0.8b, $0.8h, #5
-// }
-.endm
-
-.macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
-// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
- ext $3.16b, $1.16b, $1.16b, #14 // X[0][1][2][3][4][5]O
- ext $4.16b, $3.16b, $3.16b, #8 // [3][4][5]OX[0][1][2]
- rev64 $4.8h, $4.8h // X[5][4][3][2][1][0]O
- add $3.8h, $3.8h, $4.8h // each 16bits, *[50][41][32][23][14][05]*
- smull $3.4s, $3.4h, $2.4h // 0+1*[50]-5*[41]+20[32]
- saddlv $5, $3.4s
- //sshr $0.2d, $0.2d, #4
- sqrshrun $0.2s, $0.2d, #10
- uqxtn $0.4h, $0.4s
- uqxtn $0.8b, $0.8h
- // }
-.endm
-
-#else
.macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
@@ -382,7 +207,6 @@
uqxtn \arg0\().8b, \arg0\().8h
// }
.endm
-#endif
//(const uint8_t* pSrc {x0}, int32_t iSrcStride{x1}, uint8_t* pDst{x2}, int32_t iDstStride{x3}, int32_t iHeight{x4})
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20WidthEq16_AArch64_neon
--- a/codec/decoder/core/arm64/block_add_aarch64_neon.S
+++ b/codec/decoder/core/arm64/block_add_aarch64_neon.S
@@ -32,40 +32,7 @@
#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"
-#ifdef __APPLE__
-.macro ROW_TRANSFORM_1_STEP
-// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
- saddl $4.4s, $0.4h, $2.4h //int32 e[i][0] = src[0] + src[2];
- ssubl $5.4s, $0.4h, $2.4h //int32 e[i][1] = src[0] - src[2];
- sshr $8.4h, $1.4h, #1
- sshr $9.4h, $3.4h, #1
- ssubl $6.4s, $8.4h, $3.4h //int32 e[i][2] = (src[1]>>1)-src[3];
- saddl $7.4s, $1.4h, $9.4h //int32 e[i][3] = src[1] + (src[3]>>1);
-// }
-.endm
-
-.macro TRANSFORM_4BYTES // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- add $0.4s, $4.4s, $7.4s //int16 f[i][0] = e[i][0] + e[i][3];
- add $1.4s, $5.4s, $6.4s //int16 f[i][1] = e[i][1] + e[i][2];
- sub $2.4s, $5.4s, $6.4s //int16 f[i][2] = e[i][1] - e[i][2];
- sub $3.4s, $4.4s, $7.4s //int16 f[i][3] = e[i][0] - e[i][3];
-// }
-.endm
-
-.macro COL_TRANSFORM_1_STEP
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- add $4.4s, $0.4s, $2.4s //int32 e[0][j] = f[0][j] + f[2][j];
- sub $5.4s, $0.4s, $2.4s //int32 e[1][j] = f[0][j] - f[2][j];
- sshr $6.4s, $1.4s, #1
- sshr $7.4s, $3.4s, #1
- sub $6.4s, $6.4s, $3.4s //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- add $7.4s, $1.4s, $7.4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
-.endm
-
-#else
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8\() \arg9\()
@@ -98,7 +65,6 @@
add \arg7\().4s, \arg1\().4s, \arg7\().4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
-#endif
// uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_AARCH64_FUNC_BEGIN IdctResAddPred_AArch64_neon
--- a/codec/encoder/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S
+++ b/codec/encoder/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S
@@ -93,93 +93,6 @@
trn2 v17.4s, v4.4s, v5.4s //{0,1,3,2, 4,5,7,6} v16 {8,9,11,10, 12,13,15,14} v17
.endm
-#ifdef __APPLE__
-.macro SELECT_BEST_COST
- cmp w1, $0
- csel $0, $0, w1, $2
- cset w7, $1
- cmp w2, $0
- mov w6, #2
- csel $0, $0, w2, $2
- csel w7, w7, w6, $2
-.endm
-
-.macro SELECT_BEST_COST_PREFER_HIGHER arg0
- SELECT_BEST_COST \arg0, ls, hi
-.endm
-
-.macro SELECT_BEST_COST_PREFER_LOWER arg0
- SELECT_BEST_COST \arg0, lo, hs
-.endm
-
-.macro LOAD_CHROMA_DATA
- sub x9, $0, x1
- ld1 {$1}, [x9] //top_cb
- sub x9, $0, #1
- ld1 {$2}[8], [x9], x1
- ld1 {$2}[9], [x9], x1
- ld1 {$2}[10], [x9], x1
- ld1 {$2}[11], [x9], x1
- ld1 {$2}[12], [x9], x1
- ld1 {$2}[13], [x9], x1
- ld1 {$2}[14], [x9], x1
- ld1 {$2}[15], [x9], x1 //left_cb
-.endm
-
-.macro LOAD_8X4_DATA
- //Load the p_enc data and save to "v20 ~ v21"--- 8X4 bytes
- ld1 {v0.8b}, [$0], x3
- ld1 {v1.8b}, [$0], x3
- ld1 {v0.d}[1], [$0], x3
- ld1 {v1.d}[1], [$0], x3
- trn1 v2.4s, v0.4s, v1.4s
- trn2 v1.4s, v0.4s, v1.4s
- trn1 v20.2d, v2.2d, v1.2d
- trn2 v21.2d, v2.2d, v1.2d
-.endm
-
-.macro HDM_TRANSFORM_4X4_L0
- //Do the vertical transform
- uadd$9 v0.8h, $0, $1
- usub$9 v1.8h, $0, $1
- trn1 v3.2d, v0.2d, v1.2d
- trn2 v1.2d, v0.2d, v1.2d
- add v4.8h, v3.8h, v1.8h //{0,1,2,3,4,5,6,7}
- sub v5.8h, v3.8h, v1.8h //{12,13,14,15,8,9,10,11}
-
- //Do the horizontal transform
- trn1 v0.4s, v4.4s, v5.4s
- trn2 v1.4s, v4.4s, v5.4s
- add v4.8h, v0.8h, v1.8h
- sub v5.8h, v0.8h, v1.8h
- trn1 v0.8h, v4.8h, v5.8h
- trn2 v1.8h, v4.8h, v5.8h
- add v4.8h, v0.8h, v1.8h
- sub v5.8h, v0.8h, v1.8h
-
- //16x16_v
- trn1 v0.2s, v4.2s, v5.2s
- trn2 v1.2s, v4.2s, v5.2s
- sabal $5, v0.4h, $2
- sabal $5, v1.4h, $8.4h
- sabal2 $5, v4.8h, $8.8h
- sabal2 $5, v5.8h, $8.8h
-
- //16x16_h
- ins v3.d[0], v4.d[1]
- trn1 v0.4h, v4.4h, v3.4h
- trn2 v1.4h, v4.4h, v3.4h
- sabal $6, v0.4h, $3
- sabdl v4.4s, v1.4h, $8.4h
- sabal v4.4s, v5.4h, $8.4h
- sabal2 v4.4s, v5.8h, $8.8h
- add $6, $6, v4.4s
-
- //16x16_dc_both
- sabal $7, v0.4h, $4
- add $7, $7, v4.4s
-.endm
-#else
.macro SELECT_BEST_COST arg0, arg1, arg2
cmp w1, \arg0
csel \arg0, \arg0, w1, \arg2
@@ -265,7 +178,6 @@
sabal \arg7, v0.4h, \arg4
add \arg7, \arg7, v4.4s
.endm
-#endif
WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra8x8Combined3Sad_AArch64_neon
ldr x11, [sp, #0]
--- a/codec/encoder/core/arm64/pixel_aarch64_neon.S
+++ b/codec/encoder/core/arm64/pixel_aarch64_neon.S
@@ -68,89 +68,6 @@
ld1 {v7.16b}, [x0], x1
.endm
-#ifdef __APPLE__
-.macro LOAD_8X8_2
- ld1 {v16.8b}, [$0], x3
- ld1 {v17.8b}, [$0], x3
- ld1 {v18.8b}, [$0], x3
- ld1 {v19.8b}, [$0], x3
- ld1 {v20.8b}, [$0], x3
- ld1 {v21.8b}, [$0], x3
- ld1 {v22.8b}, [$0], x3
- ld1 {v23.8b}, [$0], x3
-.endm
-
-.macro CALC_ABS_8X8_1
- uab$1l $0, v0.8b, v16.8b
- uabal $0, v1.8b, v17.8b
- uabal $0, v2.8b, v18.8b
- uabal $0, v3.8b, v19.8b
- uabal $0, v4.8b, v20.8b
- uabal $0, v5.8b, v21.8b
- uabal $0, v6.8b, v22.8b
- uabal $0, v7.8b, v23.8b
-.endm
-
-.macro CALC_ABS_8X8_2
- uab$0l v29.8h, v0.8b, v18.8b
- uabal v29.8h, v1.8b, v19.8b
- uabal v29.8h, v2.8b, v20.8b
- uabal v29.8h, v3.8b, v21.8b
- uabal v29.8h, v4.8b, v22.8b
- uabal v29.8h, v5.8b, v23.8b
- uabal v29.8h, v6.8b, v24.8b
- uabal v29.8h, v7.8b, v25.8b
-.endm
-
-.macro LOAD_16X8_2
- ld1 {v16.16b}, [$0], x3
- ld1 {v17.16b}, [$0], x3
- ld1 {v18.16b}, [$0], x3
- ld1 {v19.16b}, [$0], x3
- ld1 {v20.16b}, [$0], x3
- ld1 {v21.16b}, [$0], x3
- ld1 {v22.16b}, [$0], x3
- ld1 {v23.16b}, [$0], x3
-.endm
-
-.macro CALC_ABS_16X8_1
- uab$1l $0, v0.8b, v16.8b
- uabal2 $0, v0.16b,v16.16b
- uabal $0, v1.8b, v17.8b
- uabal2 $0, v1.16b,v17.16b
- uabal $0, v2.8b, v18.8b
- uabal2 $0, v2.16b,v18.16b
- uabal $0, v3.8b, v19.8b
- uabal2 $0, v3.16b,v19.16b
- uabal $0, v4.8b, v20.8b
- uabal2 $0, v4.16b,v20.16b
- uabal $0, v5.8b, v21.8b
- uabal2 $0, v5.16b,v21.16b
- uabal $0, v6.8b, v22.8b
- uabal2 $0, v6.16b,v22.16b
- uabal $0, v7.8b, v23.8b
- uabal2 $0, v7.16b,v23.16b
-.endm
-
-.macro CALC_ABS_16X8_2
- uab$0l v29.8h, v0.8b, v18.8b
- uabal2 v29.8h, v0.16b,v18.16b
- uabal v29.8h, v1.8b, v19.8b
- uabal2 v29.8h, v1.16b,v19.16b
- uabal v29.8h, v2.8b, v20.8b
- uabal2 v29.8h, v2.16b,v20.16b
- uabal v29.8h, v3.8b, v21.8b
- uabal2 v29.8h, v3.16b,v21.16b
- uabal v29.8h, v4.8b, v22.8b
- uabal2 v29.8h, v4.16b,v22.16b
- uabal v29.8h, v5.8b, v23.8b
- uabal2 v29.8h, v5.16b,v23.16b
- uabal v29.8h, v6.8b, v24.8b
- uabal2 v29.8h, v6.16b,v24.16b
- uabal v29.8h, v7.8b, v25.8b
- uabal2 v29.8h, v7.16b,v25.16b
-.endm
-#else
.macro LOAD_8X8_2 arg0
ld1 {v16.8b}, [\arg0], x3
ld1 {v17.8b}, [\arg0], x3
@@ -232,7 +149,6 @@
uabal v29.8h, v7.8b, v25.8b
uabal2 v29.8h, v7.16b,v25.16b
.endm
-#endif
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad4x4_AArch64_neon
sxtw x1, w1
--- a/codec/encoder/core/arm64/reconstruct_aarch64_neon.S
+++ b/codec/encoder/core/arm64/reconstruct_aarch64_neon.S
@@ -33,247 +33,6 @@
#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"
-#ifdef __APPLE__
-.macro ZERO_COUNT_IN_2_QUARWORD
-// { // input: coef_0 (identy to $3 $4), coef_1(identy to $5 $6), mask_q
- cmeq $0.8h, $0.8h, #0
- cmeq $1.8h, $1.8h, #0
- uzp1 $0.16b, $0.16b, $1.16b
- ushr $0.16b, $0.16b, 7
- addv $2, $0.16b
-// }
-.endm
-
-.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
-// { // input: coef, ff (dst), mf
- eor $3.16b, $3.16b, $3.16b // init 0 , and keep 0;
- saba $1.8h, $0.8h, $3.8h // f + abs(coef - 0)
- smull $4.4s, $1.4h, $2.4h
- smull2 $5.4s, $1.8h, $2.8h
- shrn $1.4h, $4.4s, #16
- shrn2 $1.8h, $5.4s, #16
-
- cmgt $4.8h, $0.8h, #0 // if true, location of coef == 11111111
- bif $3.16b, $1.16b, $4.16b // if (x<0) reserved part; else keep 0 untouched
- shl $3.8h, $3.8h, #1
- sub $1.8h, $1.8h, $3.8h // if x > 0, -= 0; else x-= 2x
-// }
-.endm
-
-.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, - coef; else , coef;
-// { // input: coef, ff (dst), mf
- eor $3.16b, $3.16b, $3.16b // init 0 , and keep 0;
- saba $1.8h, $0.8h, $3.8h // f + abs(coef - 0)
- smull $4.4s, $1.4h, $2.4h
- smull2 $5.4s, $1.8h, $2.8h
- shrn $1.4h, $4.4s, #16
- shrn2 $1.8h, $5.4s, #16
-
- cmgt $4.8h, $0.8h, #0 // if true, location of coef == 11111111
- bif $3.16b, $1.16b, $4.16b // if (x<0) reserved part; else keep 0 untouched
- shl $3.8h, $3.8h, #1
- mov $6.16b, $1.16b
- sub $1.8h, $1.8h, $3.8h // if x > 0, -= 0; else x-= 2x
-// }
-.endm
-
-.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
-// { // input: coef, ff (dst), mf
- saba $1.8h, $0.8h, $3.8h // f + abs(coef - 0)
- smull $4.4s, $1.4h, $2.4h
- shrn $1.4h, $4.4s, #16
-
- cmgt $4.8h, $0.8h, #0 // if true, location of coef == 11111111
- bif $3.16b, $1.16b, $4.16b // if (x<0) reserved part; else keep 0 untouched
- shl $3.8h, $3.8h, #1
- sub $1.8h, $1.8h, $3.8h // if x > 0, -= 0; else x-= 2x
-// }
-.endm
-
-.macro SELECT_MAX_IN_ABS_COEF
-// { // input: coef_0, coef_1, coef_2, coef_3, max_q (identy to follow two)
- umax $0.8h, $0.8h, $1.8h
- umaxv $4, $0.8h
- umax $2.8h, $2.8h, $3.8h
- umaxv $5, $2.8h
-// }
-.endm
-
-.macro HDM_QUANT_2x2_TOTAL_16BITS
-// { // input: src_d[0][16][32][48], dst_d[0][16][32][48], working
- sshr $1.2d, $0.2d, #32
- add $2.4h, $0.4h, $1.4h // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
- sub $1.4h, $0.4h, $1.4h // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
- zip1 $1.4h, $2.4h, $1.4h
-// }
-.endm
-
-
-.macro DC_ZERO_COUNT_IN_DUALWORD
-// { // input: coef, dst_d, working_d (all 0x01)
- cmeq $0.4h, $0.4h, #0
- and $0.8b, $0.8b, $2.8b
- addv $1, $0.4h
-// }
-.endm
-
-.macro IHDM_4x4_TOTAL_16BITS
-// { // input: each src_d[0]~[3](dst), working_q0, working_q1
- uzp2 $1.4s, $0.4s, $0.4s
- uzp1 $0.4s, $0.4s, $0.4s
- add $2.8h, $0.8h, $1.8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
- sub $1.8h, $0.8h, $1.8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
- zip1 $2.8h, $2.8h, $1.8h // [0] = rs[0] + rs[2]; [1] = rs[0] - rs[2]; ... [2]; [3]
-
- uzp2 $1.4s, $2.4s, $2.4s
- uzp1 $0.4s, $2.4s, $2.4s
- add $2.8h, $0.8h, $1.8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
- sub $1.8h, $0.8h, $1.8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
- rev32 $1.4h, $1.4h // [0] = rs[1] - rs[3];[1] = rs[0] - rs[2];[2] = rs[5] - rs[7];[3] = rs[4] - rs[6];
- zip1 $0.4s, $2.4s, $1.4s
-// }
-.endm
-
-.macro MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2
-// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
- uzp1 $2.4s, $0.4s, $1.4s //[0 1 4 5]+[8 9 12 13]
- uzp2 $3.4s, $0.4s, $1.4s //[2 3 6 7]+[10 11 14 15]
-
- uzp1 $0.8h, $2.8h, $3.8h //[0 4 8 12]+[2 6 10 14]
- uzp2 $2.8h, $2.8h, $3.8h //[1 5 9 13]+[3 7 11 15]
- zip2 $1.2d, $0.2d, $2.2d //[2 6 10 14]+[3 7 11 15]
- zip1 $0.2d, $0.2d, $2.2d //[0 4 8 12]+[1 5 9 13]
-// }
-.endm
-
-.macro MATRIX_TRANSFORM_EACH_16BITS_OUT4
-// { // input & output: src_d[0]~[3];[0 4 8 12],[1 5 9 13],[2 6 10 14],[3 7 11 15]
- trn1 $4.8h, v0.8h, v1.8h
- trn2 $5.8h, v0.8h, v1.8h
- trn1 $6.8h, v2.8h, v3.8h
- trn2 $7.8h, v2.8h, v3.8h
-
- trn1 $0.4s, v4.4s, v6.4s
- trn2 $2.4s, v4.4s, v6.4s
- trn1 $1.4s, v5.4s, v7.4s
- trn2 $3.4s, v5.4s, v7.4s
-// }
-.endm
-
-.macro MATRIX_TRANSFORM_EACH_16BITS_4x4_OUT2
-// { // input & output: src_d[0]~[3];[0 1 2 3],[4 5 6 7],[8 9 10 11],[12 13 14 15]
- mov $0.d[1], $1.d[0] //[0 1 2 3]+[4 5 6 7]
- mov $2.d[1], $3.d[0] //[8 9 10 11]+[12 13 14 15]
- uzp1 $1.4s, $0.4s, $2.4s //[0 1 4 5]+[8 9 12 13]
- uzp2 $3.4s, $0.4s, $2.4s //[2 3 6 7]+[10 11 14 15]
-
- uzp1 $0.8h, $1.8h, $3.8h //[0 4 8 12]+[2 6 10 14]
- uzp2 $2.8h, $1.8h, $3.8h //[1 5 9 13]+[3 7 11 15]
- zip2 $1.2d, $0.2d, $2.2d //[2 6 10 14]+[3 7 11 15]
- zip1 $0.2d, $0.2d, $2.2d //[0 4 8 12]+[1 5 9 13]
-// }
-.endm
-
-.macro LOAD_4x4_DATA_FOR_DCT
- ld1 {$0.s}[0], [$2], $3
- ld1 {$0.s}[1], [$2], $3
- ld1 {$0.s}[2], [$2], $3
- ld1 {$0.s}[3], [$2]
-
- ld1 {$1.s}[0], [$4], $5
- ld1 {$1.s}[1], [$4], $5
- ld1 {$1.s}[2], [$4], $5
- ld1 {$1.s}[3], [$4]
-.endm
-
-.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
-// { // input: src_d[0]~[3], working: [4]~[7]
- add $4.8h, $0.8h, $3.8h //int16 s[0] = data[i] + data[i3];
- sub $7.8h, $0.8h, $3.8h //int16 s[3] = data[i] - data[i3];
- add $5.8h, $1.8h, $2.8h //int16 s[1] = data[i1] + data[i2];
- sub $6.8h, $1.8h, $2.8h //int16 s[2] = data[i1] - data[i2];
-
- add $0.8h, $4.8h, $5.8h //int16 dct[i ] = s[0] + s[1];
- sub $2.8h, $4.8h, $5.8h //int16 dct[i2] = s[0] - s[1];
- shl $1.8h, $7.8h, #1
- shl $3.8h, $6.8h, #1
- add $1.8h, $1.8h, $6.8h //int16 dct[i1] = (s[3] << 1) + s[2];
- sub $3.8h, $7.8h, $3.8h //int16 dct[i3] = s[3] - (s[2] << 1);
-// }
-.endm
-
-.macro LOAD_8x4_DATA_FOR_DCT
-// { // input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
- ld1 {$0.d}[0], [$8], x2
- ld1 {$1.d}[0], [$8], x2
- ld1 {$2.d}[0], [$8], x2
- ld1 {$3.d}[0], [$8], x2
-
- ld1 {$4.d}[0], [$9], x4
- ld1 {$5.d}[0], [$9], x4
- ld1 {$6.d}[0], [$9], x4
- ld1 {$7.d}[0], [$9], x4
-// }
-.endm
-
-.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
-// { // input: src_d[0]~[3], output: e_d[0]~[3];
- add $4.8h, $0.8h, $2.8h //int16 e[i][0] = src[0] + src[2];
- sub $5.8h, $0.8h, $2.8h //int16 e[i][1] = src[0] - src[2];
- sshr $6.8h, $1.8h, #1
- sshr $7.8h, $3.8h, #1
- sub $6.8h, $6.8h, $3.8h //int16 e[i][2] = (src[1]>>1)-src[3];
- add $7.8h, $1.8h, $7.8h //int16 e[i][3] = src[1] + (src[3]>>1);
-// }
-.endm
-
-.macro TRANSFORM_TOTAL_16BITS // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- add $0.8h, $4.8h, $7.8h //int16 f[i][0] = e[i][0] + e[i][3];
- add $1.8h, $5.8h, $6.8h //int16 f[i][1] = e[i][1] + e[i][2];
- sub $2.8h, $5.8h, $6.8h //int16 f[i][2] = e[i][1] - e[i][2];
- sub $3.8h, $4.8h, $7.8h //int16 f[i][3] = e[i][0] - e[i][3];
-// }
-.endm
-
-.macro ROW_TRANSFORM_0_STEP
-// { // input: src_d[0]~[3], output: e_q[0]~[3];
- saddl $4.4s, $0.4h, $2.4h //int32 e[i][0] = src[0] + src[2];
- ssubl $5.4s, $0.4h, $2.4h //int32 e[i][1] = src[0] - src[2];
- ssubl $6.4s, $1.4h, $3.4h //int32 e[i][2] = src[1] - src[3];
- saddl $7.4s, $1.4h, $3.4h //int32 e[i][3] = src[1] + src[3];
-// }
-.endm
-
-.macro COL_TRANSFORM_0_STEP
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- add $4.4s, $0.4s, $2.4s //int32 e[0][j] = f[0][j] + f[2][j];
- sub $5.4s, $0.4s, $2.4s //int32 e[1][j] = f[0][j] - f[2][j];
- sub $6.4s, $1.4s, $3.4s //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- add $7.4s, $1.4s, $3.4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
-.endm
-
-.macro TRANSFORM_4BYTES // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- add $0.4s, $4.4s, $7.4s //int16 f[i][0] = e[i][0] + e[i][3];
- add $1.4s, $5.4s, $6.4s //int16 f[i][1] = e[i][1] + e[i][2];
- sub $2.4s, $5.4s, $6.4s //int16 f[i][2] = e[i][1] - e[i][2];
- sub $3.4s, $4.4s, $7.4s //int16 f[i][3] = e[i][0] - e[i][3];
-// }
-.endm
-
-.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
-// { // input: pred_d[0](output), dct_q0/1, working_q0/1;
- uxtl $3.8h, $0.8b
- uxtl2 $4.8h, $0.16b
- add $3.8h, $3.8h, $1.8h
- add $4.8h, $4.8h, $2.8h
- sqxtun $0.8b, $3.8h
- sqxtun2 $0.16b,$4.8h
-// }
-.endm
-#else
.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2
// { // input: coef_0 (identy to \arg3\() \arg4\()), coef_1(identy to \arg5\() \arg6\()), mask_q
cmeq \arg0\().8h, \arg0\().8h, #0
@@ -518,7 +277,6 @@
sqxtun2 \arg0\().16b,\arg4\().8h
// }
.endm
-#endif
WELS_ASM_AARCH64_FUNC_BEGIN WelsGetNoneZeroCount_AArch64_neon
ld1 {v0.8h, v1.8h}, [x0]
--- a/codec/processing/src/arm64/vaa_calc_aarch64_neon.S
+++ b/codec/processing/src/arm64/vaa_calc_aarch64_neon.S
@@ -33,29 +33,6 @@
#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"
-#ifdef __APPLE__
-.macro ABS_SUB_SUM_16BYTES
- ld1 {v0.16b}, [x0], x4
- ld1 {v1.16b}, [x1], x4
- uabal $0, v0.8b, v1.8b
- uabal2 $1, v0.16b,v1.16b
-.endm
-
-.macro ABS_SUB_SUM_8x16BYTES
- ld1 {v0.16b}, [x0], x4
- ld1 {v1.16b}, [x1], x4
- uabdl $0, v0.8b, v1.8b
- uabdl2 $1, v0.16b,v1.16b
-
- ABS_SUB_SUM_16BYTES $0, $1
- ABS_SUB_SUM_16BYTES $0, $1
- ABS_SUB_SUM_16BYTES $0, $1
- ABS_SUB_SUM_16BYTES $0, $1
- ABS_SUB_SUM_16BYTES $0, $1
- ABS_SUB_SUM_16BYTES $0, $1
- ABS_SUB_SUM_16BYTES $0, $1
-.endm
-#else
.macro ABS_SUB_SUM_16BYTES arg0, arg1
ld1 {v0.16b}, [x0], x4
ld1 {v1.16b}, [x1], x4
@@ -77,7 +54,6 @@
ABS_SUB_SUM_16BYTES \arg0, \arg1
ABS_SUB_SUM_16BYTES \arg0, \arg1
.endm
-#endif
/*
* void vaa_calc_sad_neon(uint8_t *cur_data, uint8_t *ref_data, int32_t pic_width, int32_t pic_height, int32_t pic_stride,