ref: 7751d756b0fea008ac5d67a72620a9717e0778b2
parent: ae0f6cc4652d5c1d0b5fb75b1bb84d8a060fc2eb
parent: 0995390c4a034ad16c83a52ea3a56513aeb6d8a6
author: zhilwang <zhilwang@cisco.com>
date: Sun Mar 29 21:02:02 EDT 2015
Merge pull request #1883 from mstorsjo/arm-asm-cleanup

Remove duplication in arm assembly
--- a/codec/common/arm/arm_arch_common_macro.S
+++ b/codec/common/arm/arm_arch_common_macro.S
@@ -36,6 +36,8 @@
#ifdef __APPLE__
+.text
+
.macro WELS_ASM_FUNC_BEGIN
.align 2
.arm
--- a/codec/common/arm/copy_mb_neon.S
+++ b/codec/common/arm/copy_mb_neon.S
@@ -31,46 +31,8 @@
*/
#ifdef HAVE_NEON
-.text
#include "arm_arch_common_macro.S"
-#ifdef __APPLE__
-.macro LOAD_ALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, src*, src_stride
- vld1.64 {$0}, [$4,:128], $5
- vld1.64 {$1}, [$4,:128], $5
- vld1.64 {$2}, [$4,:128], $5
- vld1.64 {$3}, [$4,:128], $5
-// }
-.endm
-
-.macro STORE_ALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, dst*, dst_stride
- vst1.64 {$0}, [$4,:128], $5
- vst1.64 {$1}, [$4,:128], $5
- vst1.64 {$2}, [$4,:128], $5
- vst1.64 {$3}, [$4,:128], $5
-// }
-.endm
-
-.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, src*, src_stride
- vld1.64 {$0}, [$4], $5
- vld1.64 {$1}, [$4], $5
- vld1.64 {$2}, [$4], $5
- vld1.64 {$3}, [$4], $5
-// }
-.endm
-
-.macro STORE_UNALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, dst*, dst_stride
- vst1.64 {$0}, [$4], $5
- vst1.64 {$1}, [$4], $5
- vst1.64 {$2}, [$4], $5
- vst1.64 {$3}, [$4], $5
-// }
-.endm
-#else
.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: \arg0~\arg3, src*, src_stride
vld1.64 {\arg0}, [\arg4,:128], \arg5
@@ -106,8 +68,6 @@
vst1.64 {\arg3}, [\arg4], \arg5
// }
.endm
-
-#endif
WELS_ASM_FUNC_BEGIN WelsCopy8x8_neon
--- a/codec/common/arm/deblocking_neon.S
+++ b/codec/common/arm/deblocking_neon.S
@@ -31,129 +31,9 @@
*/
#ifdef HAVE_NEON
-.text
#include "arm_arch_common_macro.S"
-#ifdef __APPLE__
-.macro JMP_IF_128BITS_IS_ZERO
- vorr.s16 $2, $0, $1
- vmov r3, r2, $2
- orr r3, r3, r2
- cmp r3, #0
-.endm
-
-.macro MASK_MATRIX
- vabd.u8 $6, $1, $2
- vcgt.u8 $6, $4, $6
-
- vabd.u8 $4, $0, $1
- vclt.u8 $4, $4, $5
- vand.u8 $6, $6, $4
-
- vabd.u8 $4, $3, $2
- vclt.u8 $4, $4, $5
- vand.u8 $6, $6, $4
-.endm
-
-
-.macro DIFF_LUMA_LT4_P1_Q1
- vmov.i8 $9, #128
- vrhadd.u8 $8, $2, $3
- vhadd.u8 $8, $0, $8
- vsub.s8 $8, $8, $9
- vsub.s8 $9, $1, $9
- vqsub.s8 $8, $8, $9
- vmax.s8 $8, $8, $5
- vmin.s8 $8, $8, $6
- vabd.u8 $9, $0, $2
- vclt.u8 $9, $9, $4
- vand.s8 $8, $8, $9
- vand.s8 $8, $8, $7
- vadd.u8 $8, $1, $8
- vabs.s8 $9, $9
-.endm
-
-.macro DIFF_LUMA_LT4_P0_Q0
- vsubl.u8 $5, $0, $3
- vsubl.u8 $6, $2, $1
- vshl.s16 $6, $6, #2
- vadd.s16 $5, $5, $6
- vqrshrn.s16 $4, $5, #3
-.endm
-
-.macro DIFF_LUMA_EQ4_P2P1P0
- vaddl.u8 q4, $1, $2
- vaddl.u8 q5, $3, $4
- vadd.u16 q5, q4, q5
-
- vaddl.u8 q4, $0, $1
- vshl.u16 q4, q4, #1
- vadd.u16 q4, q5, q4
-
- vrshrn.u16 $0, q5, #2
- vrshrn.u16 $7, q4, #3
-
- vshl.u16 q5, q5, #1
- vsubl.u8 q4, $5, $1
- vadd.u16 q5, q4,q5
-
- vaddl.u8 q4, $2, $5
- vaddw.u8 q4, q4, $2
- vaddw.u8 q4, q4, $3
-
- vrshrn.u16 d10,q5, #3
- vrshrn.u16 d8, q4, #2
- vbsl.u8 $6, d10, d8
-.endm
-
-.macro DIFF_LUMA_EQ4_MASK
- vmov $3, $2
- vbsl.u8 $3, $0, $1
-.endm
-
-.macro DIFF_CHROMA_EQ4_P0Q0
- vaddl.u8 $4, $0, $3
- vaddw.u8 $5, $4, $1
- vaddw.u8 $6, $4, $2
- vaddw.u8 $5, $5, $0
-
- vaddw.u8 $6, $6, $3
- vrshrn.u16 $7, $5, #2
- vrshrn.u16 $8, $6, #2
-.endm
-
-.macro LOAD_CHROMA_DATA_4
- vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
- vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
-.endm
-
-.macro STORE_CHROMA_DATA_4
- vst4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
- vst4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
-.endm
-
-.macro LOAD_LUMA_DATA_3
- vld3.u8 {$0[$6],$1[$6],$2[$6]}, [r2], r1
- vld3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
-.endm
-
-.macro STORE_LUMA_DATA_4
- vst4.u8 {$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1
- vst4.u8 {$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
-.endm
-
-.macro STORE_LUMA_DATA_3
- vst3.u8 {$0[$6],$1[$6],$2[$6]}, [r3], r1
- vst3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
-.endm
-
-.macro EXTRACT_DELTA_INTO_TWO_PART
- vcge.s8 $1, $0, #0
- vand $1, $0, $1
- vsub.s8 $0, $1, $0
-.endm
-#else
.macro JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2
vorr.s16 \arg2, \arg0, \arg1
vmov r3, r2, \arg2
@@ -270,7 +150,6 @@
vand \arg1, \arg0, \arg1
vsub.s8 \arg0, \arg1, \arg0
.endm
-#endif
WELS_ASM_FUNC_BEGIN DeblockLumaLt4V_neon
vpush {q4-q7}
@@ -842,100 +721,6 @@
vst1.64 {d0,d1,d2}, [r0]
WELS_ASM_FUNC_END
-#ifdef __APPLE__
-.macro BS_NZC_CHECK
- vld1.8 {d0,d1}, [$0]
- /* Arrenge the input data --- TOP */
- ands r6, $1, #2
- beq bs_nzc_check_jump0
-
- sub r6, $0, $2, lsl #4
- sub r6, r6, $2, lsl #3
- add r6, #12
- vld1.32 d3[1], [r6]
-
-bs_nzc_check_jump0:
- vext.8 q1, q1, q0, #12
- vadd.u8 $3, q0, q1
-
-
- /* Arrenge the input data --- LEFT */
- ands r6, $1, #1
- beq bs_nzc_check_jump1
-
- sub r6, $0, #21
- add r7, r6, #4
- vld1.8 d3[4], [r6]
- add r6, r7, #4
- vld1.8 d3[5], [r7]
- add r7, r6, #4
- vld1.8 d3[6], [r6]
- vld1.8 d3[7], [r7]
-
-bs_nzc_check_jump1:
- vzip.8 d0, d1
- vzip.8 d0, d1
- vext.8 q1, q1, q0, #12
- vadd.u8 $4, q0, q1
-.endm
-
-.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
- mov r6, #4
- vabd.s16 q8, $0, $1
- vabd.s16 q9, $1, $2
- vdup.s16 $0, r6
- vabd.s16 q10, $2, $3
- vabd.s16 q11, $3, $4
-
- vcge.s16 q8, $0
- vcge.s16 q9, $0
- vcge.s16 q10, $0
- vcge.s16 q11, $0
-
- vpadd.i16 d16, d16, d17
- vpadd.i16 d17, d18, d19
- vpadd.i16 d18, d20, d21
- vpadd.i16 d19, d22, d23
-
- vaddhn.i16 $5, q8, q8
- vaddhn.i16 $6, q9, q9
-.endm
-
-.macro BS_MV_CHECK
- vldm $0, {q0,q1,q2,q3}
-
- /* Arrenge the input data --- TOP */
- ands r6, $1, #2
- beq bs_mv_check_jump0
-
- sub r6, $0, $2, lsl #6
- add r6, #48
- vld1.8 {d8, d9}, [r6]
-
-bs_mv_check_jump0:
- BS_COMPARE_MV q4, q0, q1, q2, q3, $3, $4
-
- /* Arrenge the input data --- LEFT */
- ands r6, $1, #1
- beq bs_mv_check_jump1
-
- sub r6, $0, #52
- add r7, r6, #16
- vld1.32 d8[0], [r6]
- add r6, r7, #16
- vld1.32 d8[1], [r7]
- add r7, r6, #16
- vld1.32 d9[0], [r6]
- vld1.32 d9[1], [r7]
-
-bs_mv_check_jump1:
- vzip.32 q0, q2
- vzip.32 q1, q3
- vzip.32 q0, q1
- vzip.32 q2, q3
- BS_COMPARE_MV q4, q0, q1, q2, q3, $5, $6
-.endm
-#else
.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
vld1.8 {d0,d1}, [\arg0]
/* Arrenge the input data --- TOP */
@@ -1028,7 +813,6 @@
vzip.32 q2, q3
BS_COMPARE_MV q4, q0, q1, q2, q3, \arg5, \arg6
.endm
-#endif
WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon
--- a/codec/common/arm/expand_picture_neon.S
+++ b/codec/common/arm/expand_picture_neon.S
@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON
-.text
#include "arm_arch_common_macro.S"
--- a/codec/common/arm/intra_pred_common_neon.S
+++ b/codec/common/arm/intra_pred_common_neon.S
@@ -32,7 +32,6 @@
#ifdef HAVE_NEON
-.text
#include "arm_arch_common_macro.S"
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon
--- a/codec/common/arm/mc_neon.S
+++ b/codec/common/arm/mc_neon.S
@@ -31,120 +31,8 @@
*/
#ifdef HAVE_NEON
-.text
#include "arm_arch_common_macro.S"
-#ifdef __APPLE__
-.macro AVERAGE_TWO_8BITS
-// { // input:dst_d, src_d A and B; working: q13
- vaddl.u8 q13, $2, $1
- vrshrn.u16 $0, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 $6, q12, #5
-// }
-.endm
-
-.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
-// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
- vrev64.8 $2, $0 // X[5][4][3][2][1][0]O
- vaddl.u8 $3, $0, $2 // each 16bits, *[50][41][32][23][14][05]*
- vmul.s16 $0, $2, $1 // 0+1*[50]-5*[41]+20[32]
- vpadd.s16 $0, $0, $0
- vpadd.s16 $0, $0, $0
- vqrshrun.s16 $0, $4, #5
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 $6, q12, #5
- vaddl.u8 q13, $2, $6
- vrshrn.u16 $6, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
- vaddl.u8 q12, $0, $5 //q12=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 q12, q13, $7 //q12 += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 q12, q13, $8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
- vqrshrun.s16 $6, q12, #5
- vaddl.u8 q13, $3, $6
- vrshrn.u16 $6, q13, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_TO_16BITS
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
- vaddl.u8 $6, $0, $5 //dst_q=src[-2]+src[3]
- vaddl.u8 q13, $2, $3 //src[0]+src[1]
- vmla.u16 $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles
- vaddl.u8 q13, $1, $4 //src[-1]+src[2]
- vmls.s16 $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
-.endm
-
-.macro FILTER_3_IN_16BITS_TO_8BITS
-// { // input:a, b, c, dst_d;
- vsub.s16 $0, $0, $1 //a-b
- vshr.s16 $0, $0, #2 //(a-b)/4
- vsub.s16 $0, $0, $1 //(a-b)/4-b
- vadd.s16 $0, $0, $2 //(a-b)/4-b+c
- vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
- vadd.s16 $0, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 $3, $0, #6 //(+32)>>6
-// }
-.endm
-
-.macro UNPACK_2_16BITS_TO_ABC
-// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
- vext.16 $4, $0, $1, #2 //src[0]
- vext.16 $3, $0, $1, #3 //src[1]
- vadd.s16 $4, $3 //c=src[0]+src[1]
-
- vext.16 $3, $0, $1, #1 //src[-1]
- vext.16 $2, $0, $1, #4 //src[2]
- vadd.s16 $3, $2 //b=src[-1]+src[2]
-
- vext.16 $2, $0, $1, #5 //src[3]
- vadd.s16 $2, $0 //a=src[-2]+src[3]
-// }
-.endm
-
-.macro UNPACK_1_IN_8x16BITS_TO_8BITS
-// { // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
- vext.16 $3, $3, $3, #7 // 0x????, [0][1][2][3][4][5],
- vrev64.16 $1, $1
- vadd.u16 $2, $1 // C[2+3],B[1+4],A[0+5],
- vshr.s64 $1, $2, #16
- vshr.s64 $0, $2, #32 // Output: C $2, B $1, A $0
-
- vsub.s16 $0, $0, $1 //a-b
- vshr.s16 $0, $0, #2 //(a-b)/4
- vsub.s16 $0, $0, $1 //(a-b)/4-b
- vadd.s16 $0, $0, $2 //(a-b)/4-b+c
- vshr.s16 $0, $0, #2 //((a-b)/4-b+c)/4
- vadd.s16 $1, $0, $2 //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- vqrshrun.s16 $0, $3, #6 //(+32)>>6
-// }
-.endm
-#else
.macro AVERAGE_TWO_8BITS arg0, arg1, arg2
// { // input:dst_d, src_d A and B; working: q13
vaddl.u8 q13, \arg2, \arg1
@@ -163,7 +51,7 @@
// }
.endm
-.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5 // when width=17/9, used
+.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4 // when width=17/9, used
// { // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
vrev64.8 \arg2, \arg0 // X[5][4][3][2][1][0]O
vaddl.u8 \arg3, \arg0, \arg2 // each 16bits, *[50][41][32][23][14][05]*
@@ -254,7 +142,6 @@
vqrshrun.s16 \arg0, \arg3, #6 //(+32)>>6
// }
.endm
-#endif
WELS_ASM_FUNC_BEGIN McHorVer20WidthEq16_neon
push {r4}
--- a/codec/common/arm64/arm_arch64_common_macro.S
+++ b/codec/common/arm64/arm_arch64_common_macro.S
@@ -32,6 +32,8 @@
#ifdef __APPLE__
+.text
+
.macro WELS_ASM_AARCH64_FUNC_BEGIN
.align 2
.globl _$0
--- a/codec/common/arm64/copy_mb_aarch64_neon.S
+++ b/codec/common/arm64/copy_mb_aarch64_neon.S
@@ -31,83 +31,8 @@
*/
#ifdef HAVE_NEON_AARCH64
-.text
#include "arm_arch64_common_macro.S"
-#ifdef __APPLE__
-.macro LOAD_ALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, src*, src_stride
- ld1 {$0.d}[0], [$4], $5
- ld1 {$1.d}[0], [$4], $5
- ld1 {$2.d}[0], [$4], $5
- ld1 {$3.d}[0], [$4], $5
-// }
-.endm
-
-.macro STORE_ALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, dst*, dst_stride
- st1 {$0.d}[0], [$4], $5
- st1 {$1.d}[0], [$4], $5
- st1 {$2.d}[0], [$4], $5
- st1 {$3.d}[0], [$4], $5
-// }
-.endm
-
-.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, src*, src_stride
- ld1 {$0.8b}, [$4], $5
- ld1 {$1.8b}, [$4], $5
- ld1 {$2.8b}, [$4], $5
- ld1 {$3.8b}, [$4], $5
-// }
-.endm
-
-.macro STORE_UNALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, dst*, dst_stride
- st1 {$0.8b}, [$4], $5
- st1 {$1.8b}, [$4], $5
- st1 {$2.8b}, [$4], $5
- st1 {$3.8b}, [$4], $5
-// }
-.endm
-
-.macro LOAD16_ALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, src*, src_stride
- ld1 {$0.2d}, [$4], $5
- ld1 {$1.2d}, [$4], $5
- ld1 {$2.2d}, [$4], $5
- ld1 {$3.2d}, [$4], $5
-// }
-.endm
-
-.macro STORE16_ALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, dst*, dst_stride
- st1 {$0.2d}, [$4], $5
- st1 {$1.2d}, [$4], $5
- st1 {$2.2d}, [$4], $5
- st1 {$3.2d}, [$4], $5
-// }
-.endm
-
-.macro LOAD16_UNALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, src*, src_stride
- ld1 {$0.16b}, [$4], $5
- ld1 {$1.16b}, [$4], $5
- ld1 {$2.16b}, [$4], $5
- ld1 {$3.16b}, [$4], $5
-// }
-.endm
-
-.macro STORE16_UNALIGNED_DATA_WITH_STRIDE
-// { // input: $0~$3, dst*, dst_stride
- st1 {$0.16b}, [$4], $5
- st1 {$1.16b}, [$4], $5
- st1 {$2.16b}, [$4], $5
- st1 {$3.16b}, [$4], $5
-// }
-.endm
-
-#else
.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: $0~$3, src*, src_stride
ld1 {\arg0\().d}[0], [\arg4], \arg5
@@ -179,8 +104,6 @@
st1 {\arg3\().16b}, [\arg4], \arg5
// }
.endm
-
-#endif
WELS_ASM_AARCH64_FUNC_BEGIN WelsCopy8x8_AArch64_neon
--- a/codec/common/arm64/deblocking_aarch64_neon.S
+++ b/codec/common/arm64/deblocking_aarch64_neon.S
@@ -31,269 +31,9 @@
*/
#ifdef HAVE_NEON_AARCH64
-.text
#include "arm_arch64_common_macro.S"
-#ifdef __APPLE__
-.macro MASK_MATRIX
- uabd $6.16b, $1.16b, $2.16b
- cmhi $6.16b, $4.16b, $6.16b
-
- uabd $4.16b, $0.16b, $1.16b
- cmhi $4.16b, $5.16b, $4.16b
- and $6.16b, $6.16b, $4.16b
-
- uabd $4.16b, $3.16b, $2.16b
- cmhi $4.16b, $5.16b, $4.16b
- and $6.16b, $6.16b, $4.16b
-.endm
-
-.macro DIFF_LUMA_LT4_P1_Q1 //(Use Tmp v23, v24)
- //v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
- urhadd $8.16b, $2.16b, $3.16b
- uhadd $8.16b, $0.16b, $8.16b
- usubl $9.8h, $8.8b, $1.8b
- sqxtn $9.8b, $9.8h
- usubl2 $8.8h, $8.16b, $1.16b
- sqxtn2 $9.16b, $8.8h
- smax $8.16b, $9.16b, $5.16b
-//
- smin $8.16b, $8.16b, $6.16b
- uabd $9.16b, $0.16b, $2.16b
- cmhi $9.16b, $4.16b, $9.16b
- and $8.16b, $8.16b, $9.16b
- and $8.16b, $8.16b, $7.16b
- add $8.16b, $1.16b, $8.16b
- abs $9.16b, $9.16b
-.endm
-
-.macro DIFF_LUMA_LT4_P0_Q0_1
- usubl $5.8h, $0.8b, $3.8b
- usubl $6.8h, $2.8b, $1.8b
- shl $6.8h, $6.8h, #2
- add $5.8h, $5.8h, $6.8h
- sqrshrn $4.8b, $5.8h, #3
-.endm
-
-.macro DIFF_LUMA_LT4_P0_Q0_2
- usubl2 $5.8h, $0.16b, $3.16b
- usubl2 $6.8h, $2.16b, $1.16b
- shl $6.8h, $6.8h, #2
- add $5.8h, $5.8h, $6.8h
- sqrshrn2 $4.16b, $5.8h, #3
-.endm
-
-.macro EXTRACT_DELTA_INTO_TWO_PART
- cmge $1.16b, $0.16b, #0
- and $1.16b, $0.16b, $1.16b
- sub $0.16b, $1.16b, $0.16b
-.endm
-
-.macro DIFF_LUMA_EQ4_P2P1P0_1
- uaddl $8.8h, $1.8b, $2.8b
- uaddl $9.8h, $3.8b, $4.8b
- add $9.8h, $9.8h, $8.8h
-
- uaddl $8.8h, $0.8b, $1.8b
- shl $8.8h, $8.8h, #1
- add $8.8h, $9.8h, $8.8h
-
- rshrn $0.8b, $9.8h, #2
- rshrn $7.8b, $8.8h, #3
- shl $9.8h, $9.8h, #1
- usubl $8.8h, $5.8b, $1.8b
- add $9.8h, $8.8h, $9.8h
-
- uaddl $8.8h, $2.8b, $5.8b
- uaddw $8.8h, $8.8h, $2.8b
- uaddw $8.8h, $8.8h, $3.8b
-
- rshrn $9.8b, $9.8h, #3
- rshrn $8.8b, $8.8h, #2
- bsl $6.8b, $9.8b, $8.8b
-.endm
-
-.macro DIFF_LUMA_EQ4_P2P1P0_2
- uaddl2 $8.8h, $1.16b, $2.16b
- uaddl2 $9.8h, $3.16b, $4.16b
- add $9.8h, $9.8h, $8.8h
-
- uaddl2 $8.8h, $0.16b, $1.16b
- shl $8.8h, $8.8h, #1
- add $8.8h, $9.8h, $8.8h
-
- rshrn2 $0.16b, $9.8h, #2
- rshrn2 $7.16b, $8.8h, #3
- shl $9.8h, $9.8h, #1
- usubl2 $8.8h, $5.16b, $1.16b
- add $9.8h, $8.8h, $9.8h
-
- uaddl2 $8.8h, $2.16b, $5.16b
- uaddw2 $8.8h, $8.8h, $2.16b
- uaddw2 $8.8h, $8.8h, $3.16b
-
- rshrn2 $9.16b, $9.8h, #3
- rshrn2 $8.16b, $8.8h, #2
- bsl $6.16b, $9.16b, $8.16b
-.endm
-
-
-.macro DIFF_CHROMA_EQ4_P0Q0_1
- uaddl $4.8h, $0.8b, $3.8b
- shl $4.8h, $4.8h, #1
- usubl $5.8h, $1.8b, $3.8b
- add $5.8h, $5.8h, $4.8h
- rshrn $6.8b, $5.8h, #2
- usubl $5.8h, $2.8b, $0.8b
- add $5.8h, $5.8h, $4.8h
- rshrn $7.8b, $5.8h, #2
-.endm
-
-.macro DIFF_CHROMA_EQ4_P0Q0_2
- uaddl2 $4.8h, $0.16b, $3.16b
- shl $4.8h, $4.8h, #1
- usubl2 $5.8h, $1.16b, $3.16b
- add $5.8h, $5.8h, $4.8h
- rshrn2 $6.16b, $5.8h, #2
- usubl2 $5.8h, $2.16b, $0.16b
- add $5.8h, $5.8h, $4.8h
- rshrn2 $7.16b, $5.8h, #2
-.endm
-
-.macro DIFF_LUMA_EQ4_MASK
- mov $3.16b, $2.16b
- bsl $3.16b, $0.16b, $1.16b
-.endm
-
-.macro LOAD_LUMA_DATA_3
- ld3 {$0.b, $1.b, $2.b} [$6], [x2], x1
- ld3 {$3.b, $4.b, $5.b} [$6], [x0], x1
-.endm
-
-.macro LOAD_LUMA_DATA_4
- ld4 {$0.b, $1.b, $2.b, $3.b} [$8], [x3], x1
- ld4 {$4.b, $5.b, $6.b, $7.b} [$8], [x0], x1
-.endm
-
-.macro STORE_LUMA_DATA_4
- st4 {$0.b, $1.b, $2.b, $3.b} [$4], [x0], x1
- st4 {$0.b, $1.b, $2.b, $3.b} [$5], [x2], x1
-.endm
-
-.macro STORE_LUMA_DATA_3
- st3 {$0.b, $1.b, $2.b} [$6], [x3], x1
- st3 {$3.b, $4.b, $5.b} [$6], [x0], x1
-.endm
-
-.macro LOAD_CHROMA_DATA_4
- ld4 {$0.b, $1.b, $2.b, $3.b} [$5], [$4], x2
-.endm
-
-.macro STORE_CHROMA_DATA_2
- st2 {$0.b, $1.b} [$3], [$2], x2
-.endm
-
-.macro ZERO_JUMP_END
- mov $1, $0.d[0]
- mov $2, $0.d[1]
- orr $1, $1, $2
- cbz $1, $3
-.endm
-
-.macro BS_NZC_CHECK
- ld1 {v0.16b}, [$0]
- //Arrange the input data --- TOP
- ands x6, $1, #2
- cbz x6, bs_nzc_check_jump0
- sub x6, $0, $2, lsl #4
- sub x6, x6, $2, lsl #3
- add x6, x6, #12
- ld1 {v1.s} [3], [x6]
-
- bs_nzc_check_jump0:
- ext v1.16b, v1.16b, v0.16b, #12
- add $3.16b, v0.16b, v1.16b
-
- // Arrange the input data --- LEFT
- ands x6, $1, #1
- cbz x6, bs_nzc_check_jump1
-
- sub x6, $0, #21
- add x7, x6, #4
- ld1 {v1.b} [12], [x6]
- add x6, x7, #4
- ld1 {v1.b} [13], [x7]
- add x7, x6, #4
- ld1 {v1.b} [14], [x6]
- ld1 {v1.b} [15], [x7]
-
-bs_nzc_check_jump1:
- ins v2.d[0], v0.d[1]
- zip1 v0.16b, v0.16b, v2.16b
- ins v2.d[0], v0.d[1]
- zip1 v0.16b, v0.16b, v2.16b
- ext v1.16b, v1.16b, v0.16b, #12
- add $4.16b, v0.16b, v1.16b
-.endm
-
-.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5
- mov w6, #4
- sabd v20.8h, $0.8h, $1.8h
- sabd v21.8h, $1.8h, $2.8h
- dup $0.8h, w6
- sabd v22.8h, $2.8h, $3.8h
- sabd v23.8h, $3.8h, $4.8h
-
- cmge v20.8h, v20.8h, $0.8h
- cmge v21.8h, v21.8h, $0.8h
- cmge v22.8h, v22.8h, $0.8h
- cmge v23.8h, v23.8h, $0.8h
-
- addp v20.8h, v20.8h, v21.8h
- addp v21.8h, v22.8h, v23.8h
-
- addhn $5.8b, v20.8h, v20.8h
- addhn2 $5.16b, v21.8h, v21.8h
-.endm
-
-.macro BS_MV_CHECK
- ldp q0, q1, [$0], #32
- ldp q2, q3, [$0]
- sub $0, $0, #32
- // Arrenge the input data --- TOP
- ands x6, $1, #2
- cbz x6, bs_mv_check_jump0
- sub x6, $0, $2, lsl #6
- add x6, x6, #48
- ld1 {v4.16b}, [x6]
-bs_mv_check_jump0:
- BS_COMPARE_MV v4, v0, v1, v2, v3, $3
- // Arrange the input data --- LEFT
- ands x6, $1, #1
- cbz x6, bs_mv_check_jump1
- sub x6, $0, #52
- add x7, x6, #16
- ld1 {v4.s} [0], [x6]
- add x6, x7, #16
- ld1 {v4.s} [1], [x7]
- add x7, x6, #16
- ld1 {v4.s} [2], [x6]
- ld1 {v4.s} [3], [x7]
-bs_mv_check_jump1:
- zip1 $5.4s, v0.4s, v2.4s
- zip2 $6.4s, v0.4s, v2.4s
- zip1 v0.4s, v1.4s, v3.4s
- zip2 v2.4s, v1.4s, v3.4s
- zip2 v1.4s, $5.4s, v0.4s
- zip1 v0.4s, $5.4s, v0.4s
- zip2 v3.4s, $6.4s, v2.4s
- zip1 v2.4s, $6.4s, v2.4s
- BS_COMPARE_MV v4, v0, v1, v2, v3, $4
-.endm
-
-#else
-
.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
uabd \arg6\().16b, \arg1\().16b, \arg2\().16b
cmhi \arg6\().16b, \arg4\().16b, \arg6\().16b
@@ -550,7 +290,6 @@
zip1 v2.4s, \arg6\().4s, v2.4s
BS_COMPARE_MV v4, v0, v1, v2, v3, \arg4
.endm
-#endif
WELS_ASM_AARCH64_FUNC_BEGIN WelsNonZeroCount_AArch64_neon
mov w1, #1
--- a/codec/common/arm64/expand_picture_aarch64_neon.S
+++ b/codec/common/arm64/expand_picture_aarch64_neon.S
@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
-.text
#include "arm_arch64_common_macro.S"
WELS_ASM_AARCH64_FUNC_BEGIN ExpandPictureLuma_AArch64_neon
--- a/codec/common/arm64/intra_pred_common_aarch64_neon.S
+++ b/codec/common/arm64/intra_pred_common_aarch64_neon.S
@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
-.text
#include "arm_arch64_common_macro.S"
//for Luma 16x16
--- a/codec/common/arm64/mc_aarch64_neon.S
+++ b/codec/common/arm64/mc_aarch64_neon.S
@@ -31,186 +31,10 @@
*/
#ifdef HAVE_NEON_AARCH64
-.text
#include "arm_arch64_common_macro.S"
.align 4
filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
-#ifdef __APPLE__
-
-.macro FILTER_6TAG_8BITS1
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
- uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
- uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
- mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
- uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
- mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
- sqrshrun $6.8b, v18.8h, #5
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS2
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
- uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
- mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
- uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
- mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
- sqrshrun2 $6.16b, v18.8h, #5
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
- uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
- uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
- mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
- uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
- mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
- sqrshrun $6.8b, v18.8h, #5
- uaddl v19.8h, $2.8b, $6.8b
- rshrn $6.8b, v19.8h, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
- uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
- mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
- uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
- mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
- sqrshrun2 $6.16b, v18.8h, #5
- uaddl2 v19.8h, $2.16b, $6.16b
- rshrn2 $6.16b, v19.8h, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
- uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
- uaddl v19.8h, $2.8b, $3.8b //src[0]+src[1]
- mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
- uaddl v19.8h, $1.8b, $4.8b //src[-1]+src[2]
- mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
- sqrshrun $6.8b, v18.8h, #5
- uaddl v19.8h, $3.8b, $6.8b
- rshrn $6.8b, v19.8h, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1
-// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
- uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, $2.16b, $3.16b //src[0]+src[1]
- mla v18.8h, v19.8h, $7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
- uaddl2 v19.8h, $1.16b, $4.16b //src[-1]+src[2]
- mls v18.8h, v19.8h, $8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
- sqrshrun2 $6.16b, v18.8h, #5
- uaddl2 v19.8h, $3.16b, $6.16b
- rshrn2 $6.16b, v19.8h, #1
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_TO_16BITS1
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
- uaddl $6.8h, $0.8b, $5.8b //dst_q=src[-2]+src[3]
- uaddl v31.8h, $2.8b, $3.8b //src[0]+src[1]
- mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
- uaddl v31.8h, $1.8b, $4.8b //src[-1]+src[2]
- mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
-.endm
-
-.macro FILTER_6TAG_8BITS_TO_16BITS2
-// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
- uaddl2 $6.8h, $0.16b, $5.16b //dst_q=src[-2]+src[3]
- uaddl2 v31.8h, $2.16b, $3.16b //src[0]+src[1]
- mla $6.8h, v31.8h, $7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
- uaddl2 v31.8h, $1.16b, $4.16b //src[-1]+src[2]
- mls $6.8h, v31.8h, $8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-// }
-.endm
-
-.macro FILTER_3_IN_16BITS_TO_8BITS1
-// { // input:a, b, c, dst_d;
- sub $0.8h, $0.8h, $1.8h //a-b
- sshr $0.8h, $0.8h, #2 //(a-b)/4
- sub $0.8h, $0.8h, $1.8h //(a-b)/4-b
- add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c
- sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4
- add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- sqrshrun $3.8b, $0.8h, #6 //(+32)>>6
-// }
-.endm
-
-.macro FILTER_3_IN_16BITS_TO_8BITS2
-// { // input:a, b, c, dst_d;
- sub $0.8h, $0.8h, $1.8h //a-b
- sshr $0.8h, $0.8h, #2 //(a-b)/4
- sub $0.8h, $0.8h, $1.8h //(a-b)/4-b
- add $0.8h, $0.8h, $2.8h //(a-b)/4-b+c
- sshr $0.8h, $0.8h, #2 //((a-b)/4-b+c)/4
- add $0.8h, $0.8h, $2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- sqrshrun2 $3.16b, $0.8h, #6 //(+32)>>6
-// }
-.endm
-
-.macro UNPACK_2_16BITS_TO_ABC
-// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
- ext $4.16b, $0.16b, $1.16b, #4 //src[0]
- ext $3.16b, $0.16b, $1.16b, #6 //src[1]
- add $4.8h, $4.8h, $3.8h //c=src[0]+src[1]
-
- ext $3.16b, $0.16b, $1.16b, #2 //src[-1]
- ext $2.16b, $0.16b, $1.16b, #8 //src[2]
- add $3.8h, $3.8h, $2.8h //b=src[-1]+src[2]
-
- ext $2.16b, $0.16b, $1.16b, #10 //src[3]
- add $2.8h, $2.8h, $0.8h //a=src[-2]+src[3]
-// }
-.endm
-
-.macro AVERAGE_TWO_8BITS1
-// { // input:dst_d, src_d A and B; working: v5
- uaddl v30.8h, $2.8b, $1.8b
- rshrn $0.8b, v30.8h, #1
-// }
-.endm
-
-.macro AVERAGE_TWO_8BITS2
-// { // input:dst_d, src_d A and B; working: v5
- uaddl2 v30.8h, $2.16b, $1.16b
- rshrn2 $0.16b, v30.8h, #1
-// }
-.endm
-
-.macro FILTER_SINGLE_TAG_8BITS // when width=17/9, used
-// { // input: src_d{Y[0][1][2][3][4][5]X},
- rev64 $2.8b, $0.8b // X[5][4][3][2][1][0]O
- uaddl $2.8h, $0.8b, $2.8b // each 16bits, *[50][41][32][23][14][05]*
- mul $2.4h, $2.4h, $1.4h // 0+1*[50]-5*[41]+20[32]
- addv $3, $2.4h
- sqrshrun $0.8b, $0.8h, #5
-// }
-.endm
-
-.macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
-// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
- ext $3.16b, $1.16b, $1.16b, #14 // X[0][1][2][3][4][5]O
- ext $4.16b, $3.16b, $3.16b, #8 // [3][4][5]OX[0][1][2]
- rev64 $4.8h, $4.8h // X[5][4][3][2][1][0]O
- add $3.8h, $3.8h, $4.8h // each 16bits, *[50][41][32][23][14][05]*
- smull $3.4s, $3.4h, $2.4h // 0+1*[50]-5*[41]+20[32]
- saddlv $5, $3.4s
- //sshr $0.2d, $0.2d, #4
- sqrshrun $0.2s, $0.2d, #10
- uqxtn $0.4h, $0.4s
- uqxtn $0.8b, $0.8h
- // }
-.endm
-
-#else
.macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
@@ -383,7 +207,6 @@
uqxtn \arg0\().8b, \arg0\().8h
// }
.endm
-#endif
//(const uint8_t* pSrc {x0}, int32_t iSrcStride{x1}, uint8_t* pDst{x2}, int32_t iDstStride{x3}, int32_t iHeight{x4})
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20WidthEq16_AArch64_neon
--- a/codec/decoder/core/arm/block_add_neon.S
+++ b/codec/decoder/core/arm/block_add_neon.S
@@ -31,43 +31,8 @@
*/
#ifdef HAVE_NEON
-.text
#include "arm_arch_common_macro.S"
-#ifdef __APPLE__
-.macro ROW_TRANSFORM_1_STEP
-// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
- vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
- vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
- vshr.s16 $8, $1, #1
- vshr.s16 $9, $3, #1
- vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
- vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
-// }
-.endm
-
-.macro TRANSFORM_4BYTES // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
- vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
- vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
- vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
-// }
-.endm
-
-.macro COL_TRANSFORM_1_STEP
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
- vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
- vshr.s32 $6, $1, #1
- vshr.s32 $7, $3, #1
- vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
-.endm
-
-#else
-
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
@@ -98,7 +63,6 @@
vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
-#endif
// uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
--- a/codec/decoder/core/arm/intra_pred_neon.S
+++ b/codec/decoder/core/arm/intra_pred_neon.S
@@ -32,23 +32,9 @@
#ifdef HAVE_NEON
//Global macro
-.text
#include "arm_arch_common_macro.S"
-#ifdef __APPLE__
//Global macro
-.macro GET_8BYTE_DATA
- vld1.8 {$0[0]}, [$1], $2
- vld1.8 {$0[1]}, [$1], $2
- vld1.8 {$0[2]}, [$1], $2
- vld1.8 {$0[3]}, [$1], $2
- vld1.8 {$0[4]}, [$1], $2
- vld1.8 {$0[5]}, [$1], $2
- vld1.8 {$0[6]}, [$1], $2
- vld1.8 {$0[7]}, [$1], $2
-.endmacro
-#else
-//Global macro
.macro GET_8BYTE_DATA arg0, arg1, arg2
vld1.8 {\arg0[0]}, [\arg1], \arg2
vld1.8 {\arg0[1]}, [\arg1], \arg2
@@ -59,7 +45,6 @@
vld1.8 {\arg0[6]}, [\arg1], \arg2
vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm
-#endif
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon
--- a/codec/decoder/core/arm64/block_add_aarch64_neon.S
+++ b/codec/decoder/core/arm64/block_add_aarch64_neon.S
@@ -31,42 +31,8 @@
*/
#ifdef HAVE_NEON_AARCH64
-.text
#include "arm_arch64_common_macro.S"
-#ifdef __APPLE__
-.macro ROW_TRANSFORM_1_STEP
-// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
- saddl $4.4s, $0.4h, $2.4h //int32 e[i][0] = src[0] + src[2];
- ssubl $5.4s, $0.4h, $2.4h //int32 e[i][1] = src[0] - src[2];
- sshr $8.4h, $1.4h, #1
- sshr $9.4h, $3.4h, #1
- ssubl $6.4s, $8.4h, $3.4h //int32 e[i][2] = (src[1]>>1)-src[3];
- saddl $7.4s, $1.4h, $9.4h //int32 e[i][3] = src[1] + (src[3]>>1);
-// }
-.endm
-
-.macro TRANSFORM_4BYTES // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- add $0.4s, $4.4s, $7.4s //int16 f[i][0] = e[i][0] + e[i][3];
- add $1.4s, $5.4s, $6.4s //int16 f[i][1] = e[i][1] + e[i][2];
- sub $2.4s, $5.4s, $6.4s //int16 f[i][2] = e[i][1] - e[i][2];
- sub $3.4s, $4.4s, $7.4s //int16 f[i][3] = e[i][0] - e[i][3];
-// }
-.endm
-
-.macro COL_TRANSFORM_1_STEP
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- add $4.4s, $0.4s, $2.4s //int32 e[0][j] = f[0][j] + f[2][j];
- sub $5.4s, $0.4s, $2.4s //int32 e[1][j] = f[0][j] - f[2][j];
- sshr $6.4s, $1.4s, #1
- sshr $7.4s, $3.4s, #1
- sub $6.4s, $6.4s, $3.4s //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- add $7.4s, $1.4s, $7.4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
-.endm
-
-#else
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8\() \arg9\()
@@ -99,7 +65,6 @@
add \arg7\().4s, \arg1\().4s, \arg7\().4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
-#endif
// uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_AARCH64_FUNC_BEGIN IdctResAddPred_AArch64_neon
--- a/codec/decoder/core/arm64/intra_pred_aarch64_neon.S
+++ b/codec/decoder/core/arm64/intra_pred_aarch64_neon.S
@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
-.text
#include "arm_arch64_common_macro.S"
// for Luma 4x4
--- a/codec/encoder/core/arm/intra_pred_neon.S
+++ b/codec/encoder/core/arm/intra_pred_neon.S
@@ -31,24 +31,10 @@
*/
#ifdef HAVE_NEON
-.text
#include "arm_arch_common_macro.S"
-#ifdef __APPLE__
//Global macro
-.macro GET_8BYTE_DATA
- vld1.8 {$0[0]}, [$1], $2
- vld1.8 {$0[1]}, [$1], $2
- vld1.8 {$0[2]}, [$1], $2
- vld1.8 {$0[3]}, [$1], $2
- vld1.8 {$0[4]}, [$1], $2
- vld1.8 {$0[5]}, [$1], $2
- vld1.8 {$0[6]}, [$1], $2
- vld1.8 {$0[7]}, [$1], $2
-.endm
-#else
-//Global macro
.macro GET_8BYTE_DATA arg0, arg1, arg2
vld1.8 {\arg0[0]}, [\arg1], \arg2
vld1.8 {\arg0[1]}, [\arg1], \arg2
@@ -59,7 +45,6 @@
vld1.8 {\arg0[6]}, [\arg1], \arg2
vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm
-#endif
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon
--- a/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
+++ b/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
@@ -31,71 +31,10 @@
*/
#ifdef HAVE_NEON
-.text
#include "arm_arch_common_macro.S"
-#ifdef __APPLE__
//The data sequence will be used
-.macro GET_8BYTE_DATA_L0
- vld1.8 {$0[0]}, [$1], $2
- vld1.8 {$0[1]}, [$1], $2
- vld1.8 {$0[2]}, [$1], $2
- vld1.8 {$0[3]}, [$1], $2
- vld1.8 {$0[4]}, [$1], $2
- vld1.8 {$0[5]}, [$1], $2
- vld1.8 {$0[6]}, [$1], $2
- vld1.8 {$0[7]}, [$1], $2
-.endm
-
-
-.macro HDM_TRANSFORM_4X4_L0
-
- //Do the vertical transform
- vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13}
- vsubl.u8 q1, $0, $1 //{2,6,10,14,3,7,11,15}
- vswp d1, d2
- vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
- vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
-
- //Do the horizontal transform
- vtrn.32 q2, q1
- vadd.s16 q0, q2, q1
- vsub.s16 q1, q2, q1
-
- vtrn.16 q0, q1
- vadd.s16 q2, q0, q1
- vsub.s16 q1, q0, q1
-
- vmov.s16 d0, d4
- vmov.s16 d1, d2
-
- vabs.s16 d3, d3
-
- //16x16_v
- vtrn.32 d0, d1 //{0,1,3,2}
- vaba.s16 $5, d0, $2 //16x16_v
- vaba.s16 $5, d1, $8
- vaba.s16 $5, d5, $8
- vadd.u16 $5, d3
-
- //16x16_h
- vtrn.16 d4, d5 //{0,4,12,8}
- vaba.s16 $6, d4, $3 //16x16_h
- vabs.s16 d2, d2
- vabs.s16 d5, d5
- vadd.u16 d2, d3
- vadd.u16 d2, d5
- vadd.u16 $6, d2
-
- //16x16_dc_both
- vaba.s16 $7, d4, $4 //16x16_dc_both
- vadd.u16 $7, d2
-
-.endm
-
-#else
- //The data sequence will be used
.macro GET_8BYTE_DATA_L0 arg0, arg1, arg2
vld1.8 {\arg0[0]}, [\arg1], \arg2
vld1.8 {\arg0[1]}, [\arg1], \arg2
@@ -150,7 +89,6 @@
vaba.s16 \arg7, d4, \arg4 //16x16_dc_both
vadd.u16 \arg7, d2
.endm
-#endif
WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Satd_neon
stmdb sp!, {r4-r7, lr}
--- a/codec/encoder/core/arm/memory_neon.S
+++ b/codec/encoder/core/arm/memory_neon.S
@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON
-.text
#include "arm_arch_common_macro.S"
--- a/codec/encoder/core/arm/pixel_neon.S
+++ b/codec/encoder/core/arm/pixel_neon.S
@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON
-.text
#include "arm_arch_common_macro.S"
.macro SATD_16x4
--- a/codec/encoder/core/arm/reconstruct_neon.S
+++ b/codec/encoder/core/arm/reconstruct_neon.S
@@ -31,254 +31,8 @@
*/
#ifdef HAVE_NEON
-.text
#include "arm_arch_common_macro.S"
-#ifdef __APPLE__
-.macro LOAD_4x4_DATA_FOR_DCT
-// { // input: $0~$3, src1*, src1_stride, src2*, src2_stride
- vld2.16 {$0[0],$1[0]}, [$4], $5
- vld2.16 {$2[0],$3[0]}, [$6], $7
- vld2.16 {$0[1],$1[1]}, [$4], $5
- vld2.16 {$2[1],$3[1]}, [$6], $7
-
- vld2.16 {$0[2],$1[2]}, [$4], $5
- vld2.16 {$2[2],$3[2]}, [$6], $7
- vld2.16 {$0[3],$1[3]}, [$4], $5
- vld2.16 {$2[3],$3[3]}, [$6], $7
-// }
-.endm
-
-.macro LOAD_8x8_DATA_FOR_DCT
-// { // input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
- vld1.64 {$0}, [$8], r2
- vld1.64 {$4}, [$9], r4
- vld1.64 {$1}, [$8], r2
- vld1.64 {$5}, [$9], r4
-
- vld1.64 {$2}, [$8], r2
- vld1.64 {$6}, [$9], r4
- vld1.64 {$3}, [$8], r2
- vld1.64 {$7}, [$9], r4
-// }
-.endm
-
-.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
-// { // input: src_d[0]~[3], working: [4]~[7]
- vadd.s16 $4, $0, $3 //int16 s[0] = data[i] + data[i3];
- vsub.s16 $7, $0, $3 //int16 s[3] = data[i] - data[i3];
- vadd.s16 $5, $1, $2 //int16 s[1] = data[i1] + data[i2];
- vsub.s16 $6, $1, $2 //int16 s[2] = data[i1] - data[i2];
-
- vadd.s16 $0, $4, $5 //int16 dct[i ] = s[0] + s[1];
- vsub.s16 $2, $4, $5 //int16 dct[i2] = s[0] - s[1];
- vshl.s16 $1, $7, #1
- vshl.s16 $3, $6, #1
- vadd.s16 $1, $1, $6 //int16 dct[i1] = (s[3] << 1) + s[2];
- vsub.s16 $3, $7, $3 //int16 dct[i3] = s[3] - (s[2] << 1);
-// }
-.endm
-
-.macro MATRIX_TRANSFORM_EACH_16BITS
-// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
- vtrn.s16 $0, $1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
- vtrn.s16 $2, $3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
- vtrn.32 $0, $2 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
- vtrn.32 $1, $3 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-// }
-.endm
-
-.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
-// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
- veor.s16 $6, $6 // init 0 , and keep 0;
- vaba.s16 $1, $0, $6 // f + abs(coef - 0)
- vmull.s16 $7, $2, $4
- vmull.s16 $8, $3, $5
- vshr.s32 $7, #16
- vshr.s32 $8, #16
- vmovn.s32 $2, $7
- vmovn.s32 $3, $8
-
- vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111
- vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 $6, #1
- vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x
-// }
-.endm
-
-.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, - coef; else , coef;
-// { // input: coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
- veor.s16 $6, $6 // init 0 , and keep 0;
- vaba.s16 $1, $0, $6 // f + abs(coef - 0)
- vmull.s16 $7, $2, $4
- vmull.s16 $8, $3, $5
- vshr.s32 $7, #16
- vshr.s32 $8, #16
- vmovn.s32 $2, $7
- vmovn.s32 $3, $8
-
- vcgt.s16 $7, $0, #0 // if true, location of coef == 11111111
- vbif.s16 $6, $1, $7 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 $6, #1
- vmax.s16 $9, $2, $3
- vsub.s16 $1, $1, $6 // if x > 0, -= 0; else x-= 2x
-// }
-.endm
-
-.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
-// { // input: coef, ff (dst), mf , working_d (all 0), working_q
- vaba.s16 $1, $0, $3 // f + abs(coef - 0)
- vmull.s16 $4, $1, $2 // *= mf
- vshr.s32 $4, #16
- vmovn.s32 $1, $4 // >> 16
-
- vcgt.s16 $2, $0, #0 // if true, location of coef == 11111111
- vbif.s16 $3, $1, $2 // if (x<0) reserved part; else keep 0 untouched
- vshl.s16 $3, #1
- vsub.s16 $1, $1, $3 // if x > 0, -= 0; else x-= 2x
-// }
-.endm
-
-.macro DC_ZERO_COUNT_IN_DUALWORD
-// { // input: coef, dst_d, working_d (all 0x01)
- vceq.s16 $1, $0, #0
- vand.s16 $1, $2
- vpadd.s16 $1, $1, $1
- vpadd.s16 $1, $1, $1
-// }
-.endm
-
-.macro SELECT_MAX_IN_ABS_COEF
-// { // input: coef_0, coef_1, max_q (identy to follow two)
- vmax.s16 $2, $0, $1 // max 1st in $3 & max 2nd in $4
- vpmax.s16 $3, $3, $4 // max 1st in $3[0][1] & max 2nd in $3[2][3]
- vpmax.s16 $3, $3, $4 // max 1st in $3[0][1]
-// }
-.endm
-
-.macro ZERO_COUNT_IN_2_QUARWORD
-// { // input: coef_0 (identy to $3 $4), coef_1(identy to $5 $6), mask_q
- vceq.s16 $0, #0
- vceq.s16 $1, #0
- vand.s16 $0, $2
- vand.s16 $1, $2
-
- vpadd.s16 $3, $3, $5
- vpadd.s16 $4, $4, $6
- vpadd.s16 $3, $3, $4 // 8-->4
- vpadd.s16 $3, $3, $3
- vpadd.s16 $3, $3, $3
-// }
-.endm
-
-.macro HDM_QUANT_2x2_TOTAL_16BITS
-// { // input: src_d[0]~[3], working_d, dst_d
- vshr.s64 $1, $0, #32
- vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
- vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
- vtrn.s16 $2, $1
- vtrn.s32 $2, $1
-// }
-.endm
-
-.macro IHDM_4x4_TOTAL_16BITS
-// { // input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
- vshr.s64 $1, $0, #32
- vadd.s16 $2, $0, $1 // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
- vsub.s16 $1, $0, $1 // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
- vtrn.s16 $2, $1
- vrev32.16 $1, $1
- vtrn.s32 $2, $1 // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
-
- vrev64.16 $1, $2
- vadd.s16 $0, $2, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
- vsub.s16 $1, $2, $1
- vrev32.16 $1, $1 // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
- vtrn.s32 $0, $1 // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
-// }
-.endm
-
-.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
-// { // input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
- vmovl.u8 $4,$0
- vmovl.u8 $5,$1
- vadd.s16 $4,$2
- vadd.s16 $5,$3
- vqmovun.s16 $0,$4
- vqmovun.s16 $1,$5
-// }
-.endm
-
-.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
-// { // input: src_d[0]~[3], output: e_d[0]~[3];
- vadd.s16 $4, $0, $2 //int16 e[i][0] = src[0] + src[2];
- vsub.s16 $5, $0, $2 //int16 e[i][1] = src[0] - src[2];
- vshr.s16 $6, $1, #1
- vshr.s16 $7, $3, #1
- vsub.s16 $6, $6, $3 //int16 e[i][2] = (src[1]>>1)-src[3];
- vadd.s16 $7, $1, $7 //int16 e[i][3] = src[1] + (src[3]>>1);
-// }
-.endm
-
-.macro TRANSFORM_TOTAL_16BITS // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- vadd.s16 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
- vadd.s16 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
- vsub.s16 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
- vsub.s16 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
-// }
-.endm
-
-
-.macro ROW_TRANSFORM_0_STEP
-// { // input: src_d[0]~[3], output: e_q[0]~[3];
- vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
- vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
- vsubl.s16 $6, $1, $3 //int32 e[i][2] = src[1] - src[3];
- vaddl.s16 $7, $1, $3 //int32 e[i][3] = src[1] + src[3];
-// }
-.endm
-
-.macro ROW_TRANSFORM_1_STEP
-// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
- vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
- vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
- vshr.s16 $8, $1, #1
- vshr.s16 $9, $3, #1
- vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
- vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
-// }
-.endm
-
-.macro TRANSFORM_4BYTES // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
- vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
- vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
- vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
-// }
-.endm
-
-.macro COL_TRANSFORM_0_STEP
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
- vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
- vsub.s32 $6, $1, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- vadd.s32 $7, $1, $3 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
-.endm
-
-.macro COL_TRANSFORM_1_STEP
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
- vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
- vshr.s32 $6, $1, #1
- vshr.s32 $7, $3, #1
- vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
-.endm
-#else
.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride
vld2.16 {\arg0[0],\arg1[0]}, [\arg4], \arg5
@@ -522,7 +276,6 @@
vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
-#endif
WELS_ASM_FUNC_BEGIN WelsDctT4_neon
--- a/codec/encoder/core/arm/svc_motion_estimation.S
+++ b/codec/encoder/core/arm/svc_motion_estimation.S
@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON
-.text
#include "arm_arch_common_macro.S"
--- a/codec/encoder/core/arm64/intra_pred_aarch64_neon.S
+++ b/codec/encoder/core/arm64/intra_pred_aarch64_neon.S
@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
-.text
#include "arm_arch64_common_macro.S"
// for Luma 4x4
--- a/codec/encoder/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S
+++ b/codec/encoder/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S
@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
-.text
#include "arm_arch64_common_macro.S"
.macro LOAD_LUMA_DATA
@@ -94,93 +93,6 @@
trn2 v17.4s, v4.4s, v5.4s //{0,1,3,2, 4,5,7,6} v16 {8,9,11,10, 12,13,15,14} v17
.endm
-#ifdef __APPLE__
-.macro SELECT_BEST_COST
- cmp w1, $0
- csel $0, $0, w1, $2
- cset w7, $1
- cmp w2, $0
- mov w6, #2
- csel $0, $0, w2, $2
- csel w7, w7, w6, $2
-.endm
-
-.macro SELECT_BEST_COST_PREFER_HIGHER arg0
- SELECT_BEST_COST \arg0, ls, hi
-.endm
-
-.macro SELECT_BEST_COST_PREFER_LOWER arg0
- SELECT_BEST_COST \arg0, lo, hs
-.endm
-
-.macro LOAD_CHROMA_DATA
- sub x9, $0, x1
- ld1 {$1}, [x9] //top_cb
- sub x9, $0, #1
- ld1 {$2}[8], [x9], x1
- ld1 {$2}[9], [x9], x1
- ld1 {$2}[10], [x9], x1
- ld1 {$2}[11], [x9], x1
- ld1 {$2}[12], [x9], x1
- ld1 {$2}[13], [x9], x1
- ld1 {$2}[14], [x9], x1
- ld1 {$2}[15], [x9], x1 //left_cb
-.endm
-
-.macro LOAD_8X4_DATA
- //Load the p_enc data and save to "v20 ~ v21"--- 8X4 bytes
- ld1 {v0.8b}, [$0], x3
- ld1 {v1.8b}, [$0], x3
- ld1 {v0.d}[1], [$0], x3
- ld1 {v1.d}[1], [$0], x3
- trn1 v2.4s, v0.4s, v1.4s
- trn2 v1.4s, v0.4s, v1.4s
- trn1 v20.2d, v2.2d, v1.2d
- trn2 v21.2d, v2.2d, v1.2d
-.endm
-
-.macro HDM_TRANSFORM_4X4_L0
- //Do the vertical transform
- uadd$9 v0.8h, $0, $1
- usub$9 v1.8h, $0, $1
- trn1 v3.2d, v0.2d, v1.2d
- trn2 v1.2d, v0.2d, v1.2d
- add v4.8h, v3.8h, v1.8h //{0,1,2,3,4,5,6,7}
- sub v5.8h, v3.8h, v1.8h //{12,13,14,15,8,9,10,11}
-
- //Do the horizontal transform
- trn1 v0.4s, v4.4s, v5.4s
- trn2 v1.4s, v4.4s, v5.4s
- add v4.8h, v0.8h, v1.8h
- sub v5.8h, v0.8h, v1.8h
- trn1 v0.8h, v4.8h, v5.8h
- trn2 v1.8h, v4.8h, v5.8h
- add v4.8h, v0.8h, v1.8h
- sub v5.8h, v0.8h, v1.8h
-
- //16x16_v
- trn1 v0.2s, v4.2s, v5.2s
- trn2 v1.2s, v4.2s, v5.2s
- sabal $5, v0.4h, $2
- sabal $5, v1.4h, $8.4h
- sabal2 $5, v4.8h, $8.8h
- sabal2 $5, v5.8h, $8.8h
-
- //16x16_h
- ins v3.d[0], v4.d[1]
- trn1 v0.4h, v4.4h, v3.4h
- trn2 v1.4h, v4.4h, v3.4h
- sabal $6, v0.4h, $3
- sabdl v4.4s, v1.4h, $8.4h
- sabal v4.4s, v5.4h, $8.4h
- sabal2 v4.4s, v5.8h, $8.8h
- add $6, $6, v4.4s
-
- //16x16_dc_both
- sabal $7, v0.4h, $4
- add $7, $7, v4.4s
-.endm
-#else
.macro SELECT_BEST_COST arg0, arg1, arg2
cmp w1, \arg0
csel \arg0, \arg0, w1, \arg2
@@ -266,7 +178,6 @@
sabal \arg7, v0.4h, \arg4
add \arg7, \arg7, v4.4s
.endm
-#endif
WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra8x8Combined3Sad_AArch64_neon
ldr x11, [sp, #0]
--- a/codec/encoder/core/arm64/memory_aarch64_neon.S
+++ b/codec/encoder/core/arm64/memory_aarch64_neon.S
@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
-.text
#include "arm_arch64_common_macro.S"
--- a/codec/encoder/core/arm64/pixel_aarch64_neon.S
+++ b/codec/encoder/core/arm64/pixel_aarch64_neon.S
@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
-.text
#include "arm_arch64_common_macro.S"
.macro CALC_AND_STORE_SAD
@@ -69,89 +68,6 @@
ld1 {v7.16b}, [x0], x1
.endm
-#ifdef __APPLE__
-.macro LOAD_8X8_2
- ld1 {v16.8b}, [$0], x3
- ld1 {v17.8b}, [$0], x3
- ld1 {v18.8b}, [$0], x3
- ld1 {v19.8b}, [$0], x3
- ld1 {v20.8b}, [$0], x3
- ld1 {v21.8b}, [$0], x3
- ld1 {v22.8b}, [$0], x3
- ld1 {v23.8b}, [$0], x3
-.endm
-
-.macro CALC_ABS_8X8_1
- uab$1l $0, v0.8b, v16.8b
- uabal $0, v1.8b, v17.8b
- uabal $0, v2.8b, v18.8b
- uabal $0, v3.8b, v19.8b
- uabal $0, v4.8b, v20.8b
- uabal $0, v5.8b, v21.8b
- uabal $0, v6.8b, v22.8b
- uabal $0, v7.8b, v23.8b
-.endm
-
-.macro CALC_ABS_8X8_2
- uab$0l v29.8h, v0.8b, v18.8b
- uabal v29.8h, v1.8b, v19.8b
- uabal v29.8h, v2.8b, v20.8b
- uabal v29.8h, v3.8b, v21.8b
- uabal v29.8h, v4.8b, v22.8b
- uabal v29.8h, v5.8b, v23.8b
- uabal v29.8h, v6.8b, v24.8b
- uabal v29.8h, v7.8b, v25.8b
-.endm
-
-.macro LOAD_16X8_2
- ld1 {v16.16b}, [$0], x3
- ld1 {v17.16b}, [$0], x3
- ld1 {v18.16b}, [$0], x3
- ld1 {v19.16b}, [$0], x3
- ld1 {v20.16b}, [$0], x3
- ld1 {v21.16b}, [$0], x3
- ld1 {v22.16b}, [$0], x3
- ld1 {v23.16b}, [$0], x3
-.endm
-
-.macro CALC_ABS_16X8_1
- uab$1l $0, v0.8b, v16.8b
- uabal2 $0, v0.16b,v16.16b
- uabal $0, v1.8b, v17.8b
- uabal2 $0, v1.16b,v17.16b
- uabal $0, v2.8b, v18.8b
- uabal2 $0, v2.16b,v18.16b
- uabal $0, v3.8b, v19.8b
- uabal2 $0, v3.16b,v19.16b
- uabal $0, v4.8b, v20.8b
- uabal2 $0, v4.16b,v20.16b
- uabal $0, v5.8b, v21.8b
- uabal2 $0, v5.16b,v21.16b
- uabal $0, v6.8b, v22.8b
- uabal2 $0, v6.16b,v22.16b
- uabal $0, v7.8b, v23.8b
- uabal2 $0, v7.16b,v23.16b
-.endm
-
-.macro CALC_ABS_16X8_2
- uab$0l v29.8h, v0.8b, v18.8b
- uabal2 v29.8h, v0.16b,v18.16b
- uabal v29.8h, v1.8b, v19.8b
- uabal2 v29.8h, v1.16b,v19.16b
- uabal v29.8h, v2.8b, v20.8b
- uabal2 v29.8h, v2.16b,v20.16b
- uabal v29.8h, v3.8b, v21.8b
- uabal2 v29.8h, v3.16b,v21.16b
- uabal v29.8h, v4.8b, v22.8b
- uabal2 v29.8h, v4.16b,v22.16b
- uabal v29.8h, v5.8b, v23.8b
- uabal2 v29.8h, v5.16b,v23.16b
- uabal v29.8h, v6.8b, v24.8b
- uabal2 v29.8h, v6.16b,v24.16b
- uabal v29.8h, v7.8b, v25.8b
- uabal2 v29.8h, v7.16b,v25.16b
-.endm
-#else
.macro LOAD_8X8_2 arg0
ld1 {v16.8b}, [\arg0], x3
ld1 {v17.8b}, [\arg0], x3
@@ -233,7 +149,6 @@
uabal v29.8h, v7.8b, v25.8b
uabal2 v29.8h, v7.16b,v25.16b
.endm
-#endif
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad4x4_AArch64_neon
sxtw x1, w1
--- a/codec/encoder/core/arm64/reconstruct_aarch64_neon.S
+++ b/codec/encoder/core/arm64/reconstruct_aarch64_neon.S
@@ -31,250 +31,8 @@
*/
#ifdef HAVE_NEON_AARCH64
-.text
#include "arm_arch64_common_macro.S"
-#ifdef __APPLE__
-.macro ZERO_COUNT_IN_2_QUARWORD
-// { // input: coef_0 (identy to $3 $4), coef_1(identy to $5 $6), mask_q
- cmeq $0.8h, $0.8h, #0
- cmeq $1.8h, $1.8h, #0
- uzp1 $0.16b, $0.16b, $1.16b
- ushr $0.16b, $0.16b, 7
- addv $2, $0.16b
-// }
-.endm
-
-.macro NEWQUANT_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
-// { // input: coef, ff (dst), mf
- eor $3.16b, $3.16b, $3.16b // init 0 , and keep 0;
- saba $1.8h, $0.8h, $3.8h // f + abs(coef - 0)
- smull $4.4s, $1.4h, $2.4h
- smull2 $5.4s, $1.8h, $2.8h
- shrn $1.4h, $4.4s, #16
- shrn2 $1.8h, $5.4s, #16
-
- cmgt $4.8h, $0.8h, #0 // if true, location of coef == 11111111
- bif $3.16b, $1.16b, $4.16b // if (x<0) reserved part; else keep 0 untouched
- shl $3.8h, $3.8h, #1
- sub $1.8h, $1.8h, $3.8h // if x > 0, -= 0; else x-= 2x
-// }
-.endm
-
-.macro NEWQUANT_COEF_EACH_16BITS_MAX // if coef <= 0, - coef; else , coef;
-// { // input: coef, ff (dst), mf
- eor $3.16b, $3.16b, $3.16b // init 0 , and keep 0;
- saba $1.8h, $0.8h, $3.8h // f + abs(coef - 0)
- smull $4.4s, $1.4h, $2.4h
- smull2 $5.4s, $1.8h, $2.8h
- shrn $1.4h, $4.4s, #16
- shrn2 $1.8h, $5.4s, #16
-
- cmgt $4.8h, $0.8h, #0 // if true, location of coef == 11111111
- bif $3.16b, $1.16b, $4.16b // if (x<0) reserved part; else keep 0 untouched
- shl $3.8h, $3.8h, #1
- mov $6.16b, $1.16b
- sub $1.8h, $1.8h, $3.8h // if x > 0, -= 0; else x-= 2x
-// }
-.endm
-
-.macro QUANT_DUALWORD_COEF_EACH_16BITS // if coef <= 0, - coef; else , coef;
-// { // input: coef, ff (dst), mf
- saba $1.8h, $0.8h, $3.8h // f + abs(coef - 0)
- smull $4.4s, $1.4h, $2.4h
- shrn $1.4h, $4.4s, #16
-
- cmgt $4.8h, $0.8h, #0 // if true, location of coef == 11111111
- bif $3.16b, $1.16b, $4.16b // if (x<0) reserved part; else keep 0 untouched
- shl $3.8h, $3.8h, #1
- sub $1.8h, $1.8h, $3.8h // if x > 0, -= 0; else x-= 2x
-// }
-.endm
-
-.macro SELECT_MAX_IN_ABS_COEF
-// { // input: coef_0, coef_1, coef_2, coef_3, max_q (identy to follow two)
- umax $0.8h, $0.8h, $1.8h
- umaxv $4, $0.8h
- umax $2.8h, $2.8h, $3.8h
- umaxv $5, $2.8h
-// }
-.endm
-
-.macro HDM_QUANT_2x2_TOTAL_16BITS
-// { // input: src_d[0][16][32][48], dst_d[0][16][32][48], working
- sshr $1.2d, $0.2d, #32
- add $2.4h, $0.4h, $1.4h // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
- sub $1.4h, $0.4h, $1.4h // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
- zip1 $1.4h, $2.4h, $1.4h
-// }
-.endm
-
-
-.macro DC_ZERO_COUNT_IN_DUALWORD
-// { // input: coef, dst_d, working_d (all 0x01)
- cmeq $0.4h, $0.4h, #0
- and $0.8b, $0.8b, $2.8b
- addv $1, $0.4h
-// }
-.endm
-
-.macro IHDM_4x4_TOTAL_16BITS
-// { // input: each src_d[0]~[3](dst), working_q0, working_q1
- uzp2 $1.4s, $0.4s, $0.4s
- uzp1 $0.4s, $0.4s, $0.4s
- add $2.8h, $0.8h, $1.8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
- sub $1.8h, $0.8h, $1.8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
- zip1 $2.8h, $2.8h, $1.8h // [0] = rs[0] + rs[2]; [1] = rs[0] - rs[2]; ... [2]; [3]
-
- uzp2 $1.4s, $2.4s, $2.4s
- uzp1 $0.4s, $2.4s, $2.4s
- add $2.8h, $0.8h, $1.8h // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
- sub $1.8h, $0.8h, $1.8h // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
- rev32 $1.4h, $1.4h // [0] = rs[1] - rs[3];[1] = rs[0] - rs[2];[2] = rs[5] - rs[7];[3] = rs[4] - rs[6];
- zip1 $0.4s, $2.4s, $1.4s
-// }
-.endm
-
-.macro MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2
-// { // input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
- uzp1 $2.4s, $0.4s, $1.4s //[0 1 4 5]+[8 9 12 13]
- uzp2 $3.4s, $0.4s, $1.4s //[2 3 6 7]+[10 11 14 15]
-
- uzp1 $0.8h, $2.8h, $3.8h //[0 4 8 12]+[2 6 10 14]
- uzp2 $2.8h, $2.8h, $3.8h //[1 5 9 13]+[3 7 11 15]
- zip2 $1.2d, $0.2d, $2.2d //[2 6 10 14]+[3 7 11 15]
- zip1 $0.2d, $0.2d, $2.2d //[0 4 8 12]+[1 5 9 13]
-// }
-.endm
-
-.macro MATRIX_TRANSFORM_EACH_16BITS_OUT4
-// { // input & output: src_d[0]~[3];[0 4 8 12],[1 5 9 13],[2 6 10 14],[3 7 11 15]
- trn1 $4.8h, v0.8h, v1.8h
- trn2 $5.8h, v0.8h, v1.8h
- trn1 $6.8h, v2.8h, v3.8h
- trn2 $7.8h, v2.8h, v3.8h
-
- trn1 $0.4s, v4.4s, v6.4s
- trn2 $2.4s, v4.4s, v6.4s
- trn1 $1.4s, v5.4s, v7.4s
- trn2 $3.4s, v5.4s, v7.4s
-// }
-.endm
-
-.macro MATRIX_TRANSFORM_EACH_16BITS_4x4_OUT2
-// { // input & output: src_d[0]~[3];[0 1 2 3],[4 5 6 7],[8 9 10 11],[12 13 14 15]
- mov $0.d[1], $1.d[0] //[0 1 2 3]+[4 5 6 7]
- mov $2.d[1], $3.d[0] //[8 9 10 11]+[12 13 14 15]
- uzp1 $1.4s, $0.4s, $2.4s //[0 1 4 5]+[8 9 12 13]
- uzp2 $3.4s, $0.4s, $2.4s //[2 3 6 7]+[10 11 14 15]
-
- uzp1 $0.8h, $1.8h, $3.8h //[0 4 8 12]+[2 6 10 14]
- uzp2 $2.8h, $1.8h, $3.8h //[1 5 9 13]+[3 7 11 15]
- zip2 $1.2d, $0.2d, $2.2d //[2 6 10 14]+[3 7 11 15]
- zip1 $0.2d, $0.2d, $2.2d //[0 4 8 12]+[1 5 9 13]
-// }
-.endm
-
-.macro LOAD_4x4_DATA_FOR_DCT
- ld1 {$0.s}[0], [$2], $3
- ld1 {$0.s}[1], [$2], $3
- ld1 {$0.s}[2], [$2], $3
- ld1 {$0.s}[3], [$2]
-
- ld1 {$1.s}[0], [$4], $5
- ld1 {$1.s}[1], [$4], $5
- ld1 {$1.s}[2], [$4], $5
- ld1 {$1.s}[3], [$4]
-.endm
-
-.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
-// { // input: src_d[0]~[3], working: [4]~[7]
- add $4.8h, $0.8h, $3.8h //int16 s[0] = data[i] + data[i3];
- sub $7.8h, $0.8h, $3.8h //int16 s[3] = data[i] - data[i3];
- add $5.8h, $1.8h, $2.8h //int16 s[1] = data[i1] + data[i2];
- sub $6.8h, $1.8h, $2.8h //int16 s[2] = data[i1] - data[i2];
-
- add $0.8h, $4.8h, $5.8h //int16 dct[i ] = s[0] + s[1];
- sub $2.8h, $4.8h, $5.8h //int16 dct[i2] = s[0] - s[1];
- shl $1.8h, $7.8h, #1
- shl $3.8h, $6.8h, #1
- add $1.8h, $1.8h, $6.8h //int16 dct[i1] = (s[3] << 1) + s[2];
- sub $3.8h, $7.8h, $3.8h //int16 dct[i3] = s[3] - (s[2] << 1);
-// }
-.endm
-
-.macro LOAD_8x4_DATA_FOR_DCT
-// { // input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
- ld1 {$0.d}[0], [$8], x2
- ld1 {$1.d}[0], [$8], x2
- ld1 {$2.d}[0], [$8], x2
- ld1 {$3.d}[0], [$8], x2
-
- ld1 {$4.d}[0], [$9], x4
- ld1 {$5.d}[0], [$9], x4
- ld1 {$6.d}[0], [$9], x4
- ld1 {$7.d}[0], [$9], x4
-// }
-.endm
-
-.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
-// { // input: src_d[0]~[3], output: e_d[0]~[3];
- add $4.8h, $0.8h, $2.8h //int16 e[i][0] = src[0] + src[2];
- sub $5.8h, $0.8h, $2.8h //int16 e[i][1] = src[0] - src[2];
- sshr $6.8h, $1.8h, #1
- sshr $7.8h, $3.8h, #1
- sub $6.8h, $6.8h, $3.8h //int16 e[i][2] = (src[1]>>1)-src[3];
- add $7.8h, $1.8h, $7.8h //int16 e[i][3] = src[1] + (src[3]>>1);
-// }
-.endm
-
-.macro TRANSFORM_TOTAL_16BITS // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- add $0.8h, $4.8h, $7.8h //int16 f[i][0] = e[i][0] + e[i][3];
- add $1.8h, $5.8h, $6.8h //int16 f[i][1] = e[i][1] + e[i][2];
- sub $2.8h, $5.8h, $6.8h //int16 f[i][2] = e[i][1] - e[i][2];
- sub $3.8h, $4.8h, $7.8h //int16 f[i][3] = e[i][0] - e[i][3];
-// }
-.endm
-
-.macro ROW_TRANSFORM_0_STEP
-// { // input: src_d[0]~[3], output: e_q[0]~[3];
- saddl $4.4s, $0.4h, $2.4h //int32 e[i][0] = src[0] + src[2];
- ssubl $5.4s, $0.4h, $2.4h //int32 e[i][1] = src[0] - src[2];
- ssubl $6.4s, $1.4h, $3.4h //int32 e[i][2] = src[1] - src[3];
- saddl $7.4s, $1.4h, $3.4h //int32 e[i][3] = src[1] + src[3];
-// }
-.endm
-
-.macro COL_TRANSFORM_0_STEP
-// { // input: src_q[0]~[3], output: e_q[0]~[3];
- add $4.4s, $0.4s, $2.4s //int32 e[0][j] = f[0][j] + f[2][j];
- sub $5.4s, $0.4s, $2.4s //int32 e[1][j] = f[0][j] - f[2][j];
- sub $6.4s, $1.4s, $3.4s //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
- add $7.4s, $1.4s, $3.4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-// }
-.endm
-
-.macro TRANSFORM_4BYTES // both row & col transform used
-// { // output: f_q[0]~[3], input: e_q[0]~[3];
- add $0.4s, $4.4s, $7.4s //int16 f[i][0] = e[i][0] + e[i][3];
- add $1.4s, $5.4s, $6.4s //int16 f[i][1] = e[i][1] + e[i][2];
- sub $2.4s, $5.4s, $6.4s //int16 f[i][2] = e[i][1] - e[i][2];
- sub $3.4s, $4.4s, $7.4s //int16 f[i][3] = e[i][0] - e[i][3];
-// }
-.endm
-
-.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
-// { // input: pred_d[0](output), dct_q0/1, working_q0/1;
- uxtl $3.8h, $0.8b
- uxtl2 $4.8h, $0.16b
- add $3.8h, $3.8h, $1.8h
- add $4.8h, $4.8h, $2.8h
- sqxtun $0.8b, $3.8h
- sqxtun2 $0.16b,$4.8h
-// }
-.endm
-#else
.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2
// { // input: coef_0 (identy to \arg3\() \arg4\()), coef_1(identy to \arg5\() \arg6\()), mask_q
cmeq \arg0\().8h, \arg0\().8h, #0
@@ -519,7 +277,6 @@
sqxtun2 \arg0\().16b,\arg4\().8h
// }
.endm
-#endif
WELS_ASM_AARCH64_FUNC_BEGIN WelsGetNoneZeroCount_AArch64_neon
ld1 {v0.8h, v1.8h}, [x0]
--- a/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
+++ b/codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
-.text
#include "arm_arch64_common_macro.S"
WELS_ASM_AARCH64_FUNC_BEGIN SumOf8x8SingleBlock_AArch64_neon
@@ -334,4 +333,4 @@
subs x2, x2, #1
cbnz x2, _hash_height_loop
WELS_ASM_AARCH64_FUNC_END
-#endif
\ No newline at end of file
+#endif
--- a/codec/processing/src/arm/adaptive_quantization.S
+++ b/codec/processing/src/arm/adaptive_quantization.S
@@ -31,17 +31,8 @@
*/
#ifdef HAVE_NEON
-.text
#include "arm_arch_common_macro.S"
-#ifdef __APPLE__
-.macro SQR_ADD_16BYTES
- vmull.u8 q3, $0, $0
- vmull.u8 q8, $1, $1
- vpadal.u16 $2, q3
- vpadal.u16 $2, q8
-.endm
-#else
.macro SQR_ADD_16BYTES arg0, arg1, arg2
vmull.u8 q3, \arg0, \arg0
vmull.u8 q8, \arg1, \arg1
@@ -48,7 +39,6 @@
vpadal.u16 \arg2, q3
vpadal.u16 \arg2, q8
.endm
-#endif
WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon
--- a/codec/processing/src/arm/down_sample_neon.S
+++ b/codec/processing/src/arm/down_sample_neon.S
@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON
-.text
#include "arm_arch_common_macro.S"
--- a/codec/processing/src/arm/pixel_sad_neon.S
+++ b/codec/processing/src/arm/pixel_sad_neon.S
@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON
-.text
#include "arm_arch_common_macro.S"
--- a/codec/processing/src/arm/vaa_calc_neon.S
+++ b/codec/processing/src/arm/vaa_calc_neon.S
@@ -31,41 +31,8 @@
*/
#ifdef HAVE_NEON
-.text
#include "arm_arch_common_macro.S"
-#ifdef __APPLE__
-
-.macro ABS_SUB_SUM_16BYTES
- vld1.32 {q15}, [$0], $2
- vld1.32 {q14}, [$1], $2
- vabal.u8 $3, d30, d28
- vabal.u8 $4, d31, d29
-.endm
-
-.macro ABS_SUB_SUM_8x16BYTES
- vld1.32 {q15}, [$0], $2
- vld1.32 {q14}, [$1], $2
- vabdl.u8 $3, d30, d28
- vabdl.u8 $4, d31, d29
-
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
- ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-.endm
-
-.macro SAD_8X16BITS
- vadd.u16 d31, $0, $1
- vpaddl.u16 d31, d31
- vpaddl.u32 $2, d31
-.endm
-
-#else
-
.macro ABS_SUB_SUM_16BYTES arg0, arg1, arg2, arg3, arg4
vld1.32 {q15}, [\arg0], \arg2
vld1.32 {q14}, [\arg1], \arg2
@@ -93,7 +60,6 @@
vpaddl.u16 d31, d31
vpaddl.u32 \arg2, d31
.endm
-#endif
WELS_ASM_FUNC_BEGIN VAACalcSad_neon
@@ -160,52 +126,6 @@
WELS_ASM_FUNC_END
-#ifdef __APPLE__
-.macro SAD_SD_MAD_16BYTES
- vld1.32 {q0}, [$0], $2
- vld1.32 {q1}, [$1], $2
-
- vpadal.u8 $3, q0
- vpadal.u8 $4, q1
-
- vabd.u8 q0, q0, q1
- vmax.u8 $5, q0
- vpadal.u8 $6, q0
-.endm
-
-.macro SAD_SD_MAD_8x16BYTES
- vld1.32 {q0}, [$0], $2
- vld1.32 {q1}, [$1], $2
-
- vpaddl.u8 q2, q0
- vpaddl.u8 q3, q1
-
- vabd.u8 $3, q0, q1
- vpaddl.u8 $4, $3 //abs_diff
-
-
- SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
- SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
- SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
- SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
- SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
- SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
- SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
-
- vsub.u16 $5, q2, q3
-.endm
-
-.macro SAD_SD_MAD_CALC
- vpmax.u8 d0, $0, $1 //8bytes
- vpmax.u8 d0, d0, d0 //4bytes
- vpmax.u8 $2, d0, d0 //2bytes
-
- vpaddl.u16 $3, $3
- vpaddl.u32 $3, $3
- vpaddl.s16 $4, $4
- vpaddl.s32 $4, $4
-.endm
-#else
.macro SAD_SD_MAD_16BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6
vld1.32 {q0}, [\arg0], \arg2
vld1.32 {q1}, [\arg1], \arg2
@@ -250,7 +170,6 @@
vpaddl.s16 \arg4, \arg4
vpaddl.s32 \arg4, \arg4
.endm
-#endif
WELS_ASM_FUNC_BEGIN VAACalcSadBgd_neon
@@ -314,165 +233,6 @@
WELS_ASM_FUNC_END
-#ifdef __APPLE__
-.macro SSD_MUL_SUM_16BYTES_RESET
- vmull.u8 $3, $0, $0
- vpaddl.u16 $2, $3
-
- vmull.u8 $3, $1, $1
- vpadal.u16 $2, $3
-.endm
-
-.macro SSD_MUL_SUM_16BYTES
- vmull.u8 $3, $0, $0
- vpadal.u16 $2, $3
-
- vmull.u8 $3, $1, $1
- vpadal.u16 $2, $3
-.endm
-
-.macro SAD_SSD_BGD_16
- vld1.8 {q0}, [$0], $2 //load cur_row
-
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
-
- vabd.u8 q2, q0, q1 //abs_diff
-
- vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
-
- vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
-
- SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
-
- vld1.8 {q1}, [$1], $2 //load ref_row
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
-.endm
-
-//the last row of a 16x16 block
-.macro SAD_SSD_BGD_16_end
- vld1.8 {q0}, [$0], $1 //load cur_row
-
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
-
- vabd.u8 q2, q0, q1 //abs_diff
-
- vmax.u8 q5, q2 //l_mad for 16 bytes reset for every 8x16
-
- vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16
-
- SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
-
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
-.endm
-
-//for the begin of a 8x16 block, use some instructions to reset the register
-.macro SAD_SSD_BGD_16_RESET_8x8
- vld1.8 {q0}, [$0], $2 //load cur_row
-
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
-
- vabd.u8 q2, q0, q1 //abs_diff
-
- vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
-
- vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
-
-
- SSD_MUL_SUM_16BYTES d4,d5, q8, q11 //q8 for l_sqiff reset for every 16x16
-
- vld1.8 {q1}, [$1], $2 //load ref_row
-
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
-.endm
-
-//for the begin of a 16x16 block, use some instructions to reset the register
-.macro SAD_SSD_BGD_16_RESET_16x16
- vld1.8 {q0}, [$0], $2 //load cur_row
- vld1.8 {q1}, [$1], $2 //load ref_row
-
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
-
- vabd.u8 q2, q0, q1 //abs_diff
-
- vmov q5,q2 //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
-
- vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
-
- SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
-
- vld1.8 {q1}, [$1], $2 //load ref_row
-
- vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
- SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11 //q10 for lsqsum reset for every 16x16
-.endm
-
-//for each 8x16 block
-.macro SAD_SSD_BGD_CALC_8x16
-
- vpmax.u8 d10, d10, d11 //4 numbers
- vpmax.u8 d10, d10, d10 //2 numbers
- vpmax.u8 d10, d10, d10 //1 number1
-
- vmov $0, d10 //d26 d27 keeps the l_mad
-
- //p_sd8x8 fix me
- vpaddl.u16 q3, q3
- vpaddl.u16 q4, q4
-
- vsub.i32 $1, q3, q4
- vpaddl.u32 $1, $1
-
- //psad8x8
- vpaddl.u16 $2, $2
- vpaddl.u32 $2, $2
-
- //psadframe
- vadd.i32 q12, $2
-.endm
-
-.macro SAD_SSD_BGD_16x16
- //for one 8x16
- SAD_SSD_BGD_16_RESET_16x16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
- SAD_SSD_BGD_16 $0, $1, $2, q6
-
- SAD_SSD_BGD_CALC_8x16 d26, q14, q6
-
- //for another 8x16
- SAD_SSD_BGD_16_RESET_8x8 $0, $1, $2, q7
- SAD_SSD_BGD_16 $0, $1, $2, q7
- SAD_SSD_BGD_16 $0, $1, $2, q7
- SAD_SSD_BGD_16 $0, $1, $2, q7
- SAD_SSD_BGD_16 $0, $1, $2, q7
- SAD_SSD_BGD_16 $0, $1, $2, q7
- SAD_SSD_BGD_16 $0, $1, $2, q7
- SAD_SSD_BGD_16_end $0, $2, q7
-
- SAD_SSD_BGD_CALC_8x16 d27, q15, q7
-.endm
-
-.macro SSD_SAD_SD_MAD_PADDL
- vpaddl.s16 $0, $0
- vpaddl.s32 $0, $0
- vadd.i32 $1, $1, $2
-.endm
-#else
.macro SSD_MUL_SUM_16BYTES_RESET arg0, arg1, arg2, arg3
vmull.u8 \arg3, \arg0, \arg0
vpaddl.u16 \arg2, \arg3
@@ -630,7 +390,6 @@
vpaddl.s32 \arg0, \arg0
vadd.i32 \arg1, \arg1, \arg2
.endm
-#endif
WELS_ASM_FUNC_BEGIN VAACalcSadSsdBgd_neon
@@ -712,105 +471,6 @@
WELS_ASM_FUNC_END
-#ifdef __APPLE__
-.macro SAD_VAR_16
- vld1.8 {q0}, [$0], $2 //load cur_row
-
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
-
- vabd.u8 q2, q0, q1 //abs_diff
-
- vpadal.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
-
- vld1.8 {q1}, [$1], $2
-
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
-.endm
-
-.macro SAD_VAR_16_END
- vld1.8 {q0}, [$0], $1 //load cur_row
-
- vpadal.u8 q3, q0 //add cur_row together
- vpadal.u8 q4, q1 //add ref_row together
-
- vabd.u8 q2, q0, q1 //abs_diff
-
- vpadal.u8 $2, q2 //l_sad for 16 bytes reset for every 8x16
-
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
-.endm
-
-.macro SAD_VAR_16_RESET_16x16
- vld1.8 {q0}, [$0], $2 //load cur_row
- vld1.8 {q1}, [$1], $2
-
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
-
- vabd.u8 q2, q0, q1 //abs_diff
-
- vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
-
- vld1.8 {q1}, [$1], $2
-
- vpaddl.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
- SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
-.endm
-
-.macro SAD_VAR_16_RESET_8x8
- vld1.8 {q0}, [$0], $2 //load cur_row
-
- vpaddl.u8 q3, q0 //add cur_row together
- vpaddl.u8 q4, q1 //add ref_row together
-
- vabd.u8 q2, q0, q1 //abs_diff
-
- vpaddl.u8 $3, q2 //l_sad for 16 bytes reset for every 8x16
-
- vld1.8 {q1}, [$1], $2
-
- vpadal.u8 q9, q0 //q9 for l_sum reset for every 16x16
-
- SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
-.endm
-
-.macro SAD_VAR_16x16
- //for one 8x16
- SAD_VAR_16_RESET_16x16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
- SAD_VAR_16 $0, $1, $2, q6
-
- vpaddl.u16 q6, q6
- vpaddl.u32 q6, q6
- vadd.i32 q12, q6
-
- //for another 8x16
- SAD_VAR_16_RESET_8x8 $0, $1, $2, q7
- SAD_VAR_16 $0, $1, $2, q7
- SAD_VAR_16 $0, $1, $2, q7
- SAD_VAR_16 $0, $1, $2, q7
- SAD_VAR_16 $0, $1, $2, q7
- SAD_VAR_16 $0, $1, $2, q7
- SAD_VAR_16 $0, $1, $2, q7
- SAD_VAR_16_END $0, $2, q7
-
- vpaddl.u16 q7, q7
- vpaddl.u32 q7, q7
-
- vadd.i32 q12, q7
-.endm
-#else
.macro SAD_VAR_16 arg0, arg1, arg2, arg3
vld1.8 {q0}, [\arg0], \arg2 //load cur_row
@@ -909,7 +569,6 @@
vadd.i32 q12, q7
.endm
-#endif
WELS_ASM_FUNC_BEGIN VAACalcSadVar_neon
@@ -971,62 +630,6 @@
WELS_ASM_FUNC_END
-#ifdef __APPLE__
-.macro SAD_SSD_16
- SAD_VAR_16 $0, $1, $2, $3
-
- SSD_MUL_SUM_16BYTES d4,d5,q8, q11
-.endm
-
-.macro SAD_SSD_16_END
- SAD_VAR_16_END $0, $1, $2
-
- SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
-.endm
-
-.macro SAD_SSD_16_RESET_16x16
- SAD_VAR_16_RESET_16x16 $0, $1, $2, $3
-
- SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
-.endm
-
-.macro SAD_SSD_16_RESET_8x8
- SAD_VAR_16_RESET_8x8 $0, $1, $2, $3
-
- SSD_MUL_SUM_16BYTES d4,d5,q8, q11 //q8 for l_sqiff reset for every 16x16
-.endm
-
-.macro SAD_SSD_16x16
- //for one 8x16
- SAD_SSD_16_RESET_16x16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
- SAD_SSD_16 $0, $1, $2, q6
-
- vpaddl.u16 q6, q6
- vpaddl.u32 q6, q6
- vadd.i32 q12, q6
-
- //for another 8x16
- SAD_SSD_16_RESET_8x8 $0, $1, $2, q7
- SAD_SSD_16 $0, $1, $2, q7
- SAD_SSD_16 $0, $1, $2, q7
- SAD_SSD_16 $0, $1, $2, q7
- SAD_SSD_16 $0, $1, $2, q7
- SAD_SSD_16 $0, $1, $2, q7
- SAD_SSD_16 $0, $1, $2, q7
- SAD_SSD_16_END $0, $2, q7
-
- vpaddl.u16 q7, q7
- vpaddl.u32 q7, q7
-
- vadd.i32 q12, q7
-.endm
-#else
.macro SAD_SSD_16 arg0, arg1, arg2, arg3
SAD_VAR_16 \arg0, \arg1, \arg2, \arg3
@@ -1081,7 +684,6 @@
vadd.i32 q12, q7
.endm
-#endif
WELS_ASM_FUNC_BEGIN VAACalcSadSsd_neon
--- a/codec/processing/src/arm64/adaptive_quantization_aarch64_neon.S
+++ b/codec/processing/src/arm64/adaptive_quantization_aarch64_neon.S
@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
-.text
#include "arm_arch64_common_macro.S"
WELS_ASM_AARCH64_FUNC_BEGIN SampleVariance16x16_AArch64_neon
ld1 {v1.16b}, [x0], x1 //save the ref data (16bytes)
--- a/codec/processing/src/arm64/down_sample_aarch64_neon.S
+++ b/codec/processing/src/arm64/down_sample_aarch64_neon.S
@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
-.text
#include "arm_arch64_common_macro.S"
WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearDownsampler_AArch64_neon
@@ -223,4 +222,4 @@
WELS_ASM_AARCH64_FUNC_END
-#endif
\ No newline at end of file
+#endif
--- a/codec/processing/src/arm64/pixel_sad_aarch64_neon.S
+++ b/codec/processing/src/arm64/pixel_sad_aarch64_neon.S
@@ -31,7 +31,6 @@
*/
#ifdef HAVE_NEON_AARCH64
-.text
#include "arm_arch64_common_macro.S"
WELS_ASM_AARCH64_FUNC_BEGIN WelsProcessingSampleSad8x8_AArch64_neon
@@ -47,4 +46,4 @@
fmov w0, s2
WELS_ASM_AARCH64_FUNC_END
-#endif
\ No newline at end of file
+#endif
--- a/codec/processing/src/arm64/vaa_calc_aarch64_neon.S
+++ b/codec/processing/src/arm64/vaa_calc_aarch64_neon.S
@@ -31,32 +31,8 @@
*/
#ifdef HAVE_NEON_AARCH64
-.text
#include "arm_arch64_common_macro.S"
-#ifdef __APPLE__
-.macro ABS_SUB_SUM_16BYTES
- ld1 {v0.16b}, [x0], x4
- ld1 {v1.16b}, [x1], x4
- uabal $0, v0.8b, v1.8b
- uabal2 $1, v0.16b,v1.16b
-.endm
-
-.macro ABS_SUB_SUM_8x16BYTES
- ld1 {v0.16b}, [x0], x4
- ld1 {v1.16b}, [x1], x4
- uabdl $0, v0.8b, v1.8b
- uabdl2 $1, v0.16b,v1.16b
-
- ABS_SUB_SUM_16BYTES $0, $1
- ABS_SUB_SUM_16BYTES $0, $1
- ABS_SUB_SUM_16BYTES $0, $1
- ABS_SUB_SUM_16BYTES $0, $1
- ABS_SUB_SUM_16BYTES $0, $1
- ABS_SUB_SUM_16BYTES $0, $1
- ABS_SUB_SUM_16BYTES $0, $1
-.endm
-#else
.macro ABS_SUB_SUM_16BYTES arg0, arg1
ld1 {v0.16b}, [x0], x4
ld1 {v1.16b}, [x1], x4
@@ -78,7 +54,6 @@
ABS_SUB_SUM_16BYTES \arg0, \arg1
ABS_SUB_SUM_16BYTES \arg0, \arg1
.endm
-#endif
/*
* void vaa_calc_sad_neon(uint8_t *cur_data, uint8_t *ref_data, int32_t pic_width, int32_t pic_height, int32_t pic_stride,