shithub: openh264


ref: 0995390c4a034ad16c83a52ea3a56513aeb6d8a6
parent: d8202cf38f2145a39e22021f70e033cd5dc6401d
author: Martin Storsjö <martin@martin.st>
date: Fri Mar 27 07:04:54 EDT 2015

Remove Apple-specific versions of arm macros with arguments

The Apple assembler for ARM can handle the GNU binutils style
macros just fine these days, so there is no need to duplicate all
of these macros in two syntaxes when the new one works in all cases.

We already require an assembler new enough to support the GNU binutils
style features, since we use the .rept directive in a few places.
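
As a rough sketch of the difference (the macro name and operands below are
illustrative only, not taken from this patch): the Apple syntax referenced
macro parameters positionally as $0, $1, ..., while the GNU binutils syntax
declares named parameters and references them as \name.

    // Apple-style macro: parameters are positional ($0 = first, $1 = second)
    .macro DOUBLE_ADD
        vadd.s16 $0, $1, $1
    .endm

    // GNU binutils style macro: parameters are declared and named
    .macro DOUBLE_ADD arg0, arg1
        vadd.s16 \arg0, \arg1, \arg1
    .endm

    DOUBLE_ADD q0, q1   // invocation is identical with either definition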

--- a/codec/common/arm/copy_mb_neon.S
+++ b/codec/common/arm/copy_mb_neon.S
@@ -33,43 +33,6 @@
 #ifdef  HAVE_NEON
 #include "arm_arch_common_macro.S"
 
-#ifdef __APPLE__
-.macro LOAD_ALIGNED_DATA_WITH_STRIDE
-//  {   //  input: $0~$3, src*, src_stride
-    vld1.64 {$0}, [$4,:128], $5
-    vld1.64 {$1}, [$4,:128], $5
-    vld1.64 {$2}, [$4,:128], $5
-    vld1.64 {$3}, [$4,:128], $5
-//  }
-.endm
-
-.macro STORE_ALIGNED_DATA_WITH_STRIDE
-//  {   //  input: $0~$3, dst*, dst_stride
-    vst1.64 {$0}, [$4,:128], $5
-    vst1.64 {$1}, [$4,:128], $5
-    vst1.64 {$2}, [$4,:128], $5
-    vst1.64 {$3}, [$4,:128], $5
-//  }
-.endm
-
-.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
-//  {   //  input: $0~$3, src*, src_stride
-    vld1.64 {$0}, [$4], $5
-    vld1.64 {$1}, [$4], $5
-    vld1.64 {$2}, [$4], $5
-    vld1.64 {$3}, [$4], $5
-//  }
-.endm
-
-.macro STORE_UNALIGNED_DATA_WITH_STRIDE
-//  {   //  input: $0~$3, dst*, dst_stride
-    vst1.64 {$0}, [$4], $5
-    vst1.64 {$1}, [$4], $5
-    vst1.64 {$2}, [$4], $5
-    vst1.64 {$3}, [$4], $5
-//  }
-.endm
-#else
 .macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
 //  {   //  input: \arg0~\arg3, src*, src_stride
     vld1.64 {\arg0}, [\arg4,:128], \arg5
@@ -105,8 +68,6 @@
     vst1.64 {\arg3}, [\arg4], \arg5
 //  }
 .endm
-
-#endif
 
 
 WELS_ASM_FUNC_BEGIN WelsCopy8x8_neon
--- a/codec/common/arm/deblocking_neon.S
+++ b/codec/common/arm/deblocking_neon.S
@@ -34,125 +34,6 @@
 
 #include "arm_arch_common_macro.S"
 
-#ifdef __APPLE__
-.macro JMP_IF_128BITS_IS_ZERO
-    vorr.s16    $2, $0, $1
-    vmov        r3, r2, $2
-    orr         r3, r3, r2
-    cmp         r3, #0
-.endm
-
-.macro MASK_MATRIX
-    vabd.u8 $6, $1, $2
-    vcgt.u8 $6, $4, $6
-
-    vabd.u8 $4, $0, $1
-    vclt.u8 $4, $4, $5
-    vand.u8 $6, $6, $4
-
-    vabd.u8 $4, $3, $2
-    vclt.u8 $4, $4, $5
-    vand.u8 $6, $6, $4
-.endm
-
-
-.macro DIFF_LUMA_LT4_P1_Q1
-    vmov.i8 $9, #128
-    vrhadd.u8   $8, $2, $3
-    vhadd.u8    $8, $0, $8
-    vsub.s8 $8, $8, $9
-    vsub.s8 $9, $1, $9
-    vqsub.s8    $8, $8, $9
-    vmax.s8 $8, $8, $5
-    vmin.s8 $8, $8, $6
-    vabd.u8 $9, $0, $2
-    vclt.u8 $9, $9, $4
-    vand.s8 $8, $8, $9
-    vand.s8 $8, $8, $7
-    vadd.u8 $8, $1, $8
-    vabs.s8 $9, $9
-.endm
-
-.macro DIFF_LUMA_LT4_P0_Q0
-    vsubl.u8    $5, $0, $3
-    vsubl.u8    $6, $2, $1
-    vshl.s16    $6, $6, #2
-    vadd.s16    $5, $5, $6
-    vqrshrn.s16     $4, $5, #3
-.endm
-
-.macro DIFF_LUMA_EQ4_P2P1P0
-    vaddl.u8    q4, $1, $2
-    vaddl.u8    q5, $3, $4
-    vadd.u16    q5, q4, q5
-
-    vaddl.u8    q4, $0, $1
-    vshl.u16    q4, q4, #1
-    vadd.u16    q4, q5, q4
-
-    vrshrn.u16      $0, q5, #2
-    vrshrn.u16      $7, q4, #3
-
-    vshl.u16    q5, q5, #1
-    vsubl.u8    q4, $5, $1
-    vadd.u16    q5, q4,q5
-
-    vaddl.u8    q4, $2, $5
-    vaddw.u8    q4, q4, $2
-    vaddw.u8    q4, q4, $3
-
-    vrshrn.u16      d10,q5, #3
-    vrshrn.u16      d8, q4, #2
-    vbsl.u8     $6, d10, d8
-.endm
-
-.macro DIFF_LUMA_EQ4_MASK
-    vmov    $3, $2
-    vbsl.u8 $3, $0, $1
-.endm
-
-.macro DIFF_CHROMA_EQ4_P0Q0
-    vaddl.u8    $4, $0, $3
-    vaddw.u8    $5, $4, $1
-    vaddw.u8    $6, $4, $2
-    vaddw.u8    $5, $5, $0
-
-    vaddw.u8    $6, $6, $3
-    vrshrn.u16      $7, $5, #2
-    vrshrn.u16      $8, $6, #2
-.endm
-
-.macro LOAD_CHROMA_DATA_4
-    vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
-    vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
-.endm
-
-.macro STORE_CHROMA_DATA_4
-    vst4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
-    vst4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
-.endm
-
-.macro LOAD_LUMA_DATA_3
-    vld3.u8 {$0[$6],$1[$6],$2[$6]}, [r2], r1
-    vld3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
-.endm
-
-.macro STORE_LUMA_DATA_4
-    vst4.u8 {$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1
-    vst4.u8 {$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
-.endm
-
-.macro STORE_LUMA_DATA_3
-    vst3.u8 {$0[$6],$1[$6],$2[$6]}, [r3], r1
-    vst3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
-.endm
-
-.macro EXTRACT_DELTA_INTO_TWO_PART
-    vcge.s8 $1, $0, #0
-    vand    $1, $0, $1
-    vsub.s8 $0, $1, $0
-.endm
-#else
 .macro JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2
     vorr.s16    \arg2, \arg0, \arg1
     vmov        r3, r2, \arg2
@@ -269,7 +150,6 @@
     vand    \arg1, \arg0, \arg1
     vsub.s8 \arg0, \arg1, \arg0
 .endm
-#endif
 
 WELS_ASM_FUNC_BEGIN DeblockLumaLt4V_neon
     vpush   {q4-q7}
@@ -841,100 +721,6 @@
     vst1.64   {d0,d1,d2}, [r0]
 WELS_ASM_FUNC_END
 
-#ifdef __APPLE__
-.macro BS_NZC_CHECK
-    vld1.8   {d0,d1}, [$0]
-    /* Arrenge the input data --- TOP */
-    ands     r6, $1, #2
-    beq      bs_nzc_check_jump0
-
-    sub      r6, $0, $2, lsl #4
-    sub      r6, r6, $2, lsl #3
-    add      r6, #12
-    vld1.32  d3[1], [r6]
-
-bs_nzc_check_jump0:
-    vext.8   q1, q1, q0, #12
-    vadd.u8  $3, q0, q1
-
-
-    /* Arrenge the input data --- LEFT */
-    ands     r6, $1, #1
-    beq      bs_nzc_check_jump1
-
-    sub      r6, $0, #21
-    add      r7, r6, #4
-    vld1.8   d3[4], [r6]
-    add      r6, r7, #4
-    vld1.8   d3[5], [r7]
-    add      r7, r6, #4
-    vld1.8   d3[6], [r6]
-    vld1.8   d3[7], [r7]
-
-bs_nzc_check_jump1:
-    vzip.8   d0, d1
-    vzip.8   d0, d1
-    vext.8   q1, q1, q0, #12
-    vadd.u8  $4, q0, q1
-.endm
-
-.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
-    mov       r6, #4
-    vabd.s16  q8, $0, $1
-    vabd.s16  q9, $1, $2
-    vdup.s16  $0, r6
-    vabd.s16  q10, $2, $3
-    vabd.s16  q11, $3, $4
-
-    vcge.s16  q8, $0
-    vcge.s16  q9, $0
-    vcge.s16  q10, $0
-    vcge.s16  q11, $0
-
-    vpadd.i16 d16, d16, d17
-    vpadd.i16 d17, d18, d19
-    vpadd.i16 d18, d20, d21
-    vpadd.i16 d19, d22, d23
-
-    vaddhn.i16  $5, q8, q8
-    vaddhn.i16  $6, q9, q9
-.endm
-
-.macro BS_MV_CHECK
-    vldm   $0, {q0,q1,q2,q3}
-
-    /* Arrenge the input data --- TOP */
-    ands     r6, $1, #2
-    beq      bs_mv_check_jump0
-
-    sub      r6, $0, $2, lsl #6
-    add      r6, #48
-    vld1.8   {d8, d9}, [r6]
-
-bs_mv_check_jump0:
-    BS_COMPARE_MV  q4, q0, q1, q2, q3, $3, $4
-
-    /* Arrenge the input data --- LEFT */
-    ands     r6, $1, #1
-    beq      bs_mv_check_jump1
-
-    sub      r6, $0, #52
-    add      r7, r6, #16
-    vld1.32   d8[0], [r6]
-    add      r6, r7, #16
-    vld1.32   d8[1], [r7]
-    add      r7, r6, #16
-    vld1.32   d9[0], [r6]
-    vld1.32   d9[1], [r7]
-
-bs_mv_check_jump1:
-    vzip.32   q0, q2
-    vzip.32   q1, q3
-    vzip.32   q0, q1
-    vzip.32   q2, q3
-    BS_COMPARE_MV  q4, q0, q1, q2, q3, $5, $6
-.endm
-#else
 .macro BS_NZC_CHECK  arg0, arg1, arg2, arg3, arg4
     vld1.8   {d0,d1}, [\arg0]
     /* Arrenge the input data --- TOP */
@@ -1027,7 +813,6 @@
     vzip.32   q2, q3
     BS_COMPARE_MV  q4, q0, q1, q2, q3, \arg5, \arg6
 .endm
-#endif
 
 
 WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon
--- a/codec/common/arm/mc_neon.S
+++ b/codec/common/arm/mc_neon.S
@@ -33,117 +33,6 @@
 #ifdef  HAVE_NEON
 #include "arm_arch_common_macro.S"
 
-#ifdef __APPLE__
-.macro AVERAGE_TWO_8BITS
-//  {   // input:dst_d, src_d A and B; working: q13
-    vaddl.u8    q13, $2, $1
-    vrshrn.u16      $0, q13, #1
-//  }
-.endm
-
-.macro FILTER_6TAG_8BITS
-//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8    q12, $0, $5 //q12=src[-2]+src[3]
-    vaddl.u8    q13, $2, $3 //src[0]+src[1]
-    vmla.u16    q12, q13, $7    //q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8    q13, $1, $4 //src[-1]+src[2]
-    vmls.s16    q12, q13, $8    //q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16        $6, q12, #5
-//  }
-.endm
-
-.macro FILTER_SINGLE_TAG_8BITS      // when width=17/9, used
-//  {   // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
-    vrev64.8    $2, $0              // X[5][4][3][2][1][0]O
-    vaddl.u8    $3, $0, $2          // each 16bits, *[50][41][32][23][14][05]*
-    vmul.s16    $0, $2, $1          // 0+1*[50]-5*[41]+20[32]
-    vpadd.s16   $0, $0, $0
-    vpadd.s16   $0, $0, $0
-    vqrshrun.s16    $0, $4, #5
-//  }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0
-//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8    q12, $0, $5 //q12=src[-2]+src[3]
-    vaddl.u8    q13, $2, $3 //src[0]+src[1]
-    vmla.u16    q12, q13, $7    //q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8    q13, $1, $4 //src[-1]+src[2]
-    vmls.s16    q12, q13, $8    //q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16        $6, q12, #5
-    vaddl.u8    q13, $2, $6
-    vrshrn.u16      $6, q13, #1
-//  }
-.endm
-
-.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1
-//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8    q12, $0, $5 //q12=src[-2]+src[3]
-    vaddl.u8    q13, $2, $3 //src[0]+src[1]
-    vmla.u16    q12, q13, $7    //q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8    q13, $1, $4 //src[-1]+src[2]
-    vmls.s16    q12, q13, $8    //q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16        $6, q12, #5
-    vaddl.u8    q13, $3, $6
-    vrshrn.u16      $6, q13, #1
-//  }
-.endm
-
-.macro FILTER_6TAG_8BITS_TO_16BITS
-//  {   // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
-    vaddl.u8    $6, $0, $5      //dst_q=src[-2]+src[3]
-    vaddl.u8    q13, $2, $3 //src[0]+src[1]
-    vmla.u16    $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8    q13, $1, $4 //src[-1]+src[2]
-    vmls.s16    $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
-//  }
-.endm
-
-.macro FILTER_3_IN_16BITS_TO_8BITS
-//  {   // input:a, b, c, dst_d;
-    vsub.s16    $0, $0, $1          //a-b
-    vshr.s16    $0, $0, #2          //(a-b)/4
-    vsub.s16    $0, $0, $1          //(a-b)/4-b
-    vadd.s16    $0, $0, $2          //(a-b)/4-b+c
-    vshr.s16    $0, $0, #2          //((a-b)/4-b+c)/4
-    vadd.s16    $0, $0, $2          //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vqrshrun.s16    $3, $0, #6      //(+32)>>6
-//  }
-.endm
-
-.macro UNPACK_2_16BITS_TO_ABC
-//  {   // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
-    vext.16 $4, $0, $1, #2      //src[0]
-    vext.16 $3, $0, $1, #3      //src[1]
-    vadd.s16    $4, $3                  //c=src[0]+src[1]
-
-    vext.16 $3, $0, $1, #1      //src[-1]
-    vext.16 $2, $0, $1, #4      //src[2]
-    vadd.s16    $3, $2                  //b=src[-1]+src[2]
-
-    vext.16 $2, $0, $1, #5      //src[3]
-    vadd.s16    $2, $0                  //a=src[-2]+src[3]
-//  }
-.endm
-
-.macro UNPACK_1_IN_8x16BITS_TO_8BITS
-//  {   // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
-    vext.16 $3, $3, $3, #7  // 0x????, [0][1][2][3][4][5],
-    vrev64.16   $1, $1
-    vadd.u16    $2, $1              // C[2+3],B[1+4],A[0+5],
-    vshr.s64    $1, $2, #16
-    vshr.s64    $0, $2, #32     // Output: C $2, B $1, A $0
-
-    vsub.s16    $0, $0, $1          //a-b
-    vshr.s16    $0, $0, #2          //(a-b)/4
-    vsub.s16    $0, $0, $1          //(a-b)/4-b
-    vadd.s16    $0, $0, $2          //(a-b)/4-b+c
-    vshr.s16    $0, $0, #2          //((a-b)/4-b+c)/4
-    vadd.s16    $1, $0, $2          //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vqrshrun.s16    $0, $3, #6      //(+32)>>6
-//  }
-.endm
-#else
 .macro AVERAGE_TWO_8BITS arg0, arg1, arg2
 //  {   // input:dst_d, src_d A and B; working: q13
     vaddl.u8    q13, \arg2, \arg1
@@ -253,7 +142,6 @@
     vqrshrun.s16    \arg0, \arg3, #6        //(+32)>>6
 //  }
 .endm
-#endif
 
 WELS_ASM_FUNC_BEGIN McHorVer20WidthEq16_neon
     push        {r4}
--- a/codec/decoder/core/arm/block_add_neon.S
+++ b/codec/decoder/core/arm/block_add_neon.S
@@ -32,41 +32,7 @@
 
 #ifdef HAVE_NEON
 #include "arm_arch_common_macro.S"
-#ifdef __APPLE__
 
-.macro ROW_TRANSFORM_1_STEP
-//  {   //  input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
-    vaddl.s16       $4, $0, $2          //int32 e[i][0] = src[0] + src[2];
-    vsubl.s16       $5, $0, $2          //int32 e[i][1] = src[0] - src[2];
-    vshr.s16        $8, $1, #1
-    vshr.s16        $9, $3, #1
-    vsubl.s16       $6, $8, $3          //int32 e[i][2] = (src[1]>>1)-src[3];
-    vaddl.s16       $7, $1, $9          //int32 e[i][3] = src[1] + (src[3]>>1);
-//  }
-.endm
-
-.macro TRANSFORM_4BYTES // both row & col transform used
-//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
-    vadd.s32        $0, $4, $7          //int16 f[i][0] = e[i][0] + e[i][3];
-    vadd.s32        $1, $5, $6          //int16 f[i][1] = e[i][1] + e[i][2];
-    vsub.s32        $2, $5, $6          //int16 f[i][2] = e[i][1] - e[i][2];
-    vsub.s32        $3, $4, $7          //int16 f[i][3] = e[i][0] - e[i][3];
-//  }
-.endm
-
-.macro COL_TRANSFORM_1_STEP
-//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
-    vadd.s32        $4, $0, $2          //int32 e[0][j] = f[0][j] + f[2][j];
-    vsub.s32        $5, $0, $2          //int32 e[1][j] = f[0][j] - f[2][j];
-    vshr.s32        $6, $1, #1
-    vshr.s32        $7, $3, #1
-    vsub.s32        $6, $6, $3          //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
-    vadd.s32        $7, $1, $7          //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//  }
-.endm
-
-#else
-
 .macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
 //  {   //  input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
     vaddl.s16       \arg4, \arg0, \arg2         //int32 e[i][0] = src[0] + src[2];
@@ -97,7 +63,6 @@
     vadd.s32        \arg7, \arg1, \arg7         //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
 //  }
 .endm
-#endif
 
 //  uint8_t *pred, const int32_t stride, int16_t *rs
 WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
--- a/codec/decoder/core/arm/intra_pred_neon.S
+++ b/codec/decoder/core/arm/intra_pred_neon.S
@@ -34,20 +34,7 @@
 //Global macro
 #include "arm_arch_common_macro.S"
 
-#ifdef __APPLE__
 //Global macro
-.macro GET_8BYTE_DATA
-    vld1.8 {$0[0]}, [$1], $2
-    vld1.8 {$0[1]}, [$1], $2
-    vld1.8 {$0[2]}, [$1], $2
-    vld1.8 {$0[3]}, [$1], $2
-    vld1.8 {$0[4]}, [$1], $2
-    vld1.8 {$0[5]}, [$1], $2
-    vld1.8 {$0[6]}, [$1], $2
-    vld1.8 {$0[7]}, [$1], $2
-.endmacro
-#else
-//Global macro
 .macro GET_8BYTE_DATA arg0, arg1, arg2
     vld1.8 {\arg0[0]}, [\arg1], \arg2
     vld1.8 {\arg0[1]}, [\arg1], \arg2
@@ -58,7 +45,6 @@
     vld1.8 {\arg0[6]}, [\arg1], \arg2
     vld1.8 {\arg0[7]}, [\arg1], \arg2
 .endm
-#endif
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon
--- a/codec/encoder/core/arm/intra_pred_neon.S
+++ b/codec/encoder/core/arm/intra_pred_neon.S
@@ -34,20 +34,7 @@
 
 #include "arm_arch_common_macro.S"
 
-#ifdef __APPLE__
 //Global macro
-.macro GET_8BYTE_DATA
-    vld1.8 {$0[0]}, [$1], $2
-    vld1.8 {$0[1]}, [$1], $2
-    vld1.8 {$0[2]}, [$1], $2
-    vld1.8 {$0[3]}, [$1], $2
-    vld1.8 {$0[4]}, [$1], $2
-    vld1.8 {$0[5]}, [$1], $2
-    vld1.8 {$0[6]}, [$1], $2
-    vld1.8 {$0[7]}, [$1], $2
-.endm
-#else
-//Global macro
 .macro GET_8BYTE_DATA arg0, arg1, arg2
     vld1.8 {\arg0[0]}, [\arg1], \arg2
     vld1.8 {\arg0[1]}, [\arg1], \arg2
@@ -58,7 +45,6 @@
     vld1.8 {\arg0[6]}, [\arg1], \arg2
     vld1.8 {\arg0[7]}, [\arg1], \arg2
 .endm
-#endif
 
 
 WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon
--- a/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
+++ b/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
@@ -34,67 +34,7 @@
 #include "arm_arch_common_macro.S"
 
 
-#ifdef __APPLE__
  //The data sequence will be used
-.macro GET_8BYTE_DATA_L0
-    vld1.8 {$0[0]}, [$1], $2
-    vld1.8 {$0[1]}, [$1], $2
-    vld1.8 {$0[2]}, [$1], $2
-    vld1.8 {$0[3]}, [$1], $2
-    vld1.8 {$0[4]}, [$1], $2
-    vld1.8 {$0[5]}, [$1], $2
-    vld1.8 {$0[6]}, [$1], $2
-    vld1.8 {$0[7]}, [$1], $2
-.endm
-
-
-.macro HDM_TRANSFORM_4X4_L0
-
-    //Do the vertical transform
-    vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13}
-    vsubl.u8 q1, $0, $1 //{2,6,10,14,3,7,11,15}
-    vswp  d1, d2
-    vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
-    vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
-
-    //Do the horizontal transform
-    vtrn.32 q2, q1
-    vadd.s16 q0, q2, q1
-    vsub.s16 q1, q2, q1
-
-    vtrn.16 q0, q1
-    vadd.s16 q2, q0, q1
-    vsub.s16 q1, q0, q1
-
-    vmov.s16 d0, d4
-    vmov.s16 d1, d2
-
-    vabs.s16 d3, d3
-
-    //16x16_v
-    vtrn.32 d0, d1 //{0,1,3,2}
-    vaba.s16 $5, d0, $2 //16x16_v
-    vaba.s16 $5, d1, $8
-    vaba.s16 $5, d5, $8
-    vadd.u16 $5, d3
-
-    //16x16_h
-    vtrn.16 d4, d5 //{0,4,12,8}
-    vaba.s16 $6, d4, $3 //16x16_h
-    vabs.s16 d2, d2
-    vabs.s16 d5, d5
-    vadd.u16 d2, d3
-    vadd.u16 d2, d5
-    vadd.u16 $6, d2
-
-    //16x16_dc_both
-    vaba.s16 $7, d4, $4 //16x16_dc_both
-    vadd.u16 $7, d2
-
-.endm
-
-#else
- //The data sequence will be used
 .macro GET_8BYTE_DATA_L0 arg0, arg1, arg2
     vld1.8 {\arg0[0]}, [\arg1], \arg2
     vld1.8 {\arg0[1]}, [\arg1], \arg2
@@ -149,7 +89,6 @@
     vaba.s16 \arg7, d4, \arg4 //16x16_dc_both
     vadd.u16 \arg7, d2
 .endm
-#endif
 
 WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Satd_neon
     stmdb sp!, {r4-r7, lr}
--- a/codec/encoder/core/arm/reconstruct_neon.S
+++ b/codec/encoder/core/arm/reconstruct_neon.S
@@ -33,251 +33,6 @@
 #ifdef  HAVE_NEON
 #include "arm_arch_common_macro.S"
 
-#ifdef __APPLE__
-.macro LOAD_4x4_DATA_FOR_DCT
-//  {   //  input: $0~$3, src1*, src1_stride, src2*, src2_stride
-    vld2.16 {$0[0],$1[0]}, [$4], $5
-    vld2.16 {$2[0],$3[0]}, [$6], $7
-    vld2.16 {$0[1],$1[1]}, [$4], $5
-    vld2.16 {$2[1],$3[1]}, [$6], $7
-
-    vld2.16 {$0[2],$1[2]}, [$4], $5
-    vld2.16 {$2[2],$3[2]}, [$6], $7
-    vld2.16 {$0[3],$1[3]}, [$4], $5
-    vld2.16 {$2[3],$3[3]}, [$6], $7
-//  }
-.endm
-
-.macro LOAD_8x8_DATA_FOR_DCT
-//  {   //  input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
-    vld1.64 {$0}, [$8], r2
-    vld1.64 {$4}, [$9], r4
-    vld1.64 {$1}, [$8], r2
-    vld1.64 {$5}, [$9], r4
-
-    vld1.64 {$2}, [$8], r2
-    vld1.64 {$6}, [$9], r4
-    vld1.64 {$3}, [$8], r2
-    vld1.64 {$7}, [$9], r4
-//  }
-.endm
-
-.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
-//  {   //  input: src_d[0]~[3], working: [4]~[7]
-    vadd.s16        $4, $0, $3          //int16 s[0] = data[i] + data[i3];
-    vsub.s16        $7, $0, $3          //int16 s[3] = data[i] - data[i3];
-    vadd.s16        $5, $1, $2          //int16 s[1] = data[i1] + data[i2];
-    vsub.s16        $6, $1, $2          //int16 s[2] = data[i1] - data[i2];
-
-    vadd.s16        $0, $4, $5          //int16 dct[i ] = s[0] + s[1];
-    vsub.s16        $2, $4, $5          //int16 dct[i2] = s[0] - s[1];
-    vshl.s16        $1, $7, #1
-    vshl.s16        $3, $6, #1
-    vadd.s16        $1, $1, $6          //int16 dct[i1] = (s[3] << 1) + s[2];
-    vsub.s16        $3, $7, $3          //int16 dct[i3] = s[3] - (s[2] << 1);
-//  }
-.endm
-
-.macro MATRIX_TRANSFORM_EACH_16BITS
-//  {   //  input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
-    vtrn.s16        $0, $1              //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
-    vtrn.s16        $2, $3              //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
-    vtrn.32     $0, $2              //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
-    vtrn.32     $1, $3              //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-//  }
-.endm
-
-.macro NEWQUANT_COEF_EACH_16BITS    // if coef <= 0, - coef; else , coef;
-//  {   //  input:  coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
-    veor.s16        $6, $6          // init 0 , and keep 0;
-    vaba.s16        $1, $0, $6      // f + abs(coef - 0)
-    vmull.s16       $7, $2, $4
-    vmull.s16       $8, $3, $5
-    vshr.s32        $7, #16
-    vshr.s32        $8, #16
-    vmovn.s32       $2, $7
-    vmovn.s32       $3, $8
-
-    vcgt.s16        $7, $0, #0      // if true, location of coef == 11111111
-    vbif.s16        $6, $1, $7      // if (x<0) reserved part; else keep 0 untouched
-    vshl.s16        $6, #1
-    vsub.s16        $1, $1, $6      // if x > 0, -= 0; else x-= 2x
-//  }
-.endm
-
-.macro NEWQUANT_COEF_EACH_16BITS_MAX    // if coef <= 0, - coef; else , coef;
-//  {   //  input:  coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
-    veor.s16        $6, $6          // init 0 , and keep 0;
-    vaba.s16        $1, $0, $6      // f + abs(coef - 0)
-    vmull.s16       $7, $2, $4
-    vmull.s16       $8, $3, $5
-    vshr.s32        $7, #16
-    vshr.s32        $8, #16
-    vmovn.s32       $2, $7
-    vmovn.s32       $3, $8
-
-    vcgt.s16        $7, $0, #0      // if true, location of coef == 11111111
-    vbif.s16        $6, $1, $7      // if (x<0) reserved part; else keep 0 untouched
-    vshl.s16        $6, #1
-    vmax.s16        $9, $2, $3
-    vsub.s16        $1, $1, $6      // if x > 0, -= 0; else x-= 2x
-//  }
-.endm
-
-.macro QUANT_DUALWORD_COEF_EACH_16BITS  // if coef <= 0, - coef; else , coef;
-//  {   //  input:  coef, ff (dst), mf , working_d (all 0), working_q
-    vaba.s16        $1, $0, $3      // f + abs(coef - 0)
-    vmull.s16       $4, $1, $2      // *= mf
-    vshr.s32        $4, #16
-    vmovn.s32       $1, $4          // >> 16
-
-    vcgt.s16        $2, $0, #0      // if true, location of coef == 11111111
-    vbif.s16        $3, $1, $2      // if (x<0) reserved part; else keep 0 untouched
-    vshl.s16        $3, #1
-    vsub.s16        $1, $1, $3      // if x > 0, -= 0; else x-= 2x
-//  }
-.endm
-
-.macro DC_ZERO_COUNT_IN_DUALWORD
-//  {   //  input:  coef, dst_d, working_d (all 0x01)
-    vceq.s16    $1, $0, #0
-    vand.s16    $1, $2
-    vpadd.s16   $1, $1, $1
-    vpadd.s16   $1, $1, $1
-//  }
-.endm
-
-.macro SELECT_MAX_IN_ABS_COEF
-//  {   //  input:  coef_0, coef_1, max_q (identy to follow two)
-    vmax.s16        $2, $0, $1      // max 1st in $3 & max 2nd in $4
-    vpmax.s16       $3, $3, $4      // max 1st in $3[0][1] & max 2nd in $3[2][3]
-    vpmax.s16       $3, $3, $4      // max 1st in $3[0][1]
-//  }
-.endm
-
-.macro ZERO_COUNT_IN_2_QUARWORD
-//  {   //  input:  coef_0 (identy to $3 $4), coef_1(identy to $5 $6), mask_q
-    vceq.s16    $0, #0
-    vceq.s16    $1, #0
-    vand.s16    $0, $2
-    vand.s16    $1, $2
-
-    vpadd.s16   $3, $3, $5
-    vpadd.s16   $4, $4, $6
-    vpadd.s16   $3, $3, $4      // 8-->4
-    vpadd.s16   $3, $3, $3
-    vpadd.s16   $3, $3, $3
-//  }
-.endm
-
-.macro HDM_QUANT_2x2_TOTAL_16BITS
-//  {   //  input: src_d[0]~[3], working_d, dst_d
-    vshr.s64    $1, $0, #32
-    vadd.s16    $2, $0, $1      // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
-    vsub.s16    $1, $0, $1      // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
-    vtrn.s16    $2, $1
-    vtrn.s32    $2, $1
-//  }
-.endm
-
-.macro IHDM_4x4_TOTAL_16BITS
-//  {   //  input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
-    vshr.s64    $1, $0, #32
-    vadd.s16    $2, $0, $1      // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
-    vsub.s16    $1, $0, $1      // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
-    vtrn.s16    $2, $1
-    vrev32.16   $1, $1
-    vtrn.s32    $2, $1          // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
-
-    vrev64.16   $1, $2
-    vadd.s16    $0, $2, $1      // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
-    vsub.s16    $1, $2, $1
-    vrev32.16   $1, $1          // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
-    vtrn.s32    $0, $1          // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
-//  }
-.endm
-
-.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
-//  {   //  input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
-    vmovl.u8        $4,$0
-    vmovl.u8        $5,$1
-    vadd.s16        $4,$2
-    vadd.s16        $5,$3
-    vqmovun.s16 $0,$4
-    vqmovun.s16 $1,$5
-//  }
-.endm
-
-.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
-//  {   //  input: src_d[0]~[3], output: e_d[0]~[3];
-    vadd.s16        $4, $0, $2          //int16 e[i][0] = src[0] + src[2];
-    vsub.s16        $5, $0, $2          //int16 e[i][1] = src[0] - src[2];
-    vshr.s16        $6, $1, #1
-    vshr.s16        $7, $3, #1
-    vsub.s16        $6, $6, $3          //int16 e[i][2] = (src[1]>>1)-src[3];
-    vadd.s16        $7, $1, $7          //int16 e[i][3] = src[1] + (src[3]>>1);
-//  }
-.endm
-
-.macro TRANSFORM_TOTAL_16BITS   // both row & col transform used
-//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
-    vadd.s16        $0, $4, $7          //int16 f[i][0] = e[i][0] + e[i][3];
-    vadd.s16        $1, $5, $6          //int16 f[i][1] = e[i][1] + e[i][2];
-    vsub.s16        $2, $5, $6          //int16 f[i][2] = e[i][1] - e[i][2];
-    vsub.s16        $3, $4, $7          //int16 f[i][3] = e[i][0] - e[i][3];
-//  }
-.endm
-
-
-.macro ROW_TRANSFORM_0_STEP
-//  {   //  input: src_d[0]~[3], output: e_q[0]~[3];
-    vaddl.s16       $4, $0, $2          //int32 e[i][0] = src[0] + src[2];
-    vsubl.s16       $5, $0, $2          //int32 e[i][1] = src[0] - src[2];
-    vsubl.s16       $6, $1, $3          //int32 e[i][2] = src[1] - src[3];
-    vaddl.s16       $7, $1, $3          //int32 e[i][3] = src[1] + src[3];
-//  }
-.endm
-
-.macro ROW_TRANSFORM_1_STEP
-//  {   //  input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
-    vaddl.s16       $4, $0, $2          //int32 e[i][0] = src[0] + src[2];
-    vsubl.s16       $5, $0, $2          //int32 e[i][1] = src[0] - src[2];
-    vshr.s16        $8, $1, #1
-    vshr.s16        $9, $3, #1
-    vsubl.s16       $6, $8, $3          //int32 e[i][2] = (src[1]>>1)-src[3];
-    vaddl.s16       $7, $1, $9          //int32 e[i][3] = src[1] + (src[3]>>1);
-//  }
-.endm
-
-.macro TRANSFORM_4BYTES // both row & col transform used
-//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
-    vadd.s32        $0, $4, $7          //int16 f[i][0] = e[i][0] + e[i][3];
-    vadd.s32        $1, $5, $6          //int16 f[i][1] = e[i][1] + e[i][2];
-    vsub.s32        $2, $5, $6          //int16 f[i][2] = e[i][1] - e[i][2];
-    vsub.s32        $3, $4, $7          //int16 f[i][3] = e[i][0] - e[i][3];
-//  }
-.endm
-
-.macro COL_TRANSFORM_0_STEP
-//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
-    vadd.s32        $4, $0, $2          //int32 e[0][j] = f[0][j] + f[2][j];
-    vsub.s32        $5, $0, $2          //int32 e[1][j] = f[0][j] - f[2][j];
-    vsub.s32        $6, $1, $3          //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
-    vadd.s32        $7, $1, $3          //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//  }
-.endm
-
-.macro COL_TRANSFORM_1_STEP
-//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
-    vadd.s32        $4, $0, $2          //int32 e[0][j] = f[0][j] + f[2][j];
-    vsub.s32        $5, $0, $2          //int32 e[1][j] = f[0][j] - f[2][j];
-    vshr.s32        $6, $1, #1
-    vshr.s32        $7, $3, #1
-    vsub.s32        $6, $6, $3          //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
-    vadd.s32        $7, $1, $7          //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//  }
-.endm
-#else
 .macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
 //  {   //  input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride
     vld2.16 {\arg0[0],\arg1[0]}, [\arg4], \arg5
@@ -521,7 +276,6 @@
     vadd.s32        \arg7, \arg1, \arg7         //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
 //  }
 .endm
-#endif
 
 
 WELS_ASM_FUNC_BEGIN WelsDctT4_neon
--- a/codec/processing/src/arm/adaptive_quantization.S
+++ b/codec/processing/src/arm/adaptive_quantization.S
@@ -33,14 +33,6 @@
 #ifdef HAVE_NEON
 #include "arm_arch_common_macro.S"
 
-#ifdef __APPLE__
-.macro SQR_ADD_16BYTES
-    vmull.u8 q3, $0, $0
-    vmull.u8 q8, $1, $1
-    vpadal.u16 $2, q3
-    vpadal.u16 $2, q8
-.endm
-#else
 .macro SQR_ADD_16BYTES arg0, arg1, arg2
     vmull.u8 q3, \arg0, \arg0
     vmull.u8 q8, \arg1, \arg1
@@ -47,7 +39,6 @@
     vpadal.u16 \arg2, q3
     vpadal.u16 \arg2, q8
 .endm
-#endif
 
 
 WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon
--- a/codec/processing/src/arm/vaa_calc_neon.S
+++ b/codec/processing/src/arm/vaa_calc_neon.S
@@ -33,38 +33,6 @@
 #ifdef HAVE_NEON
 #include "arm_arch_common_macro.S"
 
-#ifdef __APPLE__
-
-.macro ABS_SUB_SUM_16BYTES
-    vld1.32 {q15}, [$0], $2
-    vld1.32 {q14}, [$1], $2
-    vabal.u8 $3, d30, d28
-    vabal.u8 $4, d31, d29
-.endm
-
-.macro ABS_SUB_SUM_8x16BYTES
-    vld1.32 {q15}, [$0], $2
-    vld1.32 {q14}, [$1], $2
-    vabdl.u8 $3, d30, d28
-    vabdl.u8 $4, d31, d29
-
-    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-.endm
-
-.macro SAD_8X16BITS
-    vadd.u16 d31, $0, $1
-    vpaddl.u16 d31, d31
-    vpaddl.u32 $2, d31
-.endm
-
-#else
-
 .macro ABS_SUB_SUM_16BYTES arg0, arg1, arg2, arg3, arg4
     vld1.32 {q15}, [\arg0], \arg2
     vld1.32 {q14}, [\arg1], \arg2
@@ -92,7 +60,6 @@
     vpaddl.u16 d31, d31
     vpaddl.u32 \arg2, d31
 .endm
-#endif
 
 
 WELS_ASM_FUNC_BEGIN VAACalcSad_neon
@@ -159,52 +126,6 @@
 WELS_ASM_FUNC_END
 
 
-#ifdef __APPLE__
-.macro SAD_SD_MAD_16BYTES
-    vld1.32 {q0}, [$0], $2
-    vld1.32 {q1}, [$1], $2
-
-    vpadal.u8 $3, q0
-    vpadal.u8 $4, q1
-
-    vabd.u8 q0, q0, q1
-    vmax.u8 $5, q0
-    vpadal.u8 $6, q0
-.endm
-
-.macro SAD_SD_MAD_8x16BYTES
-    vld1.32 {q0}, [$0], $2
-    vld1.32 {q1}, [$1], $2
-
-    vpaddl.u8 q2, q0
-    vpaddl.u8 q3, q1
-
-    vabd.u8 $3, q0, q1
-    vpaddl.u8 $4, $3       //abs_diff
-
-
-    SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
-    SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
-    SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
-    SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
-    SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
-    SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
-    SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
-
-    vsub.u16 $5, q2, q3
-.endm
-
-.macro SAD_SD_MAD_CALC
-    vpmax.u8 d0, $0, $1 //8bytes
-    vpmax.u8 d0, d0, d0 //4bytes
-    vpmax.u8 $2, d0, d0 //2bytes
-
-    vpaddl.u16 $3, $3
-    vpaddl.u32 $3, $3
-    vpaddl.s16 $4, $4
-    vpaddl.s32 $4, $4
-.endm
-#else
 .macro SAD_SD_MAD_16BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6
     vld1.32 {q0}, [\arg0], \arg2
     vld1.32 {q1}, [\arg1], \arg2
@@ -249,7 +170,6 @@
     vpaddl.s16 \arg4, \arg4
     vpaddl.s32 \arg4, \arg4
 .endm
-#endif
 
 WELS_ASM_FUNC_BEGIN VAACalcSadBgd_neon
 
@@ -313,165 +233,6 @@
 WELS_ASM_FUNC_END
 
 
-#ifdef __APPLE__
-.macro SSD_MUL_SUM_16BYTES_RESET
-    vmull.u8 $3, $0, $0
-    vpaddl.u16 $2, $3
-
-    vmull.u8 $3, $1, $1
-    vpadal.u16 $2, $3
-.endm
-
-.macro SSD_MUL_SUM_16BYTES
-    vmull.u8 $3, $0, $0
-    vpadal.u16 $2, $3
-
-    vmull.u8 $3, $1, $1
-    vpadal.u16 $2, $3
-.endm
-
-.macro SAD_SSD_BGD_16
-    vld1.8 {q0}, [$0], $2 //load cur_row
-
-    vpadal.u8 q3, q0    //add cur_row together
-    vpadal.u8 q4, q1    //add ref_row together
-
-    vabd.u8 q2, q0, q1  //abs_diff
-
-    vmax.u8 q5, q2                              //l_mad for 16 bytes reset for every 8x16
-
-    vpadal.u8 $3, q2                            //l_sad for 16 bytes reset for every 8x16
-
-    SSD_MUL_SUM_16BYTES d4,d5, q8, q11          //q8 for l_sqiff    reset for every 16x16
-
-    vld1.8 {q1}, [$1], $2 //load ref_row
-    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
-
-    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
-.endm
-
-//the last row of a 16x16 block
-.macro SAD_SSD_BGD_16_end
-    vld1.8 {q0}, [$0], $1 //load cur_row
-
-    vpadal.u8 q3, q0    //add cur_row together
-    vpadal.u8 q4, q1    //add ref_row together
-
-    vabd.u8 q2, q0, q1  //abs_diff
-
-    vmax.u8 q5, q2                              //l_mad for 16 bytes reset for every 8x16
-
-    vpadal.u8 $2, q2                            //l_sad for 16 bytes reset for every 8x16
-
-    SSD_MUL_SUM_16BYTES d4,d5, q8, q11          //q8 for l_sqiff    reset for every 16x16
-
-    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
-
-    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
-.endm
-
-//for the begin of a 8x16 block, use some instructions to reset the register
-.macro SAD_SSD_BGD_16_RESET_8x8
-    vld1.8 {q0}, [$0], $2 //load cur_row
-
-    vpaddl.u8 q3, q0    //add cur_row together
-    vpaddl.u8 q4, q1    //add ref_row together
-
-    vabd.u8 q2, q0, q1  //abs_diff
-
-    vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
-
-    vpaddl.u8 $3, q2                            //l_sad for 16 bytes reset for every 8x16
-
-
-    SSD_MUL_SUM_16BYTES d4,d5, q8, q11          //q8 for l_sqiff    reset for every 16x16
-
-    vld1.8 {q1}, [$1], $2 //load ref_row
-
-    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
-
-    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
-.endm
-
-//for the begin of a 16x16 block, use some instructions to reset the register
-.macro SAD_SSD_BGD_16_RESET_16x16
-    vld1.8 {q0}, [$0], $2 //load cur_row
-    vld1.8 {q1}, [$1], $2 //load ref_row
-
-    vpaddl.u8 q3, q0    //add cur_row together
-    vpaddl.u8 q4, q1    //add ref_row together
-
-    vabd.u8 q2, q0, q1  //abs_diff
-
-    vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
-
-    vpaddl.u8 $3, q2                            //l_sad for 16 bytes reset for every 8x16
-
-    SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11         //q8 for l_sqiff    reset for every 16x16
-
-    vld1.8 {q1}, [$1], $2 //load ref_row
-
-    vpaddl.u8 q9, q0                                //q9 for l_sum      reset for every 16x16
-
-    SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11         //q10 for lsqsum    reset for every 16x16
-.endm
-
-//for each 8x16 block
-.macro SAD_SSD_BGD_CALC_8x16
-
-    vpmax.u8 d10, d10, d11 //4 numbers
-    vpmax.u8 d10, d10, d10 //2 numbers
-    vpmax.u8 d10, d10, d10 //1 number1
-
-    vmov $0, d10            //d26 d27 keeps the l_mad
-
-    //p_sd8x8           fix me
-    vpaddl.u16 q3, q3
-    vpaddl.u16 q4, q4
-
-    vsub.i32 $1, q3, q4
-    vpaddl.u32 $1, $1
-
-    //psad8x8
-    vpaddl.u16 $2, $2
-    vpaddl.u32 $2, $2
-
-    //psadframe
-    vadd.i32 q12, $2
-.endm
-
-.macro SAD_SSD_BGD_16x16
-    //for one 8x16
-    SAD_SSD_BGD_16_RESET_16x16 $0, $1, $2, q6
-    SAD_SSD_BGD_16 $0, $1, $2, q6
-    SAD_SSD_BGD_16 $0, $1, $2, q6
-    SAD_SSD_BGD_16 $0, $1, $2, q6
-    SAD_SSD_BGD_16 $0, $1, $2, q6
-    SAD_SSD_BGD_16 $0, $1, $2, q6
-    SAD_SSD_BGD_16 $0, $1, $2, q6
-    SAD_SSD_BGD_16 $0, $1, $2, q6
-
-    SAD_SSD_BGD_CALC_8x16 d26, q14, q6
-
-    //for another 8x16
-    SAD_SSD_BGD_16_RESET_8x8 $0, $1, $2, q7
-    SAD_SSD_BGD_16 $0, $1, $2, q7
-    SAD_SSD_BGD_16 $0, $1, $2, q7
-    SAD_SSD_BGD_16 $0, $1, $2, q7
-    SAD_SSD_BGD_16 $0, $1, $2, q7
-    SAD_SSD_BGD_16 $0, $1, $2, q7
-    SAD_SSD_BGD_16 $0, $1, $2, q7
-    SAD_SSD_BGD_16_end $0, $2, q7
-
-    SAD_SSD_BGD_CALC_8x16 d27, q15, q7
-.endm
-
-.macro SSD_SAD_SD_MAD_PADDL
-    vpaddl.s16 $0, $0
-    vpaddl.s32 $0, $0
-    vadd.i32 $1, $1, $2
-.endm
-#else
 .macro SSD_MUL_SUM_16BYTES_RESET arg0, arg1, arg2, arg3
     vmull.u8   \arg3, \arg0, \arg0
     vpaddl.u16 \arg2, \arg3
@@ -629,7 +390,6 @@
     vpaddl.s32 \arg0, \arg0
     vadd.i32 \arg1, \arg1, \arg2
 .endm
-#endif
 
 
 WELS_ASM_FUNC_BEGIN VAACalcSadSsdBgd_neon
@@ -711,105 +471,6 @@
 WELS_ASM_FUNC_END
 
 
-#ifdef __APPLE__
-.macro SAD_VAR_16
-    vld1.8 {q0}, [$0], $2 //load cur_row
-
-    vpadal.u8 q3, q0    //add cur_row together
-    vpadal.u8 q4, q1    //add ref_row together
-
-    vabd.u8 q2, q0, q1  //abs_diff
-
-    vpadal.u8 $3, q2                            //l_sad for 16 bytes reset for every 8x16
-
-    vld1.8 {q1}, [$1], $2
-
-    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
-
-    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
-.endm
-
-.macro SAD_VAR_16_END
-    vld1.8 {q0}, [$0], $1 //load cur_row
-
-    vpadal.u8 q3, q0    //add cur_row together
-    vpadal.u8 q4, q1    //add ref_row together
-
-    vabd.u8 q2, q0, q1  //abs_diff
-
-    vpadal.u8 $2, q2                            //l_sad for 16 bytes reset for every 8x16
-
-    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
-
-    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
-.endm
-
-.macro SAD_VAR_16_RESET_16x16
-    vld1.8 {q0}, [$0], $2 //load cur_row
-    vld1.8 {q1}, [$1], $2
-
-    vpaddl.u8 q3, q0    //add cur_row together
-    vpaddl.u8 q4, q1    //add ref_row together
-
-    vabd.u8 q2, q0, q1  //abs_diff
-
-    vpaddl.u8 $3, q2                            //l_sad for 16 bytes reset for every 8x16
-
-    vld1.8 {q1}, [$1], $2
-
-    vpaddl.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
-
-    SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
-.endm
-
-.macro SAD_VAR_16_RESET_8x8
-    vld1.8 {q0}, [$0], $2 //load cur_row
-
-    vpaddl.u8 q3, q0    //add cur_row together
-    vpaddl.u8 q4, q1    //add ref_row together
-
-    vabd.u8 q2, q0, q1  //abs_diff
-
-    vpaddl.u8 $3, q2                            //l_sad for 16 bytes reset for every 8x16
-
-    vld1.8 {q1}, [$1], $2
-
-    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
-
-    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
-.endm
-
-.macro SAD_VAR_16x16
-    //for one 8x16
-    SAD_VAR_16_RESET_16x16 $0, $1, $2, q6
-    SAD_VAR_16 $0, $1, $2, q6
-    SAD_VAR_16 $0, $1, $2, q6
-    SAD_VAR_16 $0, $1, $2, q6
-    SAD_VAR_16 $0, $1, $2, q6
-    SAD_VAR_16 $0, $1, $2, q6
-    SAD_VAR_16 $0, $1, $2, q6
-    SAD_VAR_16 $0, $1, $2, q6
-
-    vpaddl.u16 q6, q6
-    vpaddl.u32 q6, q6
-    vadd.i32 q12, q6
-
-    //for another 8x16
-    SAD_VAR_16_RESET_8x8 $0, $1, $2, q7
-    SAD_VAR_16 $0, $1, $2, q7
-    SAD_VAR_16 $0, $1, $2, q7
-    SAD_VAR_16 $0, $1, $2, q7
-    SAD_VAR_16 $0, $1, $2, q7
-    SAD_VAR_16 $0, $1, $2, q7
-    SAD_VAR_16 $0, $1, $2, q7
-    SAD_VAR_16_END $0, $2, q7
-
-    vpaddl.u16 q7, q7
-    vpaddl.u32 q7, q7
-
-    vadd.i32 q12, q7
-.endm
-#else
 .macro SAD_VAR_16 arg0, arg1, arg2, arg3
     vld1.8 {q0}, [\arg0], \arg2 //load cur_row
 
@@ -908,7 +569,6 @@
 
     vadd.i32 q12, q7
 .endm
-#endif
 
 
 WELS_ASM_FUNC_BEGIN VAACalcSadVar_neon
@@ -970,62 +630,6 @@
 WELS_ASM_FUNC_END
 
 
-#ifdef __APPLE__
-.macro SAD_SSD_16
-    SAD_VAR_16 $0, $1, $2, $3
-
-    SSD_MUL_SUM_16BYTES d4,d5,q8, q11
-.endm
-
-.macro SAD_SSD_16_END
-    SAD_VAR_16_END $0, $1, $2
-
-    SSD_MUL_SUM_16BYTES d4,d5,q8, q11           //q8 for l_sqiff    reset for every 16x16
-.endm
-
-.macro SAD_SSD_16_RESET_16x16
-    SAD_VAR_16_RESET_16x16 $0, $1, $2, $3
-
-    SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11         //q8 for l_sqiff    reset for every 16x16
-.endm
-
-.macro SAD_SSD_16_RESET_8x8
-    SAD_VAR_16_RESET_8x8 $0, $1, $2, $3
-
-    SSD_MUL_SUM_16BYTES d4,d5,q8, q11           //q8 for l_sqiff    reset for every 16x16
-.endm
-
-.macro SAD_SSD_16x16
-    //for one 8x16
-    SAD_SSD_16_RESET_16x16 $0, $1, $2, q6
-    SAD_SSD_16 $0, $1, $2, q6
-    SAD_SSD_16 $0, $1, $2, q6
-    SAD_SSD_16 $0, $1, $2, q6
-    SAD_SSD_16 $0, $1, $2, q6
-    SAD_SSD_16 $0, $1, $2, q6
-    SAD_SSD_16 $0, $1, $2, q6
-    SAD_SSD_16 $0, $1, $2, q6
-
-    vpaddl.u16 q6, q6
-    vpaddl.u32 q6, q6
-    vadd.i32 q12, q6
-
-    //for another 8x16
-    SAD_SSD_16_RESET_8x8 $0, $1, $2, q7
-    SAD_SSD_16 $0, $1, $2, q7
-    SAD_SSD_16 $0, $1, $2, q7
-    SAD_SSD_16 $0, $1, $2, q7
-    SAD_SSD_16 $0, $1, $2, q7
-    SAD_SSD_16 $0, $1, $2, q7
-    SAD_SSD_16 $0, $1, $2, q7
-    SAD_SSD_16_END $0, $2, q7
-
-    vpaddl.u16 q7, q7
-    vpaddl.u32 q7, q7
-
-    vadd.i32 q12, q7
-.endm
-#else
 .macro SAD_SSD_16 arg0, arg1, arg2, arg3
     SAD_VAR_16 \arg0, \arg1, \arg2, \arg3
 
@@ -1080,7 +684,6 @@
 
     vadd.i32 q12, q7
 .endm
-#endif
 
 
 WELS_ASM_FUNC_BEGIN VAACalcSadSsd_neon