ref: c65e2860361c8df6d0223f4c5ec470ceb4862426
parent: f57bb5042ad88215ba50eaa97624424cb46880e5
parent: 66f58e8357dd3ecdbb90330e49f56c62efed12c0
author: volvet <qizh@cisco.com>
date: Fri Apr 25 05:07:43 EDT 2014
Merge pull request #738 from mstorsjo/gnu-aarch64 Fix building the aarch64 assembly using gnu binutils
--- a/codec/common/arm64/arm_arch64_common_macro.S
+++ b/codec/common/arm64/arm_arch64_common_macro.S
@@ -39,11 +39,10 @@
.endm
.macro WELS_ASM_ARCH64_FUNC_END
-ret lr
+ret // no operand: ret branches to x30, the link register
.endm
#else
-.syntax unified
.section .note.GNU-stack,"",%progbits // Mark stack as non-executable
.text
@@ -56,7 +55,23 @@
.endm
.macro WELS_ASM_ARCH64_FUNC_END
-ret lr
+ret // no operand: ret branches to x30, the link register
.endfunc
+.endm
+
+.macro mov.16b arg0, arg1
+ mov \arg0\().16b, \arg1\().16b // mov.16b vD, vS expands to mov vD.16b, vS.16b
+.endm
+
+.macro mov.8b arg0, arg1
+ mov \arg0\().8b, \arg1\().8b // mov.8b vD, vS expands to mov vD.8b, vS.8b
+.endm
+
+.macro ext.16b arg0, arg1, arg2, arg3
+ ext \arg0\().16b, \arg1\().16b, \arg2\().16b, \arg3 // ext.16b vD, vN, vM, #imm expands to ext vD.16b, vN.16b, vM.16b, #imm
+.endm
+
+.macro ext.8b arg0, arg1, arg2, arg3
+ ext \arg0\().8b, \arg1\().8b, \arg2\().8b, \arg3 // ext.8b vD, vN, vM, #imm expands to ext vD.8b, vN.8b, vM.8b, #imm
.endm
#endif
--- a/codec/common/arm64/mc_aarch64_neon.S
+++ b/codec/common/arm64/mc_aarch64_neon.S
@@ -213,148 +213,148 @@
#else
.macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
- uaddl v18.8h, \arg0.8b, \arg5.8b //v18=src[-2]+src[3]
- uaddl v19.8h, \arg2.8b, \arg3.8b //src[0]+src[1]
- mla v18.8h, v19.8h, \arg7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
- uaddl v19.8h, \arg1.8b, \arg4.8b //src[-1]+src[2]
- mls v18.8h, v19.8h, \arg8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
- sqrshrun \arg6.8b, v18.8h, #5
+ uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
+ uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
+ mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
+ uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
+ mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
+ sqrshrun \arg6\().8b, v18.8h, #5 //dst_d = (v18+16)>>5, saturated to unsigned 8 bits
// }
.endm
.macro FILTER_6TAG_8BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
- uaddl2 v18.8h, \arg0.16b, \arg5.16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, \arg2.16b, \arg3.16b //src[0]+src[1]
- mla v18.8h, v19.8h, \arg7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
- uaddl2 v19.8h, \arg1.16b, \arg4.16b //src[-1]+src[2]
- mls v18.8h, v19.8h, \arg8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
- sqrshrun2 \arg6.16b, v18.8h, #5
+ uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
+ uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
+ mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
+ uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
+ mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
+ sqrshrun2 \arg6\().16b, v18.8h, #5 //(v18+16)>>5, saturated to unsigned 8 bits, written to the upper half of dst
// }
.endm
.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
- uaddl v18.8h, \arg0.8b, \arg5.8b //v18=src[-2]+src[3]
- uaddl v19.8h, \arg2.8b, \arg3.8b //src[0]+src[1]
- mla v18.8h, v19.8h, \arg7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
- uaddl v19.8h, \arg1.8b, \arg4.8b //src[-1]+src[2]
- mls v18.8h, v19.8h, \arg8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
- sqrshrun \arg6.8b, v18.8h, #5
- uaddl v19.8h, \arg2.8b, \arg6.8b
- rshrn \arg6.8b, v19.8h, #1
+ uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
+ uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
+ mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
+ uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
+ mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
+ sqrshrun \arg6\().8b, v18.8h, #5 //dst_d = (v18+16)>>5, saturated to unsigned 8 bits
+ uaddl v19.8h, \arg2\().8b, \arg6\().8b //v19 = src[0] + filtered value
+ rshrn \arg6\().8b, v19.8h, #1 //dst_d = (v19+1)>>1, rounded average with src[0]
// }
.endm
.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
- uaddl2 v18.8h, \arg0.16b, \arg5.16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, \arg2.16b, \arg3.16b //src[0]+src[1]
- mla v18.8h, v19.8h, \arg7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
- uaddl2 v19.8h, \arg1.16b, \arg4.16b //src[-1]+src[2]
- mls v18.8h, v19.8h, \arg8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
- sqrshrun2 \arg6.16b, v18.8h, #5
- uaddl2 v19.8h, \arg2.16b, \arg6.16b
- rshrn2 \arg6.16b, v19.8h, #1
+ uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
+ uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
+ mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
+ uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
+ mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
+ sqrshrun2 \arg6\().16b, v18.8h, #5 //(v18+16)>>5, saturated to unsigned 8 bits, written to the upper half of dst
+ uaddl2 v19.8h, \arg2\().16b, \arg6\().16b //v19 = src[0] + filtered value (upper halves)
+ rshrn2 \arg6\().16b, v19.8h, #1 //upper half of dst = (v19+1)>>1, rounded average with src[0]
// }
.endm
.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
- uaddl v18.8h, \arg0.8b, \arg5.8b //v18=src[-2]+src[3]
- uaddl v19.8h, \arg2.8b, \arg3.8b //src[0]+src[1]
- mla v18.8h, v19.8h, \arg7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
- uaddl v19.8h, \arg1.8b, \arg4.8b //src[-1]+src[2]
- mls v18.8h, v19.8h, \arg8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
- sqrshrun \arg6.8b, v18.8h, #5
- uaddl v19.8h, \arg3.8b, \arg6.8b
- rshrn \arg6.8b, v19.8h, #1
+ uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
+ uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
+ mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
+ uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
+ mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
+ sqrshrun \arg6\().8b, v18.8h, #5 //dst_d = (v18+16)>>5, saturated to unsigned 8 bits
+ uaddl v19.8h, \arg3\().8b, \arg6\().8b //v19 = src[1] + filtered value
+ rshrn \arg6\().8b, v19.8h, #1 //dst_d = (v19+1)>>1, rounded average with src[1]
// }
.endm
.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
- uaddl2 v18.8h, \arg0.16b, \arg5.16b //v18=src[-2]+src[3]
- uaddl2 v19.8h, \arg2.16b, \arg3.16b //src[0]+src[1]
- mla v18.8h, v19.8h, \arg7.8h //v18 += 20*(src[0]+src[1]), 2 cycles
- uaddl2 v19.8h, \arg1.16b, \arg4.16b //src[-1]+src[2]
- mls v18.8h, v19.8h, \arg8.8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
- sqrshrun2 \arg6.16b, v18.8h, #5
- uaddl2 v19.8h, \arg3.16b, \arg6.16b
- rshrn2 \arg6.16b, v19.8h, #1
+ uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
+ uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
+ mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
+ uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
+ mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
+ sqrshrun2 \arg6\().16b, v18.8h, #5 //(v18+16)>>5, saturated to unsigned 8 bits, written to the upper half of dst
+ uaddl2 v19.8h, \arg3\().16b, \arg6\().16b //v19 = src[1] + filtered value (upper halves)
+ rshrn2 \arg6\().16b, v19.8h, #1 //upper half of dst = (v19+1)>>1, rounded average with src[1]
// }
.endm
.macro FILTER_6TAG_8BITS_TO_16BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
- uaddl \arg6.8h, \arg0.8b, \arg5.8b //dst_q=src[-2]+src[3]
- uaddl v31.8h, \arg2.8b, \arg3.8b //src[0]+src[1]
- mla \arg6.8h, v31.8h, \arg7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
- uaddl v31.8h, \arg1.8b, \arg4.8b //src[-1]+src[2]
- mls \arg6.8h, v31.8h, \arg8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+ uaddl \arg6\().8h, \arg0\().8b, \arg5\().8b //dst_q=src[-2]+src[3], widened to 16 bits
+ uaddl v31.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
+ mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles
+ uaddl v31.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
+ mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles; result stays 16 bits, no narrowing here
// }
.endm
.macro FILTER_6TAG_8BITS_TO_16BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
- uaddl2 \arg6.8h, \arg0.16b, \arg5.16b //dst_q=src[-2]+src[3]
- uaddl2 v31.8h, \arg2.16b, \arg3.16b //src[0]+src[1]
- mla \arg6.8h, v31.8h, \arg7.8h //dst_q += 20*(src[0]+src[1]), 2 cycles
- uaddl2 v31.8h, \arg1.16b, \arg4.16b //src[-1]+src[2]
- mls \arg6.8h, v31.8h, \arg8.8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+ uaddl2 \arg6\().8h, \arg0\().16b, \arg5\().16b //dst_q=src[-2]+src[3], upper 8 bytes of each source widened to 16 bits
+ uaddl2 v31.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
+ mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles
+ uaddl2 v31.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
+ mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles; result stays 16 bits, no narrowing here
// }
.endm
.macro FILTER_3_IN_16BITS_TO_8BITS1 arg0, arg1, arg2, arg3
// { // input:a, b, c, dst_d;
- sub \arg0.8h, \arg0.8h, \arg1.8h //a-b
- sshr \arg0.8h, \arg0.8h, #2 //(a-b)/4
- sub \arg0.8h, \arg0.8h, \arg1.8h //(a-b)/4-b
- add \arg0.8h, \arg0.8h, \arg2.8h //(a-b)/4-b+c
- sshr \arg0.8h, \arg0.8h, #2 //((a-b)/4-b+c)/4
- add \arg0.8h, \arg0.8h, \arg2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- sqrshrun \arg3.8b, \arg0.8h, #6 //(+32)>>6
+ sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b (overwrites input a)
+ sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4
+ sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b
+ add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c
+ sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4
+ add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ sqrshrun \arg3\().8b, \arg0\().8h, #6 //dst_d = (+32)>>6, saturated to unsigned 8 bits
// }
.endm
.macro FILTER_3_IN_16BITS_TO_8BITS2 arg0, arg1, arg2, arg3
// { // input:a, b, c, dst_d;
- sub \arg0.8h, \arg0.8h, \arg1.8h //a-b
- sshr \arg0.8h, \arg0.8h, #2 //(a-b)/4
- sub \arg0.8h, \arg0.8h, \arg1.8h //(a-b)/4-b
- add \arg0.8h, \arg0.8h, \arg2.8h //(a-b)/4-b+c
- sshr \arg0.8h, \arg0.8h, #2 //((a-b)/4-b+c)/4
- add \arg0.8h, \arg0.8h, \arg2.8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- sqrshrun2 \arg3.16b, \arg0.8h, #6 //(+32)>>6
+ sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b (overwrites input a)
+ sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4
+ sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b
+ add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c
+ sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4
+ add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ sqrshrun2 \arg3\().16b, \arg0\().8h, #6 //(+32)>>6, saturated to unsigned 8 bits, written to the upper half of dst
// }
.endm
.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
// { // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
- ext \arg4.16b, \arg0.16b, \arg1.16b, #4 //src[0]
- ext \arg3.16b, \arg0.16b, \arg1.16b, #6 //src[1]
- add \arg4.8h, \arg4.8h, \arg3.8h //c=src[0]+src[1]
+ ext \arg4\().16b, \arg0\().16b, \arg1\().16b, #4 //src[0]: byte offset 4 = 2 halfwords past src[-2]
+ ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #6 //src[1]: byte offset 6 = 3 halfwords past src[-2]
+ add \arg4\().8h, \arg4\().8h, \arg3\().8h //c=src[0]+src[1]
- ext \arg3.16b, \arg0.16b, \arg1.16b, #2 //src[-1]
- ext \arg2.16b, \arg0.16b, \arg1.16b, #8 //src[2]
- add \arg3.8h, \arg3.8h, \arg2.8h //b=src[-1]+src[2]
+ ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #2 //src[-1]
+ ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #8 //src[2]
+ add \arg3\().8h, \arg3\().8h, \arg2\().8h //b=src[-1]+src[2]
- ext \arg2.16b, \arg0.16b, \arg1.16b, #10 //src[3]
- add \arg2.8h, \arg2.8h, \arg0.8h //a=src[-2]+src[3]
+ ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #10 //src[3]
+ add \arg2\().8h, \arg2\().8h, \arg0\().8h //a=src[-2]+src[3]; arg0 itself starts at src[-2]
// }
.endm
.macro AVERAGE_TWO_8BITS1 arg0, arg1, arg2
// { // input:dst_d, src_d A and B; working: v5
- uaddl v30.8h, \arg2.8b, \arg1.8b
- rshrn \arg0.8b, v30.8h, #1
+ uaddl v30.8h, \arg2\().8b, \arg1\().8b //v30 = B + A as 16 bits; the working register used here is v30
+ rshrn \arg0\().8b, v30.8h, #1 //dst_d = (v30+1)>>1, rounded average
// }
.endm
.macro AVERAGE_TWO_8BITS2 arg0, arg1, arg2
// { // input:dst_d, src_d A and B; working: v5
- uaddl2 v30.8h, \arg2.16b, \arg1.16b
- rshrn2 \arg0.16b, v30.8h, #1
+ uaddl2 v30.8h, \arg2\().16b, \arg1\().16b //v30 = B + A (upper halves) as 16 bits; the working register used here is v30
+ rshrn2 \arg0\().16b, v30.8h, #1 //upper half of dst = (v30+1)>>1, rounded average
// }
.endm
@@ -361,26 +361,26 @@
.macro FILTER_SINGLE_TAG_8BITS arg0, arg1, arg2, arg3
// when width=17/9, used
// { // input: src_d{Y[0][1][2][3][4][5]X},
- rev64 \arg2.8b, \arg0.8b // X[5][4][3][2][1][0]O
- uaddl \arg2.8h, \arg0.8b, \arg2.8b // each 16bits, *[50][41][32][23][14][05]*
- mul \arg2.4h, \arg2.4h, \arg1.4h // 0+1*[50]-5*[41]+20[32]
- addv \arg3, \arg2.4h
- sqrshrun \arg0.8b, \arg0.8h, #5
+ rev64 \arg2\().8b, \arg0\().8b // X[5][4][3][2][1][0]O
+ uaddl \arg2\().8h, \arg0\().8b, \arg2\().8b // each 16bits, *[50][41][32][23][14][05]*
+ mul \arg2\().4h, \arg2\().4h, \arg1\().4h // 0+1*[50]-5*[41]+20[32]
+ addv \arg3, \arg2\().4h // arg3 = sum of the 4 halfword products
+ sqrshrun \arg0\().8b, \arg0\().8h, #5 // (+16)>>5, saturated to unsigned 8 bits; NOTE(review): reads arg0, so arg3 presumably names a scalar view of the arg0 register - confirm at call sites
// }
.endm
-.macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
+.macro UNPACK_FILTER_SINGLE_TAG_16BITS arg0, arg1, arg2, arg3, arg4, arg5
// { // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
- ext.16b \arg3, \arg1, \arg1, #14 // X[0][1][2][3][4][5]O
- ext.16b \arg4, \arg3, \arg3, #8 // [3][4][5]OX[0][1][2]
- rev64 \arg4.8h, \arg4.8h // X[5][4][3][2][1][0]O
- add \arg3.8h, \arg3.8h, \arg4.8h // each 16bits, *[50][41][32][23][14][05]*
- smull \arg3.4s, \arg3.4h, \arg2.4h // 0+1*[50]-5*[41]+20[32]
- saddlv \arg5, \arg3.4s
- //sshr \arg0.2d, \arg0.2d, #4
- sqrshrun \arg0.2s, \arg0.2d, #10
- uqxtn \arg0.4h, \arg0.4s
- uqxtn \arg0.8b, \arg0.8h
+ ext \arg3\().16b, \arg1\().16b, \arg1\().16b, #14 // X[0][1][2][3][4][5]O
+ ext \arg4\().16b, \arg3\().16b, \arg3\().16b, #8 // [3][4][5]OX[0][1][2]
+ rev64 \arg4\().8h, \arg4\().8h // X[5][4][3][2][1][0]O
+ add \arg3\().8h, \arg3\().8h, \arg4\().8h // each 16bits, *[50][41][32][23][14][05]*
+ smull \arg3\().4s, \arg3\().4h, \arg2\().4h // 0+1*[50]-5*[41]+20[32]
+ saddlv \arg5, \arg3\().4s // arg5 (low part of d_dst) = widened sum of the 4 products
+ //sshr \arg0\().2d, \arg0\().2d, #4
+ sqrshrun \arg0\().2s, \arg0\().2d, #10 // (+512)>>10, saturated to unsigned 32 bits
+ uqxtn \arg0\().4h, \arg0\().4s // saturating narrow 32 -> 16 bits
+ uqxtn \arg0\().8b, \arg0\().8h // saturating narrow 16 -> 8 bits
// }
.endm
#endif