ref: 60f36eb25a9b30499aedc33e63c82ae3c15e277a
dir: /codec/encoder/core/arm64/pixel_aarch64_neon.S/
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"
// Reduce the 8 halfword partial sums in the SAD accumulator v2 to a single
// 32-bit scalar and place it in w0 (the function return register).
.macro CALC_AND_STORE_SAD
saddlv s2, v2.8h                        // s2 = sum of all 8 lanes of v2.8h
fmov w0, s2                             // return value -> w0
.endm
// Reduce the four SAD accumulators v28..v31 (candidates at ref-stride,
// ref+stride, ref-1, ref+1 — see the SadFour functions below) to scalars
// and store them as four consecutive 32-bit values at the pointer in x4.
.macro CALC_AND_STORE_SAD_FOUR
saddlv s28, v28.8h                      // lane-sum each accumulator
saddlv s29, v29.8h
saddlv s30, v30.8h
saddlv s31, v31.8h
st4 {v28.s, v29.s, v30.s, v31.s}[0], [x4]   // pSad[0..3] = the four sums
.endm
// Load eight 8-byte rows of the current block from x0 (stride x1) into
// v0..v7.  x0 is advanced past the loaded rows by the post-increments.
.macro LOAD_8X8_1
ld1 {v0.8b}, [x0], x1
ld1 {v1.8b}, [x0], x1
ld1 {v2.8b}, [x0], x1
ld1 {v3.8b}, [x0], x1
ld1 {v4.8b}, [x0], x1
ld1 {v5.8b}, [x0], x1
ld1 {v6.8b}, [x0], x1
ld1 {v7.8b}, [x0], x1
.endm
// Load eight 16-byte rows of the current block from x0 (stride x1) into
// v0..v7.  x0 is advanced past the loaded rows by the post-increments.
.macro LOAD_16X8_1
ld1 {v0.16b}, [x0], x1
ld1 {v1.16b}, [x0], x1
ld1 {v2.16b}, [x0], x1
ld1 {v3.16b}, [x0], x1
ld1 {v4.16b}, [x0], x1
ld1 {v5.16b}, [x0], x1
ld1 {v6.16b}, [x0], x1
ld1 {v7.16b}, [x0], x1
.endm
// Load eight 8-byte reference rows from the pointer register \arg0
// (stride x3) into v16..v23, advancing \arg0 past the loaded rows.
.macro LOAD_8X8_2 arg0
ld1 {v16.8b}, [\arg0], x3
ld1 {v17.8b}, [\arg0], x3
ld1 {v18.8b}, [\arg0], x3
ld1 {v19.8b}, [\arg0], x3
ld1 {v20.8b}, [\arg0], x3
ld1 {v21.8b}, [\arg0], x3
ld1 {v22.8b}, [\arg0], x3
ld1 {v23.8b}, [\arg0], x3
.endm
// Sum |cur - ref| over an 8x8 tile: cur rows in v0..v7, ref rows in
// v16..v23, 16-bit accumulator register given by \arg0.
// \arg1 selects the first instruction: 'd' -> uabdl (start a fresh
// accumulation), 'a' -> uabal (add onto an existing accumulator).
.macro CALC_ABS_8X8_1 arg0, arg1
uab\arg1\()l \arg0, v0.8b, v16.8b
uabal \arg0, v1.8b, v17.8b
uabal \arg0, v2.8b, v18.8b
uabal \arg0, v3.8b, v19.8b
uabal \arg0, v4.8b, v20.8b
uabal \arg0, v5.8b, v21.8b
uabal \arg0, v6.8b, v22.8b
uabal \arg0, v7.8b, v23.8b
.endm
// Like CALC_ABS_8X8_1 but against ref rows v18..v25, i.e. the reference
// window shifted down two rows relative to v16..v23.  The SadFour callers
// load v16..v25 once and reuse v18..v25 here to get the ref+stride SAD
// without reloading.  Accumulator is fixed to v29.8h; \arg0 is 'd' or 'a'
// (uabdl = fresh accumulation vs uabal = add onto existing).
.macro CALC_ABS_8X8_2 arg0
uab\arg0\()l v29.8h, v0.8b, v18.8b
uabal v29.8h, v1.8b, v19.8b
uabal v29.8h, v2.8b, v20.8b
uabal v29.8h, v3.8b, v21.8b
uabal v29.8h, v4.8b, v22.8b
uabal v29.8h, v5.8b, v23.8b
uabal v29.8h, v6.8b, v24.8b
uabal v29.8h, v7.8b, v25.8b
.endm
// Load eight 16-byte reference rows from the pointer register \arg0
// (stride x3) into v16..v23, advancing \arg0 past the loaded rows.
.macro LOAD_16X8_2 arg0
ld1 {v16.16b}, [\arg0], x3
ld1 {v17.16b}, [\arg0], x3
ld1 {v18.16b}, [\arg0], x3
ld1 {v19.16b}, [\arg0], x3
ld1 {v20.16b}, [\arg0], x3
ld1 {v21.16b}, [\arg0], x3
ld1 {v22.16b}, [\arg0], x3
ld1 {v23.16b}, [\arg0], x3
.endm
// Sum |cur - ref| over a 16x8 tile: cur rows in v0..v7, ref rows in
// v16..v23.  Each row contributes its low 8 bytes via uabal and its high
// 8 bytes via uabal2, so each halfword lane of \arg0 folds two byte
// columns.  \arg1 selects the first instruction: 'd' -> uabdl (fresh
// accumulation), 'a' -> uabal (add onto an existing accumulator).
.macro CALC_ABS_16X8_1 arg0, arg1
uab\arg1\()l \arg0, v0.8b, v16.8b
uabal2 \arg0, v0.16b,v16.16b
uabal \arg0, v1.8b, v17.8b
uabal2 \arg0, v1.16b,v17.16b
uabal \arg0, v2.8b, v18.8b
uabal2 \arg0, v2.16b,v18.16b
uabal \arg0, v3.8b, v19.8b
uabal2 \arg0, v3.16b,v19.16b
uabal \arg0, v4.8b, v20.8b
uabal2 \arg0, v4.16b,v20.16b
uabal \arg0, v5.8b, v21.8b
uabal2 \arg0, v5.16b,v21.16b
uabal \arg0, v6.8b, v22.8b
uabal2 \arg0, v6.16b,v22.16b
uabal \arg0, v7.8b, v23.8b
uabal2 \arg0, v7.16b,v23.16b
.endm
// Like CALC_ABS_16X8_1 but against ref rows v18..v25 (the reference
// window shifted down two rows), reusing registers the SadFour callers
// already loaded to obtain the ref+stride SAD without reloading.
// Accumulator is fixed to v29.8h; \arg0 is 'd' or 'a' (uabdl = fresh
// accumulation vs uabal = add onto existing).
.macro CALC_ABS_16X8_2 arg0
uab\arg0\()l v29.8h, v0.8b, v18.8b
uabal2 v29.8h, v0.16b,v18.16b
uabal v29.8h, v1.8b, v19.8b
uabal2 v29.8h, v1.16b,v19.16b
uabal v29.8h, v2.8b, v20.8b
uabal2 v29.8h, v2.16b,v20.16b
uabal v29.8h, v3.8b, v21.8b
uabal2 v29.8h, v3.16b,v21.16b
uabal v29.8h, v4.8b, v22.8b
uabal2 v29.8h, v4.16b,v22.16b
uabal v29.8h, v5.8b, v23.8b
uabal2 v29.8h, v5.16b,v23.16b
uabal v29.8h, v6.8b, v24.8b
uabal2 v29.8h, v6.16b,v24.16b
uabal v29.8h, v7.8b, v25.8b
uabal2 v29.8h, v7.16b,v25.16b
.endm
//---------------------------------------------------------------------
// SAD of a 4x4 block.
// In:  x0 = cur pixels, w1 = cur stride, x2 = ref pixels, w3 = ref stride
// Out: w0 = sum of absolute differences
//---------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad4x4_AArch64_neon
sxtw x1, w1                             // strides arrive as int32: sign-extend
sxtw x3, w3
ld1 {v0.s}[0], [x0], x1                 // one 4-byte row into lane 0
ld1 {v1.s}[0], [x2], x3
uabdl v2.8h, v0.8b, v1.8b               // start accumulation (upper lanes unused)
.rept 3
ld1 {v0.s}[0], [x0], x1
ld1 {v1.s}[0], [x2], x3
uabal v2.8h, v0.8b, v1.8b
.endr
saddlv s2, v2.4h                        // sum only the 4 valid halfword lanes
fmov w0, s2
WELS_ASM_AARCH64_FUNC_END
//---------------------------------------------------------------------
// SAD of an 8x8 block.
// In:  x0 = cur pixels, w1 = cur stride, x2 = ref pixels, w3 = ref stride
// Out: w0 = sum of absolute differences
//---------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad8x8_AArch64_neon
sxtw x1, w1                             // sign-extend int32 strides
sxtw x3, w3
ld1 {v0.8b}, [x0], x1
ld1 {v1.8b}, [x2], x3
uabdl v2.8h, v0.8b, v1.8b               // row 0 starts the accumulator
.rept 7
ld1 {v0.8b}, [x0], x1
ld1 {v1.8b}, [x2], x3
uabal v2.8h, v0.8b, v1.8b               // rows 1..7 accumulate
.endr
CALC_AND_STORE_SAD                      // reduce v2 and return in w0
WELS_ASM_AARCH64_FUNC_END
//---------------------------------------------------------------------
// SAD of an 8x16 block.
// In:  x0 = cur pixels, w1 = cur stride, x2 = ref pixels, w3 = ref stride
// Out: w0 = sum of absolute differences
//---------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad8x16_AArch64_neon
sxtw x1, w1                             // sign-extend int32 strides
sxtw x3, w3
ld1 {v0.8b}, [x0], x1
ld1 {v1.8b}, [x2], x3
uabdl v2.8h, v0.8b, v1.8b               // row 0 starts the accumulator
.rept 15
ld1 {v0.8b}, [x0], x1
ld1 {v1.8b}, [x2], x3
uabal v2.8h, v0.8b, v1.8b               // rows 1..15 accumulate
.endr
CALC_AND_STORE_SAD                      // reduce v2 and return in w0
WELS_ASM_AARCH64_FUNC_END
//---------------------------------------------------------------------
// SAD of a 16x8 block.
// In:  x0 = cur pixels, w1 = cur stride, x2 = ref pixels, w3 = ref stride
// Out: w0 = sum of absolute differences
//---------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad16x8_AArch64_neon
sxtw x1, w1                             // sign-extend int32 strides
sxtw x3, w3
ld1 {v0.16b}, [x0], x1
ld1 {v1.16b}, [x2], x3
uabdl v2.8h, v0.8b, v1.8b               // low 8 bytes start the accumulator
uabal2 v2.8h, v0.16b, v1.16b            // high 8 bytes fold into same lanes
.rept 7
ld1 {v0.16b}, [x0], x1
ld1 {v1.16b}, [x2], x3
uabal v2.8h, v0.8b, v1.8b
uabal2 v2.8h, v0.16b, v1.16b
.endr
CALC_AND_STORE_SAD                      // reduce v2 and return in w0
WELS_ASM_AARCH64_FUNC_END
//---------------------------------------------------------------------
// SAD of a 16x16 block.
// In:  x0 = cur pixels, w1 = cur stride, x2 = ref pixels, w3 = ref stride
// Out: w0 = sum of absolute differences
// Note: worst case per halfword lane is 16 rows * 2 bytes * 255 = 8160,
// so the 16-bit accumulator cannot overflow.
//---------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad16x16_AArch64_neon
sxtw x1, w1                             // sign-extend int32 strides
sxtw x3, w3
ld1 {v0.16b}, [x0], x1
ld1 {v1.16b}, [x2], x3
uabdl v2.8h, v0.8b, v1.8b               // low 8 bytes start the accumulator
uabal2 v2.8h, v0.16b, v1.16b            // high 8 bytes fold into same lanes
.rept 15
ld1 {v0.16b}, [x0], x1
ld1 {v1.16b}, [x2], x3
uabal v2.8h, v0.8b, v1.8b
uabal2 v2.8h, v0.16b, v1.16b
.endr
CALC_AND_STORE_SAD                      // reduce v2 and return in w0
WELS_ASM_AARCH64_FUNC_END
//---------------------------------------------------------------------
// Four SADs of a 4x4 block against the reference shifted by one pixel:
// v28 = SAD vs ref-stride, v29 = SAD vs ref+stride,
// v30 = SAD vs ref-1,      v31 = SAD vs ref+1.
// In:  x0 = cur, w1 = cur stride, x2 = ref, w3 = ref stride,
//      x4 = pointer receiving the four 32-bit SADs.
//---------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSadFour4x4_AArch64_neon
sxtw x1, w1                             // sign-extend int32 strides
sxtw x3, w3
ld1 {v0.s}[0], [x0], x1                 // v0 = cur rows 0,1 / v1 = cur rows 2,3
ld1 {v0.s}[1], [x0], x1
ld1 {v1.s}[0], [x0], x1
ld1 {v1.s}[1], [x0]
sub x0, x2, x3                          // start one row above ref
ld1 {v2.s}[0], [x0], x3                 // v2 = ref rows -1,0
ld1 {v2.s}[1], [x0], x3
ld1 {v3.s}[0], [x0], x3                 // v3 = ref rows 1,2
ld1 {v3.s}[1], [x0], x3
ld1 {v4.s}[0], [x0], x3                 // v4 = ref rows 3,4
ld1 {v4.s}[1], [x0], x3
uabdl v28.8h, v0.8b, v2.8b              // v28 = SAD(cur, ref - stride)
uabal v28.8h, v1.8b, v3.8b
uabdl v29.8h, v0.8b, v3.8b              // v29 = SAD(cur, ref + stride), reusing v3/v4
uabal v29.8h, v1.8b, v4.8b
sub x0, x2, #1                          // ref shifted one pixel left
ld1 {v2.s}[0], [x0], x3
ld1 {v2.s}[1], [x0], x3
ld1 {v3.s}[0], [x0], x3
ld1 {v3.s}[1], [x0]
uabdl v30.8h, v0.8b, v2.8b              // v30 = SAD(cur, ref - 1)
uabal v30.8h, v1.8b, v3.8b
add x0, x2, #1                          // ref shifted one pixel right
ld1 {v2.s}[0], [x0], x3
ld1 {v2.s}[1], [x0], x3
ld1 {v3.s}[0], [x0], x3
ld1 {v3.s}[1], [x0]
uabdl v31.8h, v0.8b, v2.8b              // v31 = SAD(cur, ref + 1)
uabal v31.8h, v1.8b, v3.8b
CALC_AND_STORE_SAD_FOUR                 // store the four sums at [x4]
WELS_ASM_AARCH64_FUNC_END
//---------------------------------------------------------------------
// Four SADs of an 8x8 block against ref-stride / ref+stride / ref-1 /
// ref+1, stored at [x4] (see CALC_AND_STORE_SAD_FOUR).
// In:  x0 = cur, w1 = cur stride, x2 = ref, w3 = ref stride, x4 = out.
//---------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSadFour8x8_AArch64_neon
sxtw x1, w1                             // sign-extend int32 strides
sxtw x3, w3
LOAD_8X8_1                              // v0..v7 = cur rows 0..7
sub x0, x2, x3                          // start one row above ref
LOAD_8X8_2 x0                           // v16..v23 = ref rows -1..6
ld1 {v24.8b}, [x0], x3                  // v24,v25 = ref rows 7,8
ld1 {v25.8b}, [x0]
CALC_ABS_8X8_1 v28.8h, d                // v28 = SAD vs ref-stride (rows -1..6)
CALC_ABS_8X8_2 d                        // v29 = SAD vs ref+stride (rows 1..8, reused regs)
sub x0, x2, #1                          // ref shifted one pixel left
LOAD_8X8_2 x0
CALC_ABS_8X8_1 v30.8h, d                // v30 = SAD vs ref-1
add x0, x2, #1                          // ref shifted one pixel right
LOAD_8X8_2 x0
CALC_ABS_8X8_1 v31.8h, d                // v31 = SAD vs ref+1
CALC_AND_STORE_SAD_FOUR                 // store the four sums at [x4]
WELS_ASM_AARCH64_FUNC_END
//---------------------------------------------------------------------
// Four SADs of an 8x16 block against ref-stride / ref+stride / ref-1 /
// ref+1, stored at [x4].  Processed as two 8-row halves: the first pass
// initializes the accumulators ('d' = uabdl), the second accumulates
// onto them ('a' = uabal).  x5/x6/x7 keep the three shifted ref cursors
// alive across the two halves.
// In:  x0 = cur, w1 = cur stride, x2 = ref, w3 = ref stride, x4 = out.
//---------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSadFour8x16_AArch64_neon
sxtw x1, w1                             // sign-extend int32 strides
sxtw x3, w3
LOAD_8X8_1                              // v0..v7 = cur rows 0..7
sub x5, x2, x3                          // x5 = ref - stride
LOAD_8X8_2 x5                           // v16..v23 = ref rows -1..6
ld1 {v24.8b}, [x5], x3                  // v24,v25 = ref rows 7,8
ld1 {v25.8b}, [x5], x3
CALC_ABS_8X8_1 v28.8h, d                // v28 = SAD vs ref-stride (first half)
CALC_ABS_8X8_2 d                        // v29 = SAD vs ref+stride (first half)
sub x6, x2, #1                          // x6 = ref - 1
LOAD_8X8_2 x6
CALC_ABS_8X8_1 v30.8h, d                // v30 = SAD vs ref-1 (first half)
add x7, x2, #1                          // x7 = ref + 1
LOAD_8X8_2 x7
CALC_ABS_8X8_1 v31.8h, d                // v31 = SAD vs ref+1 (first half)
LOAD_8X8_1                              // v0..v7 = cur rows 8..15
sub x5, x5, x3                          // rewind x5 by the two extra row
sub x5, x5, x3                          //   loads above -> ref row 7
LOAD_8X8_2 x5                           // v16..v23 = ref rows 7..14
ld1 {v24.8b}, [x5], x3                  // v24,v25 = ref rows 15,16
ld1 {v25.8b}, [x5]
CALC_ABS_8X8_1 v28.8h, a                // accumulate second half
CALC_ABS_8X8_2 a
LOAD_8X8_2 x6
CALC_ABS_8X8_1 v30.8h, a
LOAD_8X8_2 x7
CALC_ABS_8X8_1 v31.8h, a
CALC_AND_STORE_SAD_FOUR                 // store the four sums at [x4]
WELS_ASM_AARCH64_FUNC_END
//---------------------------------------------------------------------
// Four SADs of a 16x8 block against ref-stride / ref+stride / ref-1 /
// ref+1, stored at [x4] (see CALC_AND_STORE_SAD_FOUR).
// In:  x0 = cur, w1 = cur stride, x2 = ref, w3 = ref stride, x4 = out.
//---------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSadFour16x8_AArch64_neon
sxtw x1, w1                             // sign-extend int32 strides
sxtw x3, w3
LOAD_16X8_1                             // v0..v7 = cur rows 0..7
sub x0, x2, x3                          // start one row above ref
LOAD_16X8_2 x0                          // v16..v23 = ref rows -1..6
ld1 {v24.16b}, [x0], x3                 // v24,v25 = ref rows 7,8
ld1 {v25.16b}, [x0]
CALC_ABS_16X8_1 v28.8h, d               // v28 = SAD vs ref-stride
CALC_ABS_16X8_2 d                       // v29 = SAD vs ref+stride (reused regs)
sub x0, x2, #1                          // ref shifted one pixel left
LOAD_16X8_2 x0
CALC_ABS_16X8_1 v30.8h, d               // v30 = SAD vs ref-1
add x0, x2, #1                          // ref shifted one pixel right
LOAD_16X8_2 x0
CALC_ABS_16X8_1 v31.8h, d               // v31 = SAD vs ref+1
CALC_AND_STORE_SAD_FOUR                 // store the four sums at [x4]
WELS_ASM_AARCH64_FUNC_END
//---------------------------------------------------------------------
// Four SADs of a 16x16 block against ref-stride / ref+stride / ref-1 /
// ref+1, stored at [x4].  Processed as two 8-row halves: the first pass
// initializes the accumulators ('d' = uabdl), the second accumulates
// onto them ('a' = uabal).  x5/x6/x7 keep the three shifted ref cursors
// alive across the two halves.
// In:  x0 = cur, w1 = cur stride, x2 = ref, w3 = ref stride, x4 = out.
//---------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSadFour16x16_AArch64_neon
sxtw x1, w1                             // sign-extend int32 strides
sxtw x3, w3
LOAD_16X8_1                             // v0..v7 = cur rows 0..7
sub x5, x2, x3                          // x5 = ref - stride
LOAD_16X8_2 x5                          // v16..v23 = ref rows -1..6
ld1 {v24.16b}, [x5], x3                 // v24,v25 = ref rows 7,8
ld1 {v25.16b}, [x5], x3
CALC_ABS_16X8_1 v28.8h, d               // v28 = SAD vs ref-stride (first half)
CALC_ABS_16X8_2 d                       // v29 = SAD vs ref+stride (first half)
sub x6, x2, #1                          // x6 = ref - 1
LOAD_16X8_2 x6
CALC_ABS_16X8_1 v30.8h, d               // v30 = SAD vs ref-1 (first half)
add x7, x2, #1                          // x7 = ref + 1
LOAD_16X8_2 x7
CALC_ABS_16X8_1 v31.8h, d               // v31 = SAD vs ref+1 (first half)
LOAD_16X8_1                             // v0..v7 = cur rows 8..15
sub x5, x5, x3                          // rewind x5 by the two extra row
sub x5, x5, x3                          //   loads above -> ref row 7
LOAD_16X8_2 x5                          // v16..v23 = ref rows 7..14
ld1 {v24.16b}, [x5], x3                 // v24,v25 = ref rows 15,16
ld1 {v25.16b}, [x5]
CALC_ABS_16X8_1 v28.8h, a               // accumulate second half
CALC_ABS_16X8_2 a
LOAD_16X8_2 x6
CALC_ABS_16X8_1 v30.8h, a
LOAD_16X8_2 x7
CALC_ABS_16X8_1 v31.8h, a
CALC_AND_STORE_SAD_FOUR                 // store the four sums at [x4]
WELS_ASM_AARCH64_FUNC_END
//---------------------------------------------------------------------
// SATD of a 4x4 block: 4x4 Hadamard transform of the difference block,
// then sum of absolute transform coefficients, halved with rounding.
// In:  x0 = cur pixels, w1 = cur stride, x2 = ref pixels, w3 = ref stride
// Out: w0 = (sum(|hadamard(cur - ref)|) + 1) >> 1
//---------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSatd4x4_AArch64_neon
sxtw x1, w1                             // sign-extend int32 strides
sxtw x3, w3
ld1 {v0.s}[0], [x0], x1                 // v0 = cur rows 0,1 / v1 = cur rows 2,3
ld1 {v0.s}[1], [x0], x1
ld1 {v1.s}[0], [x0], x1
ld1 {v1.s}[1], [x0]
ld1 {v2.s}[0], [x2], x3                 // v2 = ref rows 0,1 / v3 = ref rows 2,3
ld1 {v2.s}[1], [x2], x3
ld1 {v3.s}[0], [x2], x3
ld1 {v3.s}[1], [x2]
usubl v4.8h, v0.8b, v2.8b //{0,1,2,3,4,5,6,7}
usubl v5.8h, v1.8b, v3.8b //{8,9,10,11,12,13,14,15}
//Do the vertical transform
add v6.8h, v4.8h, v5.8h //{0,4,8,12,1,5,9,13}
sub v7.8h, v4.8h, v5.8h //{2,6,10,14,3,7,11,15}
mov x4, v6.d[1]                         // swap high half of v6 with low half of
mov v6.d[1], v7.d[0]                    //   v7 to pair terms for the second
ins v7.d[0], x4                         //   butterfly stage
add v4.8h, v6.8h, v7.8h
sub v5.8h, v6.8h, v7.8h
//Do the horizontal transform
trn1 v6.4s, v4.4s, v5.4s
trn2 v7.4s, v4.4s, v5.4s
add v4.8h, v6.8h, v7.8h
sub v5.8h, v6.8h, v7.8h
trn1 v6.8h, v4.8h, v5.8h
trn2 v7.8h, v4.8h, v5.8h
add v4.8h, v6.8h, v7.8h
abs v4.8h, v4.8h                        // v4 = |v6 + v7|
saba v4.8h, v6.8h, v7.8h                // v4 += |v6 - v7| -> all 16 |coeffs|
uaddlv s4, v4.8h                        // sum across lanes
fmov w0, s4
add w0, w0, #1                          // rounded halving: (sum + 1) >> 1
lsr w0, w0, #1
WELS_ASM_AARCH64_FUNC_END
// SATD of one 8x4 tile: loads 4 rows of 8 bytes from cur [x0] (stride x1)
// and ref [x2] (stride x3), advancing both pointers, then applies the 4x4
// Hadamard butterflies to the 16-bit differences.  Uses the identity
// max(|a+b|, |a-b|) == |a| + |b| (smax of the abs'd add and the sabd)
// to fold the last stage.  Partial per-lane sums are left in v0 and v1;
// callers add v0+v1 into their accumulator.  Loads are interleaved with
// arithmetic to hide memory latency.
.macro SATD_8x4
ld1 {v0.8b}, [x0], x1
ld1 {v1.8b}, [x2], x3
ld1 {v2.8b}, [x0], x1
usubl v16.8h, v0.8b, v1.8b              // 16-bit row differences
ld1 {v3.8b}, [x2], x3
usubl v17.8h, v2.8b, v3.8b
ld1 {v4.8b}, [x0], x1
ld1 {v5.8b}, [x2], x3
add v25.8h, v16.8h, v17.8h              // vertical butterfly stage 1
usubl v18.8h, v4.8b, v5.8b
ld1 {v6.8b}, [x0], x1
ld1 {v7.8b}, [x2], x3
usubl v19.8h, v6.8b, v7.8b
sub v26.8h, v16.8h, v17.8h
add v27.8h, v18.8h, v19.8h
sub v28.8h, v18.8h, v19.8h
add v0.8h, v25.8h, v27.8h               // vertical butterfly stage 2
sub v1.8h, v25.8h, v27.8h
add v2.8h, v26.8h, v28.8h
sub v3.8h, v26.8h, v28.8h
trn1 v4.8h, v0.8h, v1.8h                // transpose pairs for horizontal pass
trn2 v5.8h, v0.8h, v1.8h
trn1 v6.8h, v2.8h, v3.8h
trn2 v7.8h, v2.8h, v3.8h
add v16.8h, v4.8h, v5.8h                // horizontal stage: a+b and |a-b|
sabd v17.8h, v4.8h, v5.8h
abs v16.8h, v16.8h                      // |a+b|
add v18.8h, v6.8h, v7.8h
sabd v19.8h, v6.8h, v7.8h
abs v18.8h, v18.8h
trn1 v4.4s, v16.4s, v17.4s              // pair |a+b| with |a-b| per element
trn2 v5.4s, v16.4s, v17.4s
trn1 v6.4s, v18.4s, v19.4s
trn2 v7.4s, v18.4s, v19.4s
smax v0.8h, v4.8h, v5.8h                // max(|a+b|,|a-b|) = |a|+|b|
smax v1.8h, v6.8h, v7.8h
.endm
// SATD of one 16x4 tile: loads 4 rows of 16 bytes from cur [x0] (stride
// x1) and ref [x2] (stride x3), advancing both pointers; the low and high
// 8-byte halves of each row are transformed side by side.  Same Hadamard
// butterfly scheme as SATD_8x4, including the max(|a+b|,|a-b|) == |a|+|b|
// trick for the last stage.  Partial per-lane sums are left in v0 and v2;
// callers add v0+v2 into their accumulator.
.macro SATD_16x4
ld1 {v0.16b}, [x0], x1
ld1 {v1.16b}, [x2], x3
ld1 {v2.16b}, [x0], x1
usubl v16.8h, v0.8b, v1.8b              // v16..v19 = low-half row diffs
usubl2 v24.8h, v0.16b, v1.16b           // v24..v27 = high-half row diffs
ld1 {v3.16b}, [x2], x3
usubl v17.8h, v2.8b, v3.8b
usubl2 v25.8h, v2.16b, v3.16b
ld1 {v4.16b}, [x0], x1
ld1 {v5.16b}, [x2], x3
usubl v18.8h, v4.8b, v5.8b
usubl2 v26.8h, v4.16b, v5.16b
ld1 {v6.16b}, [x0], x1
ld1 {v7.16b}, [x2], x3
usubl v19.8h, v6.8b, v7.8b
usubl2 v27.8h, v6.16b, v7.16b
add v0.8h, v16.8h, v17.8h               // vertical butterfly stage 1, low half
sub v1.8h, v16.8h, v17.8h
add v2.8h, v18.8h, v19.8h
sub v3.8h, v18.8h, v19.8h
add v4.8h, v24.8h, v25.8h               // stage 1, high half
sub v5.8h, v24.8h, v25.8h
add v6.8h, v26.8h, v27.8h
sub v7.8h, v26.8h, v27.8h
add v16.8h, v0.8h, v2.8h                // vertical butterfly stage 2
sub v18.8h, v0.8h, v2.8h
add v17.8h, v4.8h, v6.8h
sub v19.8h, v4.8h, v6.8h
add v0.8h, v1.8h, v3.8h
sub v2.8h, v1.8h, v3.8h
add v1.8h, v5.8h, v7.8h
sub v3.8h, v5.8h, v7.8h
trn1 v4.8h, v16.8h, v18.8h              // transpose pairs for horizontal pass
trn2 v6.8h, v16.8h, v18.8h
trn1 v5.8h, v17.8h, v19.8h
trn2 v7.8h, v17.8h, v19.8h
add v16.8h, v4.8h, v6.8h                // horizontal stage: a+b and |a-b|
sabd v18.8h, v4.8h, v6.8h
add v17.8h, v5.8h, v7.8h
sabd v19.8h, v5.8h, v7.8h
abs v16.8h, v16.8h                      // |a+b|
abs v17.8h, v17.8h
trn1 v4.8h, v0.8h, v2.8h
trn2 v6.8h, v0.8h, v2.8h
trn1 v5.8h, v1.8h, v3.8h
trn2 v7.8h, v1.8h, v3.8h
add v0.8h, v4.8h, v6.8h
sabd v2.8h, v4.8h, v6.8h
add v1.8h, v5.8h, v7.8h
sabd v3.8h, v5.8h, v7.8h
abs v0.8h, v0.8h
abs v1.8h, v1.8h
trn1 v4.4s, v16.4s, v18.4s              // pair |a+b| with |a-b| per element
trn2 v6.4s, v16.4s, v18.4s
trn1 v5.4s, v17.4s, v19.4s
trn2 v7.4s, v17.4s, v19.4s
trn1 v16.4s, v0.4s, v2.4s
trn2 v18.4s, v0.4s, v2.4s
trn1 v17.4s, v1.4s, v3.4s
trn2 v19.4s, v1.4s, v3.4s
smax v0.8h, v4.8h, v6.8h                // max(|a+b|,|a-b|) = |a|+|b|
smax v1.8h, v5.8h, v7.8h
smax v2.8h, v16.8h, v18.8h
smax v3.8h, v17.8h, v19.8h
add v0.8h, v0.8h, v1.8h                 // fold partials: results in v0, v2
add v2.8h, v2.8h, v3.8h
.endm
//---------------------------------------------------------------------
// SATD of a 16x16 block: four SATD_16x4 tile passes (the macro advances
// x0/x2), accumulating the per-lane partials (v0, v2) in v31.
// In:  x0 = cur pixels, w1 = cur stride, x2 = ref pixels, w3 = ref stride
// Out: w0 = SATD sum
//---------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSatd16x16_AArch64_neon
sxtw x1, w1                             // sign-extend int32 strides
sxtw x3, w3
SATD_16x4
add v31.8h, v0.8h, v2.8h                // first tile initializes v31
.rept 3
SATD_16x4
add v31.8h, v31.8h, v0.8h               // accumulate remaining tiles
add v31.8h, v31.8h, v2.8h
.endr
uaddlv s4, v31.8h                       // reduce lanes to scalar
fmov w0, s4
WELS_ASM_AARCH64_FUNC_END
//---------------------------------------------------------------------
// SATD of a 16x8 block: two SATD_16x4 tile passes (the macro advances
// x0/x2), accumulating the per-lane partials (v0, v2) in v31.
// In:  x0 = cur pixels, w1 = cur stride, x2 = ref pixels, w3 = ref stride
// Out: w0 = SATD sum
//---------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSatd16x8_AArch64_neon
sxtw x1, w1                             // sign-extend int32 strides
sxtw x3, w3
SATD_16x4
add v31.8h, v0.8h, v2.8h                // first tile initializes v31
SATD_16x4
add v31.8h, v31.8h, v0.8h               // accumulate second tile
add v31.8h, v31.8h, v2.8h
uaddlv s4, v31.8h                       // reduce lanes to scalar
fmov w0, s4
WELS_ASM_AARCH64_FUNC_END
//---------------------------------------------------------------------
// SATD of an 8x16 block: four SATD_8x4 tile passes (the macro advances
// x0/x2), accumulating the per-lane partials (v0, v1) in v31.
// In:  x0 = cur pixels, w1 = cur stride, x2 = ref pixels, w3 = ref stride
// Out: w0 = SATD sum
//---------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSatd8x16_AArch64_neon
sxtw x1, w1                             // sign-extend int32 strides
sxtw x3, w3
SATD_8x4
add v31.8h, v0.8h, v1.8h                // first tile initializes v31
.rept 3
SATD_8x4
add v31.8h, v31.8h, v0.8h               // accumulate remaining tiles
add v31.8h, v31.8h, v1.8h
.endr
uaddlv s4, v31.8h                       // reduce lanes to scalar
fmov w0, s4
WELS_ASM_AARCH64_FUNC_END
//---------------------------------------------------------------------
// SATD of an 8x8 block: two SATD_8x4 tile passes (the macro advances
// x0/x2), accumulating the per-lane partials (v0, v1) in v31.
// In:  x0 = cur pixels, w1 = cur stride, x2 = ref pixels, w3 = ref stride
// Out: w0 = SATD sum
//---------------------------------------------------------------------
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSatd8x8_AArch64_neon
sxtw x1, w1                             // sign-extend int32 strides
sxtw x3, w3
SATD_8x4
add v31.8h, v0.8h, v1.8h                // first tile initializes v31
SATD_8x4
add v31.8h, v31.8h, v0.8h               // accumulate second tile
add v31.8h, v31.8h, v1.8h
uaddlv s4, v31.8h                       // reduce lanes to scalar
fmov w0, s4
WELS_ASM_AARCH64_FUNC_END
#endif