shithub: openh264

Download patch

ref: a46b1aa86e6bb10bbbc8686bd671d6b77aee31cc
parent: 3d031ee8f8ac9cb51cfd8a2af8704bf74d69f0e4
parent: 5d7e18de543fa4b8d5072eecba850c31615a475e
author: ruil2 <ruil2@cisco.com>
date: Fri Jun 6 10:05:53 EDT 2014

Merge pull request #923 from zhilwang/satd-arm64

Add arm64 neon code for Satd.

--- a/codec/encoder/core/arm64/pixel_neon_aarch64.S
+++ b/codec/encoder/core/arm64/pixel_neon_aarch64.S
@@ -474,4 +474,233 @@
 
     CALC_AND_STORE_SAD_FOUR
 WELS_ASM_ARCH64_FUNC_END
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd4x4_AArch64_neon // 4x4 block: Hadamard-transform the byte differences of two blocks, sum the absolute coefficients, return (sum + 1) >> 1 in w0
+    sxtw    x1, w1 // w1 is the int32 row step for x0; widen it so it can serve as a 64-bit post-increment
+    sxtw    x3, w3 // same for w3, the row step applied to x2
+    ld1     {v0.s}[0], [x0], x1 // first block: four rows of 4 bytes each, one 32-bit lane per row
+    ld1     {v0.s}[1], [x0], x1 // v0 = rows 0 and 1
+    ld1     {v1.s}[0], [x0], x1 // v1 = rows 2 and 3
+    ld1     {v1.s}[1], [x0] // last row: no post-increment needed
+
+    ld1     {v2.s}[0], [x2], x3 // second block, same layout: v2 = rows 0 and 1
+    ld1     {v2.s}[1], [x2], x3
+    ld1     {v3.s}[0], [x2], x3 // v3 = rows 2 and 3
+    ld1     {v3.s}[1], [x2]
+    usubl   v4.8h, v0.8b, v2.8b // widening byte subtract (x0 block - x2 block), 16-bit results: difference elements {0,1,2,3,4,5,6,7} = rows 0-1
+    usubl   v5.8h, v1.8b, v3.8b // difference elements {8,9,10,11,12,13,14,15} = rows 2-3
+
+    //Do the vertical transform (two butterfly stages across the four rows)
+    add     v6.8h, v4.8h, v5.8h // v6 = {row0+row2 | row1+row3}; NOTE(review): the earlier index note {0,4,8,12,1,5,9,13} does not obviously match this lane layout - confirm
+    sub     v7.8h, v4.8h, v5.8h // v7 = {row0-row2 | row1-row3}; NOTE(review): earlier index note was {2,6,10,14,3,7,11,15} - confirm
+    mov     x4,      v6.d[1] // swap v6.d[1] with v7.d[0], using x4 as the temporary
+    mov     v6.d[1], v7.d[0] // v6 = {row0+row2 | row0-row2}
+    ins     v7.d[0], x4 // v7 = {row1+row3 | row1-row3}
+    add     v4.8h, v6.8h, v7.8h // v4 = {r0+r1+r2+r3 | r0+r1-r2-r3}
+    sub     v5.8h, v6.8h, v7.8h // v5 = {r0-r1+r2-r3 | r0-r1-r2+r3}
+
+    //Do the horizontal transform (two butterfly stages across the four columns of each row)
+    trn1    v6.4s, v4.4s, v5.4s // v6 = the (col0,col1) 32-bit pair of every transformed row
+    trn2    v7.4s, v4.4s, v5.4s // v7 = the matching (col2,col3) pairs
+    add     v4.8h, v6.8h, v7.8h // per row: {c0+c2, c1+c3}
+    sub     v5.8h, v6.8h, v7.8h // per row: {c0-c2, c1-c3}
+    trn1    v6.8h, v4.8h, v5.8h // even 16-bit lanes of v4/v5 interleaved: {c0+c2, c0-c2, ...}
+    trn2    v7.8h, v4.8h, v5.8h // odd 16-bit lanes interleaved: {c1+c3, c1-c3, ...}
+    add     v4.8h, v6.8h, v7.8h // "sum" outputs of the last butterfly stage
+    abs     v4.8h, v4.8h // ... taken as magnitudes
+    saba    v4.8h, v6.8h, v7.8h // v4 += |v6 - v7|: adds the magnitudes of the "difference" outputs, so the 8 lanes together cover all 16 coefficients
+    uaddlv  s4, v4.8h // unsigned sum of the eight 16-bit lanes into a 32-bit scalar
+    fmov    w0, s4 // move the total to the integer return register
+    add     w0, w0, #1 // round ...
+    lsr     w0, w0, #1 // ... and halve: w0 = (total + 1) >> 1
+
+WELS_ASM_ARCH64_FUNC_END // NOTE(review): there is no explicit ret above - presumably this macro emits it; confirm in the macro definition
+
+.macro SATD_8x4 // one 4-row x 8-column strip: reads via x0 (step x1) and x2 (step x3), leaves 16-bit partial sums in v0 and v1; overwrites v2-v7, v16-v19, v25-v28 and advances x0/x2 by four rows
+    ld1     {v0.8b}, [x0], x1 // row 0 of the x0 block (8 bytes), then step to the next row
+    ld1     {v1.8b}, [x2], x3 // row 0 of the x2 block
+    ld1     {v2.8b}, [x0], x1 // row 1 load is issued before row 0 is consumed (loads and arithmetic are interleaved throughout)
+    usubl   v16.8h,  v0.8b, v1.8b // v16 = row 0 difference, widened to 16 bits
+
+    ld1     {v3.8b}, [x2], x3
+    usubl   v17.8h,  v2.8b, v3.8b // v17 = row 1 difference
+    ld1     {v4.8b}, [x0], x1
+    ld1     {v5.8b}, [x2], x3
+
+    add     v25.8h,  v16.8h, v17.8h // v25 = r0 + r1
+    usubl   v18.8h,  v4.8b,  v5.8b // v18 = row 2 difference
+
+    ld1     {v6.8b}, [x0], x1
+    ld1     {v7.8b}, [x2], x3
+
+    usubl   v19.8h,  v6.8b,  v7.8b // v19 = row 3 difference
+    sub     v26.8h,  v16.8h, v17.8h // v26 = r0 - r1
+
+    add     v27.8h,  v18.8h, v19.8h // v27 = r2 + r3
+    sub     v28.8h,  v18.8h, v19.8h // v28 = r2 - r3
+
+    add     v0.8h,  v25.8h, v27.8h // second vertical stage: v0 = r0+r1+r2+r3
+    sub     v1.8h,  v25.8h, v27.8h // v1 = r0+r1-r2-r3
+
+    add     v2.8h,  v26.8h, v28.8h // v2 = r0-r1+r2-r3
+    sub     v3.8h,  v26.8h, v28.8h // v3 = r0-r1-r2+r3
+
+    trn1    v4.8h, v0.8h, v1.8h // v4 = even columns of v0 and v1, interleaved
+    trn2    v5.8h, v0.8h, v1.8h // v5 = odd columns of v0 and v1, interleaved
+    trn1    v6.8h, v2.8h, v3.8h // same split for v2 and v3
+    trn2    v7.8h, v2.8h, v3.8h
+
+    add     v16.8h, v4.8h, v5.8h // first horizontal stage: even column + odd column
+    sabd    v17.8h, v4.8h, v5.8h // |even column - odd column|
+    abs     v16.8h, v16.8h // magnitude of the sums, so v16 and v17 are both non-negative
+    add     v18.8h, v6.8h, v7.8h // same for the v2/v3 pair
+    sabd    v19.8h, v6.8h, v7.8h
+    abs     v18.8h, v18.8h
+
+    trn1    v4.4s, v16.4s, v17.4s // regroup by 32-bit pairs: v4 takes the results that came from columns (0,1) and (4,5)
+    trn2    v5.4s, v16.4s, v17.4s // v5 takes the results from columns (2,3) and (6,7), lane-aligned with v4
+    trn1    v6.4s, v18.4s, v19.4s
+    trn2    v7.4s, v18.4s, v19.4s
+
+    smax    v0.8h, v4.8h, v5.8h // lane-wise max of two magnitudes; NOTE(review): max(|s|,|t|) == (|s+t| + |s-t|) / 2, so this appears to stand in for the last butterfly stage plus a halving - confirm intent
+    smax    v1.8h, v6.8h, v7.8h // callers add v0 and v1 into their accumulator
+.endm
+
+.macro SATD_16x4 // one 4-row x 16-column strip: reads via x0 (step x1) and x2 (step x3), leaves 16-bit partial sums in v0 and v2; overwrites v1, v3-v7, v16-v19, v24-v27 and advances x0/x2 by four rows
+    ld1     {v0.16b}, [x0], x1 // row 0 of the x0 block (16 bytes), then step to the next row
+    ld1     {v1.16b}, [x2], x3 // row 0 of the x2 block
+    ld1     {v2.16b}, [x0], x1
+    usubl   v16.8h,  v0.8b, v1.8b // row 0 difference, columns 0-7, widened to 16 bits
+    usubl2  v24.8h,  v0.16b, v1.16b // row 0 difference, columns 8-15
+
+    ld1     {v3.16b}, [x2], x3
+    usubl   v17.8h,  v2.8b, v3.8b // row 1: v17 = columns 0-7
+    usubl2  v25.8h,  v2.16b, v3.16b // v25 = columns 8-15
+
+    ld1     {v4.16b}, [x0], x1
+    ld1     {v5.16b}, [x2], x3
+    usubl   v18.8h,  v4.8b, v5.8b // row 2: v18 / v26
+    usubl2  v26.8h,  v4.16b, v5.16b
+
+    ld1     {v6.16b}, [x0], x1
+    ld1     {v7.16b}, [x2], x3
+    usubl   v19.8h,  v6.8b, v7.8b // row 3: v19 / v27
+    usubl2  v27.8h,  v6.16b, v7.16b
+
+    add     v0.8h,  v16.8h, v17.8h // first vertical stage, columns 0-7: r0+r1
+    sub     v1.8h,  v16.8h, v17.8h // r0-r1
+    add     v2.8h,  v18.8h, v19.8h // r2+r3
+    sub     v3.8h,  v18.8h, v19.8h // r2-r3
+
+    add     v4.8h,  v24.8h, v25.8h // first vertical stage, columns 8-15: r0+r1
+    sub     v5.8h,  v24.8h, v25.8h // r0-r1
+    add     v6.8h,  v26.8h, v27.8h // r2+r3
+    sub     v7.8h,  v26.8h, v27.8h // r2-r3
+
+    add     v16.8h,  v0.8h, v2.8h // second vertical stage: r0+r1+r2+r3 (columns 0-7)
+    sub     v18.8h,  v0.8h, v2.8h // r0+r1-r2-r3 (columns 0-7)
+    add     v17.8h,  v4.8h, v6.8h // r0+r1+r2+r3 (columns 8-15)
+    sub     v19.8h,  v4.8h, v6.8h // r0+r1-r2-r3 (columns 8-15)
+
+    add     v0.8h,  v1.8h, v3.8h // r0-r1+r2-r3 (columns 0-7)
+    sub     v2.8h,  v1.8h, v3.8h // r0-r1-r2+r3 (columns 0-7)
+    add     v1.8h,  v5.8h, v7.8h // r0-r1+r2-r3 (columns 8-15); v1 is overwritten only after the two lines above have read it
+    sub     v3.8h,  v5.8h, v7.8h // r0-r1-r2+r3 (columns 8-15)
+
+    trn1    v4.8h, v16.8h, v18.8h // split v16/v18 into even columns (v4) and odd columns (v6)
+    trn2    v6.8h, v16.8h, v18.8h
+    trn1    v5.8h, v17.8h, v19.8h // same for v17/v19 -> v5 (even), v7 (odd)
+    trn2    v7.8h, v17.8h, v19.8h
+
+    add     v16.8h, v4.8h, v6.8h // first horizontal stage: even column + odd column
+    sabd    v18.8h, v4.8h, v6.8h // |even column - odd column|
+    add     v17.8h, v5.8h, v7.8h
+    sabd    v19.8h, v5.8h, v7.8h
+    abs     v16.8h, v16.8h // magnitudes of the sums, so v16-v19 are all non-negative
+    abs     v17.8h, v17.8h
+
+    trn1    v4.8h, v0.8h, v2.8h // repeat the even/odd split for the other two vertical outputs (v0/v2, v1/v3); v4-v7 are free again here
+    trn2    v6.8h, v0.8h, v2.8h
+    trn1    v5.8h, v1.8h, v3.8h
+    trn2    v7.8h, v1.8h, v3.8h
+
+    add     v0.8h, v4.8h, v6.8h // first horizontal stage for that second group
+    sabd    v2.8h, v4.8h, v6.8h
+    add     v1.8h, v5.8h, v7.8h
+    sabd    v3.8h, v5.8h, v7.8h
+    abs     v0.8h, v0.8h
+    abs     v1.8h, v1.8h
+
+    trn1    v4.4s, v16.4s, v18.4s // regroup by 32-bit pairs so results from column pair (0,1) line up with those from (2,3), and (4,5) with (6,7)
+    trn2    v6.4s, v16.4s, v18.4s
+    trn1    v5.4s, v17.4s, v19.4s
+    trn2    v7.4s, v17.4s, v19.4s
+
+    trn1    v16.4s, v0.4s, v2.4s // same regrouping for the second group, reusing v16-v19 as destinations
+    trn2    v18.4s, v0.4s, v2.4s
+    trn1    v17.4s, v1.4s, v3.4s
+    trn2    v19.4s, v1.4s, v3.4s
+
+    smax    v0.8h, v4.8h, v6.8h // lane-wise max of two magnitudes; NOTE(review): max(|s|,|t|) == (|s+t| + |s-t|) / 2, so this appears to stand in for the last butterfly stage plus a halving - confirm intent
+    smax    v1.8h, v5.8h, v7.8h
+    smax    v2.8h, v16.8h, v18.8h
+    smax    v3.8h, v17.8h, v19.8h
+    add     v0.8h, v0.8h, v1.8h // fold the columns 8-15 results into the columns 0-7 results
+    add     v2.8h, v2.8h, v3.8h // outputs are v0 and v2; callers add both into their accumulator
+.endm
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd16x16_AArch64_neon // 16x16 block: four SATD_16x4 strips accumulated in v31, lane total returned in w0
+    sxtw    x1, w1 // widen the int32 row step for x0 so it can serve as a 64-bit post-increment
+    sxtw    x3, w3 // same for the row step applied to x2
+    SATD_16x4 // rows 0-3; results in v0 and v2
+    add     v31.8h, v0.8h, v2.8h // first strip initialises the accumulator, so v31 needs no zeroing
+.rept 3 // remaining three strips: rows 4-7, 8-11, 12-15
+    SATD_16x4
+    add     v31.8h, v31.8h, v0.8h // accumulate both outputs of the strip, still as 16-bit lanes
+    add     v31.8h, v31.8h, v2.8h
+.endr
+    uaddlv  s4, v31.8h // unsigned sum of the eight 16-bit lanes into a 32-bit scalar
+    fmov    w0, s4 // return the total; unlike the 4x4 routine there is no (x + 1) >> 1 step here
+WELS_ASM_ARCH64_FUNC_END // NOTE(review): no explicit ret above - presumably emitted by this macro; confirm
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd16x8_AArch64_neon // 16 columns x 8 rows: two SATD_16x4 strips accumulated in v31, lane total returned in w0
+    sxtw    x1, w1 // widen the int32 row step for x0 so it can serve as a 64-bit post-increment
+    sxtw    x3, w3 // same for the row step applied to x2
+    SATD_16x4 // rows 0-3; results in v0 and v2
+    add     v31.8h, v0.8h, v2.8h // first strip initialises the accumulator, so v31 needs no zeroing
+
+    SATD_16x4 // rows 4-7 (x0/x2 were advanced by the first strip)
+    add     v31.8h, v31.8h, v0.8h // accumulate both outputs of the strip, still as 16-bit lanes
+    add     v31.8h, v31.8h, v2.8h
+
+    uaddlv  s4, v31.8h // unsigned sum of the eight 16-bit lanes into a 32-bit scalar
+    fmov    w0, s4 // return the total; unlike the 4x4 routine there is no (x + 1) >> 1 step here
+WELS_ASM_ARCH64_FUNC_END // NOTE(review): no explicit ret above - presumably emitted by this macro; confirm
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd8x16_AArch64_neon // 8 columns x 16 rows: four SATD_8x4 strips accumulated in v31, lane total returned in w0
+    sxtw    x1, w1 // widen the int32 row step for x0 so it can serve as a 64-bit post-increment
+    sxtw    x3, w3 // same for the row step applied to x2
+    SATD_8x4 // rows 0-3; results in v0 and v1
+    add     v31.8h, v0.8h, v1.8h // first strip initialises the accumulator, so v31 needs no zeroing
+.rept 3 // remaining three strips: rows 4-7, 8-11, 12-15
+    SATD_8x4
+    add     v31.8h, v31.8h, v0.8h // accumulate both outputs of the strip, still as 16-bit lanes
+    add     v31.8h, v31.8h, v1.8h
+.endr
+    uaddlv  s4, v31.8h // unsigned sum of the eight 16-bit lanes into a 32-bit scalar
+    fmov    w0, s4 // return the total; unlike the 4x4 routine there is no (x + 1) >> 1 step here
+WELS_ASM_ARCH64_FUNC_END // NOTE(review): no explicit ret above - presumably emitted by this macro; confirm
+
+WELS_ASM_ARCH64_FUNC_BEGIN WelsSampleSatd8x8_AArch64_neon // 8x8 block: two SATD_8x4 strips accumulated in v31, lane total returned in w0
+    sxtw    x1, w1 // widen the int32 row step for x0 so it can serve as a 64-bit post-increment
+    sxtw    x3, w3 // same for the row step applied to x2
+    SATD_8x4 // rows 0-3; results in v0 and v1
+    add     v31.8h, v0.8h, v1.8h // first strip initialises the accumulator, so v31 needs no zeroing
+
+    SATD_8x4 // rows 4-7 (x0/x2 were advanced by the first strip)
+    add     v31.8h, v31.8h, v0.8h // accumulate both outputs of the strip, still as 16-bit lanes
+    add     v31.8h, v31.8h, v1.8h
+    uaddlv  s4, v31.8h // unsigned sum of the eight 16-bit lanes into a 32-bit scalar
+    fmov    w0, s4 // return the total; unlike the 4x4 routine there is no (x + 1) >> 1 step here
+WELS_ASM_ARCH64_FUNC_END // NOTE(review): no explicit ret above - presumably emitted by this macro; confirm
 #endif
\ No newline at end of file
--- a/codec/encoder/core/inc/sample.h
+++ b/codec/encoder/core/inc/sample.h
@@ -102,6 +102,13 @@
 
 #endif
 
+#if defined (HAVE_NEON_AARCH64) // prototypes for the routines added to codec/encoder/core/arm64/pixel_neon_aarch64.S
+int32_t WelsSampleSatd4x4_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t); // args: byte pointer + row step for one block, then the same pair for the other block
+int32_t WelsSampleSatd16x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t); // 16 columns x 16 rows
+int32_t WelsSampleSatd16x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t); // 16 columns x 8 rows
+int32_t WelsSampleSatd8x16_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t); // 8 columns x 16 rows
+int32_t WelsSampleSatd8x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t); // 8 columns x 8 rows
+#endif
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
--- a/codec/encoder/core/src/sample.cpp
+++ b/codec/encoder/core/src/sample.cpp
@@ -428,6 +428,11 @@
     pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_AArch64_neon;
     pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_AArch64_neon;
 
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4  ] = WelsSampleSatd4x4_AArch64_neon;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8  ] = WelsSampleSatd8x8_AArch64_neon;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_AArch64_neon;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_AArch64_neon;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_AArch64_neon;
   }
 #endif
 }