shithub: openh264

--- a/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj

+++ b/codec/build/iOS/dec/welsdec/welsdec.xcodeproj/project.pbxproj

@@ -30,6 +30,7 @@

 		4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */; };

 		4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A718BC6BE90017DF25 /* block_add_neon.S */; };

 		4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */; };

+		6C749B6A197CC6E600A111F9 /* block_add_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 6C749B69197CC6E600A111F9 /* block_add_aarch64_neon.S */; };

 		9ABF4382193EB60900A6BD61 /* expand_pic.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9ABF4381193EB60900A6BD61 /* expand_pic.cpp */; };

 		9AED66561946A1DE009A3567 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED66551946A1DE009A3567 /* welsCodecTrace.cpp */; };

 		9AED66591946A203009A3567 /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED66581946A203009A3567 /* utils.cpp */; };

@@ -108,6 +109,7 @@

 		4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsDecoderExt.cpp; sourceTree = "<group>"; };

 		4CE447A718BC6BE90017DF25 /* block_add_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = block_add_neon.S; sourceTree = "<group>"; };

 		4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; };

+		6C749B69197CC6E600A111F9 /* block_add_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = block_add_aarch64_neon.S; path = arm64/block_add_aarch64_neon.S; sourceTree = "<group>"; };

 		9ABF4380193EB5F700A6BD61 /* expand_pic.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = expand_pic.h; path = ../../../common/inc/expand_pic.h; sourceTree = "<group>"; };

 		9ABF4381193EB60900A6BD61 /* expand_pic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = expand_pic.cpp; path = ../../../common/src/expand_pic.cpp; sourceTree = "<group>"; };

 		9AED66551946A1DE009A3567 /* welsCodecTrace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = welsCodecTrace.cpp; path = ../../../common/src/welsCodecTrace.cpp; sourceTree = "<group>"; };

@@ -133,6 +135,7 @@

 		4CBC1B7F194AC4A400214D9E /* arm64 */ = {

 			isa = PBXGroup;

 			children = (

+				6C749B69197CC6E600A111F9 /* block_add_aarch64_neon.S */,

 				4CBC1B80194AC4E100214D9E /* intra_pred_aarch64_neon.S */,

 			);

 			name = arm64;

@@ -353,6 +356,7 @@

 				4CE4468A18BC5EAB0017DF25 /* au_parser.cpp in Sources */,

 				4CE4469918BC5EAB0017DF25 /* mv_pred.cpp in Sources */,

 				4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */,

+				6C749B6A197CC6E600A111F9 /* block_add_aarch64_neon.S in Sources */,

 				4CE4469418BC5EAB0017DF25 /* get_intra_predictor.cpp in Sources */,

 				9AED66561946A1DE009A3567 /* welsCodecTrace.cpp in Sources */,

 				F0B204FC18FD23D8005DA23F /* error_concealment.cpp in Sources */,

--- /dev/null

+++ b/codec/decoder/core/arm64/block_add_aarch64_neon.S

@@ -1,0 +1,161 @@

+/*!

+ * \copy

+ *     Copyright (c)  2013, Cisco Systems

+ *     All rights reserved.

+ *

+ *     Redistribution and use in source and binary forms, with or without

+ *     modification, are permitted provided that the following conditions

+ *     are met:

+ *

+ *        * Redistributions of source code must retain the above copyright

+ *          notice, this list of conditions and the following disclaimer.

+ *

+ *        * Redistributions in binary form must reproduce the above copyright

+ *          notice, this list of conditions and the following disclaimer in

+ *          the documentation and/or other materials provided with the

+ *          distribution.

+ *

+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS

+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER

+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN

+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

+ *     POSSIBILITY OF SUCH DAMAGE.

+ *

+ */

+#ifdef HAVE_NEON_AARCH64

+.text

+#include "arm_arch64_common_macro.S"

+#ifdef __APPLE__

+// ROW_TRANSFORM_1_STEP (Apple assembler form, positional operands $0..$9).

+// First butterfly stage of the row pass used by IdctResAddPred_AArch64_neon.

+//   $0-$3 : inputs src[0]..src[3], read as 4 x int16 lanes

+//   $4-$7 : outputs e[i][0]..e[i][3], written as 4 x int32 lanes

+//   $8,$9 : scratch registers, overwritten with src[1]>>1 and src[3]>>1

+.macro ROW_TRANSFORM_1_STEP

+//  {   //  input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9

+    saddl       $4.4s, $0.4h, $2.4h          //int32 e[i][0] = src[0] + src[2];

+    ssubl       $5.4s, $0.4h, $2.4h          //int32 e[i][1] = src[0] - src[2];

+    sshr        $8.4h, $1.4h, #1             // scratch $8 = src[1] >> 1 (signed shift)

+    sshr        $9.4h, $3.4h, #1             // scratch $9 = src[3] >> 1 (signed shift)

+    ssubl       $6.4s, $8.4h, $3.4h          //int32 e[i][2] = (src[1]>>1)-src[3];

+    saddl       $7.4s, $1.4h, $9.4h          //int32 e[i][3] = src[1] + (src[3]>>1);

+//  }

+.endm

+// TRANSFORM_4BYTES (Apple assembler form, positional operands $0..$7).

+// Second butterfly stage; the same macro serves the row and the column pass.

+//   $4-$7 : inputs e[0]..e[3], 4 x int32 lanes each

+//   $0-$3 : outputs f[0]..f[3], 4 x int32 lanes each

+.macro TRANSFORM_4BYTES // both row & col transform used

+//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];

+    add       $0.4s, $4.4s, $7.4s          //int32 f[i][0] = e[i][0] + e[i][3];

+    add       $1.4s, $5.4s, $6.4s          //int32 f[i][1] = e[i][1] + e[i][2];

+    sub       $2.4s, $5.4s, $6.4s          //int32 f[i][2] = e[i][1] - e[i][2];

+    sub       $3.4s, $4.4s, $7.4s          //int32 f[i][3] = e[i][0] - e[i][3];

+//  }

+.endm

+// COL_TRANSFORM_1_STEP (Apple assembler form, positional operands $0..$7).

+// First butterfly stage of the column pass, entirely on 4 x int32 lanes.

+//   $0-$3 : inputs f[0][j]..f[3][j]

+//   $4-$7 : outputs e[0][j]..e[3][j]

+// $6 and $7 first receive the halved $1 and $3 and are then combined with the

+// un-halved $3 and $1, so the output registers must not alias the inputs.

+.macro COL_TRANSFORM_1_STEP

+//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];

+    add        $4.4s, $0.4s, $2.4s          //int32 e[0][j] = f[0][j] + f[2][j];

+    sub        $5.4s, $0.4s, $2.4s          //int32 e[1][j] = f[0][j] - f[2][j];

+    sshr        $6.4s, $1.4s, #1            // $6 = f[1][j] >> 1 (signed shift)

+    sshr        $7.4s, $3.4s, #1            // $7 = f[3][j] >> 1 (signed shift)

+    sub        $6.4s, $6.4s, $3.4s          //int32 e[2][j] = (f[1][j]>>1) - f[3][j];

+    add        $7.4s, $1.4s, $7.4s          //int32 e[3][j] = f[1][j] + (f[3][j]>>1);

+//  }

+.endm

+#else

+// ROW_TRANSFORM_1_STEP (named-argument form for non-Apple assemblers).

+// First butterfly stage of the row pass used by IdctResAddPred_AArch64_neon.

+//   arg0-arg3 : inputs src[0]..src[3], read as 4 x int16 lanes

+//   arg4-arg7 : outputs e[i][0]..e[i][3], written as 4 x int32 lanes

+//   arg8,arg9 : scratch registers, overwritten with src[1]>>1 and src[3]>>1

+.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9

+//  {   //  input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8\() \arg9\()

+    saddl       \arg4\().4s, \arg0\().4h, \arg2\().4h          //int32 e[i][0] = src[0] + src[2];

+    ssubl       \arg5\().4s, \arg0\().4h, \arg2\().4h          //int32 e[i][1] = src[0] - src[2];

+    sshr        \arg8\().4h, \arg1\().4h, #1                   // scratch arg8 = src[1] >> 1 (signed shift)

+    sshr        \arg9\().4h, \arg3\().4h, #1                   // scratch arg9 = src[3] >> 1 (signed shift)

+    ssubl       \arg6\().4s, \arg8\().4h, \arg3\().4h          //int32 e[i][2] = (src[1]>>1)-src[3];

+    saddl       \arg7\().4s, \arg1\().4h, \arg9\().4h          //int32 e[i][3] = src[1] + (src[3]>>1);

+//  }

+.endm

+// TRANSFORM_4BYTES (named-argument form for non-Apple assemblers).

+// Second butterfly stage; the same macro serves the row and the column pass.

+//   arg4-arg7 : inputs e[0]..e[3], 4 x int32 lanes each

+//   arg0-arg3 : outputs f[0]..f[3], 4 x int32 lanes each

+.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

+// both row & col transform used

+//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];

+    add       \arg0\().4s, \arg4\().4s, \arg7\().4s          //int32 f[i][0] = e[i][0] + e[i][3];

+    add       \arg1\().4s, \arg5\().4s, \arg6\().4s          //int32 f[i][1] = e[i][1] + e[i][2];

+    sub       \arg2\().4s, \arg5\().4s, \arg6\().4s          //int32 f[i][2] = e[i][1] - e[i][2];

+    sub       \arg3\().4s, \arg4\().4s, \arg7\().4s          //int32 f[i][3] = e[i][0] - e[i][3];

+//  }

+.endm

+// COL_TRANSFORM_1_STEP (named-argument form for non-Apple assemblers).

+// First butterfly stage of the column pass, entirely on 4 x int32 lanes.

+//   arg0-arg3 : inputs f[0][j]..f[3][j]

+//   arg4-arg7 : outputs e[0][j]..e[3][j]

+// arg6 and arg7 first receive the halved arg1 and arg3 and are then combined

+// with the un-halved arg3 and arg1, so the outputs must not alias the inputs.

+.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

+//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];

+    add        \arg4\().4s, \arg0\().4s, \arg2\().4s          //int32 e[0][j] = f[0][j] + f[2][j];

+    sub        \arg5\().4s, \arg0\().4s, \arg2\().4s          //int32 e[1][j] = f[0][j] - f[2][j];

+    sshr        \arg6\().4s, \arg1\().4s, #1                  // arg6 = f[1][j] >> 1 (signed shift)

+    sshr        \arg7\().4s, \arg3\().4s, #1                  // arg7 = f[3][j] >> 1 (signed shift)

+    sub        \arg6\().4s, \arg6\().4s, \arg3\().4s          //int32 e[2][j] = (f[1][j]>>1) - f[3][j];

+    add        \arg7\().4s, \arg1\().4s, \arg7\().4s          //int32 e[3][j] = f[1][j] + (f[3][j]>>1);

+//  }

+.endm

+#endif

+// SetNonZeroCount_AArch64_neon: rewrites the 24 int8 entries at x0 in place,

+// storing 1 for every non-zero entry and 0 for every zero entry.

+// x0    int8_t* non_zero_count,

+WELS_ASM_AARCH64_FUNC_BEGIN SetNonZeroCount_AArch64_neon

+    mov x1, x0                  // x1 is advanced by the loads, x0 is kept for the stores

+    ld1 {v0.16b}, [x1], #16     // entries 0..15

+    ld1 {v1.8b}, [x1]           // entries 16..23

+    cmeq v0.16b, v0.16b, #0     // 0xFF where the entry is zero, 0x00 otherwise

+    cmeq v1.8b, v1.8b, #0

+    mvn  v0.16b, v0.16b         // invert: 0xFF (-1) where the entry is non-zero

+    mvn  v1.8b, v1.8b

+    abs  v0.16b, v0.16b         // abs(-1) = 1, abs(0) = 0

+    abs  v1.8b, v1.8b

+    st1 {v0.16b}, [x0], #16     // write back entries 0..15

+    st1 {v1.8b}, [x0]           // write back entries 16..23

+WELS_ASM_AARCH64_FUNC_END

+// IdctResAddPred_AArch64_neon: runs the two-pass (row, then column) 4x4

+// transform on the 16 coefficients at rs, rounds the result with a shift by 6,

+// adds it to the 4x4 block of bytes at pred and stores the sum back to pred,

+// saturated to [0, 255]. rs is only read.

+//   x0 : uint8_t *pred   (consecutive rows are stride bytes apart)

+//   w1 : int32_t stride

+//   x2 : int16_t *rs

+//  uint8_t *pred, const int32_t stride, int16_t *rs

+WELS_ASM_AARCH64_FUNC_BEGIN IdctResAddPred_AArch64_neon

+    // stride arrives as a 32-bit value in w1; AAPCS64 leaves the upper half of

+    // x1 unspecified, so sign-extend it before x1 is used as an address offset.

+    sxtw       x1, w1

+    // de-interleaving load: vK lane i = rs[4*i + K] (original note: cost 3 cycles!)

+    ld4        {v0.4h, v1.4h, v2.4h, v3.4h}, [x2]

+    ROW_TRANSFORM_1_STEP        v0, v1, v2, v3, v16, v17, v18, v19, v4, v5

+    TRANSFORM_4BYTES        v0, v1, v2, v3, v16, v17, v18, v19

+    // transpose the 4x4 block of 32-bit elements

+    trn1        v16.4s, v0.4s, v1.4s //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]

+    trn2        v17.4s, v0.4s, v1.4s //[0 1 2 3]+[4 5 6 7]-->[1 5 3 7]

+    trn1        v18.4s, v2.4s, v3.4s //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]

+    trn2        v19.4s, v2.4s, v3.4s //[8 9 10 11]+[12 13 14 15]-->[9 13 11 15]

+    trn1        v0.2d, v16.2d, v18.2d //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]

+    trn2        v2.2d, v16.2d, v18.2d //[0 4 2 6]+[8 12 10 14]-->[2 6 10 14]

+    trn1        v1.2d, v17.2d, v19.2d //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]

+    trn2        v3.2d, v17.2d, v19.2d //[1 5 3 7]+[9 13 11 15]-->[3 7 11 15]

+    COL_TRANSFORM_1_STEP        v0, v1, v2, v3, v16, v17, v18, v19

+    TRANSFORM_4BYTES        v0, v1, v2, v3, v16, v17, v18, v19

+    // round, add the prediction and saturate the sum into [0, 255]

+    mov         x2, x0                 // keep the start of pred for the stores; x0 is advanced by the loads

+    ld1     {v16.s}[0],[x0],x1         // pred row 0

+    ld1     {v16.s}[1],[x0],x1         // pred row 1

+    ld1     {v17.s}[0],[x0],x1         // pred row 2

+    ld1     {v17.s}[1],[x0]            // pred row 3

+    rshrn     v0.4h, v0.4s, #6         // rounding shift right by 6, narrowed to int16: row 0

+    rshrn2    v0.8h, v1.4s, #6         // row 1 into the upper half of v0

+    rshrn     v1.4h, v2.4s, #6         // row 2

+    rshrn2    v1.8h, v3.4s, #6         // row 3 into the upper half of v1

+    uxtl      v2.8h,v16.8b             // zero-extend pred rows 0-1 to 16 bits

+    uxtl      v3.8h,v17.8b             // zero-extend pred rows 2-3 to 16 bits

+    add        v2.8h, v2.8h, v0.8h

+    add        v3.8h, v3.8h, v1.8h

+    sqxtun     v0.8b,v2.8h             // signed 16-bit -> unsigned 8-bit with saturation

+    sqxtun     v1.8b,v3.8h

+    st1     {v0.s}[0],[x2],x1          // store rows 0..3 back to pred

+    st1     {v0.s}[1],[x2],x1

+    st1     {v1.s}[0],[x2],x1

+    st1     {v1.s}[1],[x2]

+WELS_ASM_AARCH64_FUNC_END

+#endif

--- a/codec/decoder/core/inc/decode_mb_aux.h

+++ b/codec/decoder/core/inc/decode_mb_aux.h

@@ -52,6 +52,11 @@

 void IdctResAddPred_neon (uint8_t* pred, const int32_t stride, int16_t* rs);

 #endif

+#if defined(HAVE_NEON_AARCH64)

+// AArch64 NEON implementation, defined in core/arm64/block_add_aarch64_neon.S.

+void IdctResAddPred_AArch64_neon (uint8_t* pred, const int32_t stride, int16_t* rs);

+#endif // HAVE_NEON_AARCH64

 #if defined(__cplusplus)

 #endif//__cplusplus

--- a/codec/decoder/core/inc/decode_slice.h

+++ b/codec/decoder/core/inc/decode_slice.h

@@ -67,6 +67,9 @@

 void SetNonZeroCount_neon (int8_t* pNonZeroCount);

 #endif

+#if defined(HAVE_NEON_AARCH64)

+// AArch64 NEON implementation, defined in core/arm64/block_add_aarch64_neon.S.

+void SetNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);

+#endif // HAVE_NEON_AARCH64

 #ifdef __cplusplus

 #endif//__cplusplus

--- a/codec/decoder/core/src/decode_slice.cpp

+++ b/codec/decoder/core/src/decode_slice.cpp

@@ -1063,6 +1063,12 @@

     pFunc->pWelsSetNonZeroCountFunc		= SetNonZeroCount_neon;

 #endif

+#ifdef	HAVE_NEON_AARCH64

+    if (iCpu & WELS_CPU_NEON) {

+        pFunc->pWelsSetNonZeroCountFunc		= SetNonZeroCount_AArch64_neon;

+    }

+#endif

 void SetNonZeroCount_c (int8_t* pNonZeroCount) {

--- a/codec/decoder/core/src/decoder.cpp

+++ b/codec/decoder/core/src/decoder.cpp

@@ -690,7 +690,7 @@

 #if defined(HAVE_NEON_AARCH64)

   if (pCtx->uiCpuFlag & WELS_CPU_NEON) {

-    //pCtx->pIdctResAddPredFunc	= IdctResAddPred_neon;

+    pCtx->pIdctResAddPredFunc	= IdctResAddPred_AArch64_neon;

     pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_AArch64_neon;

     pCtx->pGetI16x16LumaPredFunc[I16_PRED_P]  = WelsDecoderI16x16LumaPredPlane_AArch64_neon;

--- a/codec/decoder/core/src/error_concealment.cpp

+++ b/codec/decoder/core/src/error_concealment.cpp

@@ -61,6 +61,13 @@

       pCtx->sCopyFunc.pCopyChromaFunc		= WelsCopy8x8_neon; //aligned

 #endif //HAVE_NEON

+#if defined(HAVE_NEON_AARCH64)

+    if (pCtx->uiCpuFlag & WELS_CPU_NEON) {

+        pCtx->sCopyFunc.pCopyLumaFunc		= WelsCopy16x16_AArch64_neon; //aligned

+        pCtx->sCopyFunc.pCopyChromaFunc		= WelsCopy8x8_AArch64_neon; //aligned

+    }

+#endif //HAVE_NEON_AARCH64

   } //TODO add more methods here

   return;

--- a/codec/decoder/targets.mk

+++ b/codec/decoder/targets.mk

@@ -41,6 +41,7 @@

 ifeq ($(ASM_ARCH), arm64)

 DECODER_ASM_ARM64_SRCS=\

+	$(DECODER_SRCDIR)/core/arm64/block_add_aarch64_neon.S\

 	$(DECODER_SRCDIR)/core/arm64/intra_pred_aarch64_neon.S\

 DECODER_OBJS += $(DECODER_ASM_ARM64_SRCS:.S=.$(OBJ))

--- a/test/decoder/DecUT_IdctResAddPred.cpp

+++ b/test/decoder/DecUT_IdctResAddPred.cpp

@@ -1,6 +1,7 @@

 #include <gtest/gtest.h>

 #include "macros.h"

 #include "decode_mb_aux.h"

+#include "../../codec/decoder/core/src/decode_slice.cpp"

 using namespace WelsDec;

 void IdctResAddPred_ref (uint8_t* pPred, const int32_t kiStride, int16_t* pRs) {

   int16_t iSrc[16];

@@ -39,6 +40,14 @@

+// Reference implementation used by the tests below: replaces each of the 24

+// entries of pNonZeroCount in place with 1 if it is non-zero and 0 otherwise.

+void SetNonZeroCount_ref (int8_t* pNonZeroCount) {

+  int32_t i;

+  for (i = 0; i < 24; i++) {

+    pNonZeroCount[i] = !!pNonZeroCount[i];  // !! collapses any non-zero value to 1

+  }

+}

 #define GENERATE_IDCTRESADDPRED(pred) \

 TEST(DecoderDecodeMbAux, pred) {\

   const int32_t kiStride = 32;\

@@ -78,4 +87,48 @@

 #if defined(HAVE_NEON)

 GENERATE_IDCTRESADDPRED (IdctResAddPred_neon)

+#endif

+#if defined(HAVE_NEON_AARCH64)

+// Instantiate the IdctResAddPred test for the AArch64 NEON implementation.

+GENERATE_IDCTRESADDPRED (IdctResAddPred_AArch64_neon)

+#endif // HAVE_NEON_AARCH64

+// GENERATE_SETNONZEROCOUNT(method): defines a gtest case named after method

+// that runs it against SetNonZeroCount_ref on 24-entry int8_t buffers.

+// Three inputs are checked: values drawn from rand() in [-128, 127], then all

+// entries set to -128, then all entries set to 127. Both buffers start with

+// the same contents and must match entry by entry afterwards.

+#define GENERATE_SETNONZEROCOUNT(method) \

+TEST(DecoderDecodeMbAux, method) \

+{\

+    int8_t iNonZeroCount[2][24];\

+    for(int32_t i = 0; i < 24; i++) {\

+        iNonZeroCount[0][i] = iNonZeroCount[1][i] = (rand() % 256)-128;\

+    }\

+    method(iNonZeroCount[0]);\

+    SetNonZeroCount_ref(iNonZeroCount[1]);\

+    for(int32_t i =0; i<24; i++) {\

+        ASSERT_EQ (iNonZeroCount[0][i], iNonZeroCount[1][i]);\

+    }\

+    for(int32_t i =0; i<24; i++) {\

+        iNonZeroCount[0][i] = iNonZeroCount[1][i] = -128;\

+    }\

+    method(iNonZeroCount[0]);\

+    SetNonZeroCount_ref(iNonZeroCount[1]);\

+    for(int32_t i =0; i<24; i++) {\

+        ASSERT_EQ (iNonZeroCount[0][i], iNonZeroCount[1][i]);\

+    }\

+    for(int32_t i =0; i<24; i++) {\

+        iNonZeroCount[0][i] = iNonZeroCount[1][i] = 127;\

+    }\

+    method(iNonZeroCount[0]);\

+    SetNonZeroCount_ref(iNonZeroCount[1]);\

+    for(int32_t i =0; i<24; i++) {\

+        ASSERT_EQ (iNonZeroCount[0][i], iNonZeroCount[1][i]);\

+    }\

+}

+// Instantiate the SetNonZeroCount test for the C version and for each NEON

+// version that is compiled in.

+GENERATE_SETNONZEROCOUNT (SetNonZeroCount_c)

+#if defined(HAVE_NEON)

+GENERATE_SETNONZEROCOUNT (SetNonZeroCount_neon)

+#endif // HAVE_NEON

+#if defined(HAVE_NEON_AARCH64)

+GENERATE_SETNONZEROCOUNT (SetNonZeroCount_AArch64_neon)

 #endif

--

⑨