shithub: openh264

--- a/codec/processing/build/iOS/processing.xcodeproj/project.pbxproj

+++ b/codec/processing/build/iOS/processing.xcodeproj/project.pbxproj

@@ -11,6 +11,7 @@

 		4C34067918C5A4AD00DFA14A /* down_sample_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067518C5A4AD00DFA14A /* down_sample_neon.S */; };

 		4C34067A18C5A4AD00DFA14A /* pixel_sad_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067618C5A4AD00DFA14A /* pixel_sad_neon.S */; };

 		4C34067B18C5A4AD00DFA14A /* vaa_calc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067718C5A4AD00DFA14A /* vaa_calc_neon.S */; };

+		4CB64B48196A383F00CBF0A3 /* down_sample_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CB64B47196A383F00CBF0A3 /* down_sample_aarch64_neon.S */; };

 		4CE4443518B724B60017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4443418B724B60017DF25 /* Foundation.framework */; };

 		4CE4478B18BC62960017DF25 /* AdaptiveQuantization.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4475D18BC62960017DF25 /* AdaptiveQuantization.cpp */; };

 		4CE4478F18BC62960017DF25 /* BackgroundDetection.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4476418BC62960017DF25 /* BackgroundDetection.cpp */; };

@@ -49,6 +50,7 @@

 		4C34067518C5A4AD00DFA14A /* down_sample_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = down_sample_neon.S; sourceTree = "<group>"; };

 		4C34067618C5A4AD00DFA14A /* pixel_sad_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_sad_neon.S; sourceTree = "<group>"; };

 		4C34067718C5A4AD00DFA14A /* vaa_calc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = vaa_calc_neon.S; sourceTree = "<group>"; };

+		4CB64B47196A383F00CBF0A3 /* down_sample_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = down_sample_aarch64_neon.S; path = arm64/down_sample_aarch64_neon.S; sourceTree = "<group>"; };

 		4CE4443118B724B60017DF25 /* libprocessing.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libprocessing.a; sourceTree = BUILT_PRODUCTS_DIR; };

 		4CE4443418B724B60017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };

 		4CE4444518B724B60017DF25 /* UIKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = UIKit.framework; path = Library/Frameworks/UIKit.framework; sourceTree = DEVELOPER_DIR; };

@@ -112,6 +114,14 @@

 			path = arm;

 			sourceTree = "<group>";

 		};

+		4CB64B46196A382800CBF0A3 /* arm64 */ = {

+			isa = PBXGroup;

+			children = (

+				4CB64B47196A383F00CBF0A3 /* down_sample_aarch64_neon.S */,

+			);

+			name = arm64;

+			sourceTree = "<group>";

+		};

 		4CE4442818B724B60017DF25 = {

 			isa = PBXGroup;

 			children = (

@@ -151,6 +161,7 @@

 		4CE4475B18BC62960017DF25 /* src */ = {

 			isa = PBXGroup;

 			children = (

+				4CB64B46196A382800CBF0A3 /* arm64 */,

 				FAC77E9E18F7B09C0038A4E4 /* scrolldetection */,

 				4C34067318C5A4AD00DFA14A /* arm */,

 				4CE4475C18BC62960017DF25 /* adaptivequantization */,

@@ -338,6 +349,7 @@

 				4CE4479B18BC62960017DF25 /* SceneChangeDetection.cpp in Sources */,

 				4CE4479D18BC62960017DF25 /* vaacalcfuncs.cpp in Sources */,

 				FAC77EA318F7B09C0038A4E4 /* ScrollDetection.cpp in Sources */,

+				4CB64B48196A383F00CBF0A3 /* down_sample_aarch64_neon.S in Sources */,

 				4CE4479818BC62960017DF25 /* downsamplefuncs.cpp in Sources */,

 				4CE4479418BC62960017DF25 /* ComplexityAnalysis.cpp in Sources */,

 				4CE4479E18BC62960017DF25 /* vaacalculation.cpp in Sources */,

@@ -427,7 +439,10 @@

 				DSTROOT = /tmp/processing.dst;

 				GCC_C_LANGUAGE_STANDARD = "compiler-default";

 				GCC_OPTIMIZATION_LEVEL = 3;

-				"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*][arch=arm64]" = APPLE_IOS;

+				"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*][arch=arm64]" = (

+					APPLE_IOS,

+					HAVE_NEON_AARCH64,

+				);

 				"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*][arch=armv7]" = (

 					APPLE_IOS,

 					HAVE_NEON,

@@ -443,6 +458,7 @@

 					"$(SRCROOT)/../../src/common",

 					"$(SRCROOT)/../../interface",

 					"$(SRCROOT)/../../../common/arm",

+					"$(SRCROOT)/../../../common/arm64",

 				);

 				IPHONEOS_DEPLOYMENT_TARGET = 6.1;

 				ONLY_ACTIVE_ARCH = NO;

@@ -461,7 +477,10 @@

 				CODE_SIGN_IDENTITY = "iPhone Developer";

 				DSTROOT = /tmp/processing.dst;

 				GCC_C_LANGUAGE_STANDARD = "compiler-default";

-				"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*][arch=arm64]" = APPLE_IOS;

+				"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*][arch=arm64]" = (

+					APPLE_IOS,

+					HAVE_NEON_AARCH64,

+				);

 				"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*][arch=armv7]" = (

 					APPLE_IOS,

 					HAVE_NEON,

@@ -477,6 +496,7 @@

 					"$(SRCROOT)/../../src/common",

 					"$(SRCROOT)/../../interface",

 					"$(SRCROOT)/../../../common/arm",

+					"$(SRCROOT)/../../../common/arm64",

 				);

 				IPHONEOS_DEPLOYMENT_TARGET = 6.1;

 				OTHER_LDFLAGS = "-ObjC";

--- /dev/null

+++ b/codec/processing/src/arm64/down_sample_aarch64_neon.S

@@ -1,0 +1,226 @@

+/*!

+ * \copy

+ *     Copyright (c)  2013, Cisco Systems

+ *     All rights reserved.

+ *

+ *     Redistribution and use in source and binary forms, with or without

+ *     modification, are permitted provided that the following conditions

+ *     are met:

+ *

+ *        * Redistributions of source code must retain the above copyright

+ *          notice, this list of conditions and the following disclaimer.

+ *

+ *        * Redistributions in binary form must reproduce the above copyright

+ *          notice, this list of conditions and the following disclaimer in

+ *          the documentation and/or other materials provided with the

+ *          distribution.

+ *

+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS

+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER

+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN

+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

+ *     POSSIBILITY OF SUCH DAMAGE.

+ *

+ */

+#ifdef HAVE_NEON_AARCH64

+.text

+#include "arm_arch64_common_macro.S"

+// DyadicBilinearDownsampler_AArch64_neon

+// Halves an 8-bit plane in both directions. Horizontally adjacent byte pairs

+// are averaged with rounding in each of two consecutive source rows, and the

+// two row results are then averaged with rounding to give one output byte.

+// Register use suggests the arguments are (TODO confirm against the

+// HalveDownsampleFunc typedef, which is not visible here):

+//   x0 = dst, w1 = dst stride, x2 = src, w3 = src stride,

+//   w4 = src width, w5 = src height

+// Every inner iteration reads 32 bytes from each of the two source rows and

+// stores 16 output bytes.

+WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearDownsampler_AArch64_neon

+    // Initialize the working registers

+    mov x6, x2                      // x6 = start of the current source row pair

+    mov x8, x0                      // x8 = start of the current output row

+    mov w9, #0                      // w9 = source bytes consumed in the current row

+    lsr w5, w5, #1                  // w5 = number of output rows (w5 / 2)

+    // Save the 16 bytes at x0 + w1 * w5 (just past the last output row); they

+    // are written back on exit, presumably because the 16-byte stores of the

+    // final row can run past the end of the output.

+    smaddl  x7, w1, w5, x0

+    ld1 {v4.16b}, [x7]

+    add x7, x2, w3, sxtw            // x7 = second source row of the pair

+    // Process the plane; this label is the target of both the per-row and the

+    // per-row-pair branch below.

+comp_ds_bilinear_loop0:

+    ld1     {v0.16b, v1.16b}, [x2], #32     // 32 bytes of the upper row

+    ld1     {v2.16b, v3.16b}, [x7], #32     // 32 bytes of the lower row

+    uaddlp  v0.8h, v0.16b                   // pairwise widening add: a + b per pair

+    uaddlp  v1.8h, v1.16b

+    uaddlp  v2.8h, v2.16b

+    uaddlp  v3.8h, v3.16b

+    urshr   v0.8h, v0.8h, #1                // rounded halving: (a + b + 1) >> 1

+    urshr   v1.8h, v1.8h, #1

+    urshr   v2.8h, v2.8h, #1

+    urshr   v3.8h, v3.8h, #1

+    urhadd  v0.8h, v0.8h, v2.8h             // rounded average of upper and lower row

+    urhadd  v1.8h, v1.8h, v3.8h

+    xtn     v0.8b, v0.8h                    // narrow back to bytes

+    xtn     v1.8b, v1.8h

+    st1     {v0.8b, v1.8b}, [x0], #16       // store 16 output bytes

+    add     w9, w9, #32

+    cmp     w9, w4

+    b.cc    comp_ds_bilinear_loop0          // continue while w9 < w4 (unsigned)

+    // Row pair finished: advance to the next pair of source rows and the next

+    // output row.

+    mov     w9, #0

+    add     x6, x6, w3, sxtw #1             // x6 += 2 * w3

+    mov     x2, x6

+    add     x7, x2, w3, sxtw

+    add     x8, x8, w1, sxtw                // x8 += w1

+    mov     x0, x8

+    sub     w5, w5, #1

+    cbnz    w5, comp_ds_bilinear_loop0

+    // Restore the 16 bytes saved on entry; x0 now holds the same address they

+    // were loaded from.

+    st1     {v4.16b}, [x0]

+WELS_ASM_AARCH64_FUNC_END

+// DyadicBilinearDownsamplerWidthx32_AArch64_neon

+// Same 2:1 rounded averaging as the general dyadic routine, but the row loop

+// counts whole 32-byte groups (w4 >> 5), so nothing is written past w4 / 2

+// output bytes per row and no bytes need to be saved and restored.

+// Register use suggests the arguments are (TODO confirm against the

+// HalveDownsampleFunc typedef, which is not visible here):

+//   x0 = dst, w1 = dst stride, x2 = src, w3 = src stride,

+//   w4 = src width, w5 = src height

+// NOTE(review): both counters are decremented before being tested, so w4 >> 5

+// and w5 >> 1 must be non-zero on entry.

+WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_AArch64_neon

+    sub     w9, w3, w4                      // w9 = w3 - w4: bytes to skip at the end of a source row

+    sub     w1, w1, w4, lsr #1              // w1 = w1 - w4 / 2: bytes to skip at the end of an output row

+    lsr     w5, w5, #1                      // w5 = number of output rows

+    // Process one pair of source rows per iteration

+comp_ds_bilinear_w_x32_loop0:

+    lsr     w6, w4, #5                      // w6 = number of 32-byte groups in a row

+    add     x7, x2, w3, sxtw                // x7 = second source row of the pair

+    // Process one 32-byte group of the row pair per iteration

+comp_ds_bilinear_w_x32_loop1:

+    ld1     {v0.16b, v1.16b}, [x2], #32     // 32 bytes of the upper row

+    ld1     {v2.16b, v3.16b}, [x7], #32     // 32 bytes of the lower row

+    uaddlp  v0.8h, v0.16b                   // pairwise widening add: a + b per pair

+    uaddlp  v1.8h, v1.16b

+    uaddlp  v2.8h, v2.16b

+    uaddlp  v3.8h, v3.16b

+    urshr   v0.8h, v0.8h, #1                // rounded halving: (a + b + 1) >> 1

+    urshr   v1.8h, v1.8h, #1

+    urshr   v2.8h, v2.8h, #1

+    urshr   v3.8h, v3.8h, #1

+    urhadd  v0.8h, v0.8h, v2.8h             // rounded average of upper and lower row

+    urhadd  v1.8h, v1.8h, v3.8h

+    xtn     v0.8b, v0.8h                    // narrow back to bytes

+    xtn     v1.8b, v1.8h

+    st1     {v0.8b, v1.8b}, [x0], #16       // store 16 output bytes

+    sub     w6, w6, #1

+    cbnz    w6, comp_ds_bilinear_w_x32_loop1

+    // x7 has advanced by w4 bytes along the lower row; adding w9 moves it to

+    // the start of the row after it, which becomes the next upper row.

+    add     x2, x7, w9, sxtw

+    add     x0, x0, w1, sxtw                // skip to the start of the next output row

+    sub     w5, w5, #1

+    cbnz    w5, comp_ds_bilinear_w_x32_loop0

+WELS_ASM_AARCH64_FUNC_END

+// GeneralBilinearAccurateDownsampler_AArch64_neon

+// Arbitrary-ratio bilinear downsampler for an 8-bit plane.

+// Arguments (AAPCS64 register assignment for the prototype in downsample.h):

+//   x0 = pDst,  w1 = kiDstStride,  w2 = kiDstWidth,  w3 = kiDstHeight,

+//   x4 = pSrc,  w5 = kiSrcStride,  w6 = kuiScaleX,   w7 = kuiScaleY

+// Source coordinates are tracked as fixed-point values whose integer part is

+// obtained with ">> 15"; both start at 16384 and advance by the scale factor.

+// All rows except the last, and all columns except the last, are interpolated

+// from a 2x2 source neighbourhood; the last column of each row and the whole

+// last row are copied from the nearest source byte without interpolation.

+// The 32-bit arguments that are used through X registers are widened

+// explicitly, because the procedure call standard leaves the upper 32 bits of

+// a register holding a 32-bit argument unspecified.

+// NOTE(review): the loop counters are decremented before being tested, so a

+// kiDstWidth or kiDstHeight of 1 would not terminate as intended; callers

+// presumably guarantee larger sizes.

+WELS_ASM_AARCH64_FUNC_BEGIN GeneralBilinearAccurateDownsampler_AArch64_neon

+    mov     w10, #32767

+    and     w8, w6, w10             // w8 = uinc: low 15 bits of kuiScaleX

+    mov     w11, #-1

+    mul     w12, w11, w8            // w12 = -uinc

+    dup     v2.4h, w8

+    dup     v0.4h, w12

+    zip1    v0.4h, v0.4h, v2.4h     // uinc -uinc uinc -uinc

+    and     w9, w7, w10             // w9 = vinc: low 15 bits of kuiScaleY

+    mul     w12, w11, w9            // w12 = -vinc

+    dup     v2.4h, w9

+    dup     v5.4h, w12

+    ins     v5.s[1], v2.s[0]        // vinc vinc -vinc -vinc

+    mov     w11, #0x40000000

+    mov     w12, #0x3FFF

+    add     w11, w11, w12

+    dup     v1.2s, w11              //init u  16384 16383 16384 16383

+    mov     w8, #16384              // w8 = vertical source position (fixed point)

+    dup     v7.4h, w8

+    sub     w11, w8, #1

+    dup     v2.4h, w11

+    ins     v7.s[0], v2.s[0]        //init v  16384 16384 16383 16383

+    eor     v26.16b, v26.16b, v26.16b

+    eor     v27.16b, v27.16b, v27.16b

+    // Widen the 32-bit arguments that are used as 64-bit values below.

+    sxtw    x1, w1                  // kiDstStride (signed)

+    sxtw    x2, w2                  // kiDstWidth  (signed)

+    sxtw    x3, w3                  // kiDstHeight (signed)

+    mov     w6, w6                  // kuiScaleX (unsigned): a W write clears bits 63:32

+    sub     x1, x1, x2              // x1 = bytes to skip at the end of each output row

+    sub     x3, x3, #1              // x3 = number of interpolated rows (all but the last)

+_HEIGHT:

+    lsr     w11, w8, #15            // integer source row

+    mul     w11, w11, w5

+    add     x15, x4, w11, sxtw      // x15 = upper source row

+    add     x12, x15, w5, sxtw      // x12 = source row below it

+    mov     x9, #16384              // x9 = horizontal source position (fixed point)

+    sub     x10, x2, #1             // x10 = number of interpolated columns (all but the last)

+    orr     v6.8b, v1.8b, v1.8b     // v6 = copy of the initial horizontal weights

+_WIDTH:

+    lsr     x13, x9, #15            // integer source column

+    add     x14, x15, x13

+    ld2     {v26.b, v27.b}[0], [x14]  // upper row: a -> v26.b[0], b -> v27.b[0]

+    add     x14, x12, x13

+    ld2     {v26.b, v27.b}[4], [x14]  // lower row: c -> v26.b[4], d -> v27.b[4]

+    zip1    v28.2s, v26.2s, v27.2s  // v28.2s = a, b

+    zip2    v29.2s, v26.2s, v27.2s  // v29.2s = c, d

+    umull   v20.4s, v6.4h, v7.4h    // four 32-bit weights: horizontal * vertical

+    umull   v21.2d, v28.2s, v20.2s  // a and b times the two low weights

+    ins     v20.d[0], v20.d[1]      // move the two high weights into the low half

+    umlal   v21.2d, v29.2s, v20.2s  // accumulate c and d times those weights

+    addp    d21, v21.2d             // sum of the four weighted samples

+    urshr   d21, d21, #30           // rounding shift back to 8-bit range

+    st1     {v21.b}[0], [x0], #1

+    add     x9, x9, x6              // advance the horizontal position by kuiScaleX

+    add     v6.4h, v6.4h, v0.4h     // update the horizontal weights

+    shl     v6.4h, v6.4h, #1        // shl + ushr clears bit 15 of every lane,

+    ushr    v6.4h, v6.4h, #1        // keeping the weights within 15 bits

+    sub     x10, x10, #1

+    cbnz    x10, _WIDTH

+WIDTH_END:

+    // Last column of the row: copy the nearest byte of the upper source row.

+    lsr     x9, x9, #15

+    add     x14, x15, x9

+    ld1     {v21.b}[0], [x14]

+    st1     {v21.b}[0], [x0], #1

+    add     w8, w8, w7              // advance the vertical position by kuiScaleY

+    add     x0, x0, x1              // skip to the start of the next output row

+    add     v7.4h, v7.4h, v5.4h     // update the vertical weights

+    shl     v7.4h, v7.4h, #1        // clear bit 15 of every lane, as above

+    ushr    v7.4h, v7.4h, #1

+    sub     x3, x3, #1

+    cbnz    x3, _HEIGHT

+LAST_ROW:

+    // Last output row: every byte is copied from the nearest source byte.

+    lsr     w8, w8, #15

+    mul     w8, w8, w5

+    add     x4, x4, w8, sxtw        // x4 = source row for the last output row

+    mov     x9, #16384

+_LAST_ROW_WIDTH:

+    mov     x11, x9

+    lsr     x11, x11, #15           // integer source column

+    add     x3, x4, x11

+    ld1     {v21.b}[0], [x3]

+    st1     {v21.b}[0], [x0], #1

+    add     x9, x9, x6

+    sub     x2, x2, #1

+    cbnz    x2, _LAST_ROW_WIDTH

+WELS_ASM_AARCH64_FUNC_END

+#endif

\ No newline at end of file

--- a/codec/processing/src/downsample/downsample.cpp

+++ b/codec/processing/src/downsample/downsample.cpp

@@ -85,6 +85,17 @@

     sDownsampleFunc.pfGeneralRatioLuma	 = GeneralBilinearAccurateDownsamplerWrap_neon;

 #endif

+#if defined(HAVE_NEON_AARCH64)

+  if (iCpuFlag & WELS_CPU_NEON) {

+    sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_AArch64_neon;

+    sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_AArch64_neon;

+    sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_AArch64_neon;

+    sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_AArch64_neon;

+    sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;

+    sDownsampleFunc.pfGeneralRatioLuma	 = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;

+  }

+#endif

 EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDstPixMap) {

--- a/codec/processing/src/downsample/downsample.h

+++ b/codec/processing/src/downsample/downsample.h

@@ -120,6 +120,21 @@

 WELSVP_EXTERN_C_END

 #endif

+#ifdef HAVE_NEON_AARCH64

+WELSVP_EXTERN_C_BEGIN

+// AArch64 NEON 2:1 downsampler; no restriction on iSrcWidth.

+HalveDownsampleFunc		DyadicBilinearDownsampler_AArch64_neon;

+// AArch64 NEON 2:1 downsampler; iSrcWidth must be a multiple of 32 pixels.

+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx32_AArch64_neon;

+// Wrapper that derives the fixed-point scale factors from the source and

+// destination sizes and then calls GeneralBilinearAccurateDownsampler_AArch64_neon.

+GeneralDownsampleFunc   GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;

+// Assembly kernel for arbitrary-ratio bilinear downsampling; kuiScaleX and

+// kuiScaleY are the horizontal and vertical scale factors computed by the wrapper.

+void GeneralBilinearAccurateDownsampler_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,

+                                                      uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);

+WELSVP_EXTERN_C_END

+#endif

 class CDownsampling : public IStrategy {

  public:

--- a/codec/processing/src/downsample/downsamplefuncs.cpp

+++ b/codec/processing/src/downsample/downsamplefuncs.cpp

@@ -241,4 +241,15 @@

       uiScaley);

 #endif

+#ifdef HAVE_NEON_AARCH64

+void GeneralBilinearAccurateDownsamplerWrap_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,

+                                                  uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {

+  // The assembly kernel takes the source/destination size ratios as unsigned

+  // fixed-point numbers with 15 fractional bits.

+  const int32_t kiFractionBits = 15;

+  const uint32_t kuiFixedOne = (1 << kiFractionBits);

+  const uint32_t kuiStepX = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiFixedOne);

+  const uint32_t kuiStepY = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiFixedOne);

+  GeneralBilinearAccurateDownsampler_AArch64_neon (pDst, kiDstStride, kiDstWidth, kiDstHeight,

+                                                   pSrc, kiSrcStride, kuiStepX, kuiStepY);

+}

+#endif

 WELSVP_NAMESPACE_END

--- a/codec/processing/targets.mk

+++ b/codec/processing/targets.mk

@@ -40,6 +40,13 @@

 PROCESSING_OBJS += $(PROCESSING_ASM_ARM_SRCS:.S=.$(OBJ))

 endif

+ifeq ($(ASM_ARCH), arm64)

+# AArch64 NEON assembly sources. The list ends with a trailing backslash, so

+# the empty line after it is required to terminate the variable definition.

+PROCESSING_ASM_ARM64_SRCS=\

+	$(PROCESSING_SRCDIR)/src/arm64/down_sample_aarch64_neon.S\

+

+PROCESSING_OBJS += $(PROCESSING_ASM_ARM64_SRCS:.S=.$(OBJ))

+endif

 OBJS += $(PROCESSING_OBJS)

 $(PROCESSING_SRCDIR)/%.$(OBJ): $(PROCESSING_SRCDIR)/%.cpp

 	$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c $(CXX_O) $<

--

⑨