shithub: openh264

Download patch

ref: 5cb66bd8a763941dc18ccdc616322d0598995814
parent: f638546d81c93fbddcc0323a5dff548885c00324
author: zhiliang wang <zhilwang@cisco.com>
date: Mon Jul 7 06:25:23 EDT 2014

Add arm64 code for downsample.

--- a/codec/processing/build/iOS/processing.xcodeproj/project.pbxproj
+++ b/codec/processing/build/iOS/processing.xcodeproj/project.pbxproj
@@ -11,6 +11,7 @@
 		4C34067918C5A4AD00DFA14A /* down_sample_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067518C5A4AD00DFA14A /* down_sample_neon.S */; };
 		4C34067A18C5A4AD00DFA14A /* pixel_sad_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067618C5A4AD00DFA14A /* pixel_sad_neon.S */; };
 		4C34067B18C5A4AD00DFA14A /* vaa_calc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067718C5A4AD00DFA14A /* vaa_calc_neon.S */; };
+		4CB64B48196A383F00CBF0A3 /* down_sample_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CB64B47196A383F00CBF0A3 /* down_sample_aarch64_neon.S */; };
 		4CE4443518B724B60017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4443418B724B60017DF25 /* Foundation.framework */; };
 		4CE4478B18BC62960017DF25 /* AdaptiveQuantization.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4475D18BC62960017DF25 /* AdaptiveQuantization.cpp */; };
 		4CE4478F18BC62960017DF25 /* BackgroundDetection.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4476418BC62960017DF25 /* BackgroundDetection.cpp */; };
@@ -49,6 +50,7 @@
 		4C34067518C5A4AD00DFA14A /* down_sample_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = down_sample_neon.S; sourceTree = "<group>"; };
 		4C34067618C5A4AD00DFA14A /* pixel_sad_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_sad_neon.S; sourceTree = "<group>"; };
 		4C34067718C5A4AD00DFA14A /* vaa_calc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = vaa_calc_neon.S; sourceTree = "<group>"; };
+		4CB64B47196A383F00CBF0A3 /* down_sample_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = down_sample_aarch64_neon.S; path = arm64/down_sample_aarch64_neon.S; sourceTree = "<group>"; };
 		4CE4443118B724B60017DF25 /* libprocessing.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libprocessing.a; sourceTree = BUILT_PRODUCTS_DIR; };
 		4CE4443418B724B60017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
 		4CE4444518B724B60017DF25 /* UIKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = UIKit.framework; path = Library/Frameworks/UIKit.framework; sourceTree = DEVELOPER_DIR; };
@@ -112,6 +114,14 @@
 			path = arm;
 			sourceTree = "<group>";
 		};
+		4CB64B46196A382800CBF0A3 /* arm64 */ = {
+			isa = PBXGroup;
+			children = (
+				4CB64B47196A383F00CBF0A3 /* down_sample_aarch64_neon.S */,
+			);
+			name = arm64;
+			sourceTree = "<group>";
+		};
 		4CE4442818B724B60017DF25 = {
 			isa = PBXGroup;
 			children = (
@@ -151,6 +161,7 @@
 		4CE4475B18BC62960017DF25 /* src */ = {
 			isa = PBXGroup;
 			children = (
+				4CB64B46196A382800CBF0A3 /* arm64 */,
 				FAC77E9E18F7B09C0038A4E4 /* scrolldetection */,
 				4C34067318C5A4AD00DFA14A /* arm */,
 				4CE4475C18BC62960017DF25 /* adaptivequantization */,
@@ -338,6 +349,7 @@
 				4CE4479B18BC62960017DF25 /* SceneChangeDetection.cpp in Sources */,
 				4CE4479D18BC62960017DF25 /* vaacalcfuncs.cpp in Sources */,
 				FAC77EA318F7B09C0038A4E4 /* ScrollDetection.cpp in Sources */,
+				4CB64B48196A383F00CBF0A3 /* down_sample_aarch64_neon.S in Sources */,
 				4CE4479818BC62960017DF25 /* downsamplefuncs.cpp in Sources */,
 				4CE4479418BC62960017DF25 /* ComplexityAnalysis.cpp in Sources */,
 				4CE4479E18BC62960017DF25 /* vaacalculation.cpp in Sources */,
@@ -427,7 +439,10 @@
 				DSTROOT = /tmp/processing.dst;
 				GCC_C_LANGUAGE_STANDARD = "compiler-default";
 				GCC_OPTIMIZATION_LEVEL = 3;
-				"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*][arch=arm64]" = APPLE_IOS;
+				"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*][arch=arm64]" = (
+					APPLE_IOS,
+					HAVE_NEON_AARCH64,
+				);
 				"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*][arch=armv7]" = (
 					APPLE_IOS,
 					HAVE_NEON,
@@ -443,6 +458,7 @@
 					"$(SRCROOT)/../../src/common",
 					"$(SRCROOT)/../../interface",
 					"$(SRCROOT)/../../../common/arm",
+					"$(SRCROOT)/../../../common/arm64",
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 6.1;
 				ONLY_ACTIVE_ARCH = NO;
@@ -461,7 +477,10 @@
 				CODE_SIGN_IDENTITY = "iPhone Developer";
 				DSTROOT = /tmp/processing.dst;
 				GCC_C_LANGUAGE_STANDARD = "compiler-default";
-				"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*][arch=arm64]" = APPLE_IOS;
+				"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*][arch=arm64]" = (
+					APPLE_IOS,
+					HAVE_NEON_AARCH64,
+				);
 				"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*][arch=armv7]" = (
 					APPLE_IOS,
 					HAVE_NEON,
@@ -477,6 +496,7 @@
 					"$(SRCROOT)/../../src/common",
 					"$(SRCROOT)/../../interface",
 					"$(SRCROOT)/../../../common/arm",
+					"$(SRCROOT)/../../../common/arm64",
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 6.1;
 				OTHER_LDFLAGS = "-ObjC";
--- /dev/null
+++ b/codec/processing/src/arm64/down_sample_aarch64_neon.S
@@ -1,0 +1,226 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+
+WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearDownsampler_AArch64_neon
+    // Halves width and height: every output byte is a rounded average over a 2x2 block of source bytes.
+    // Registers as used below: x0 = dst, w1 = dst stride, x2 = src, w3 = src stride, w4 = src width, w5 = src height (presumably the HalveDownsampleFunc argument order; TODO confirm against the typedef).
+    mov x6, x2                                // x6 = start of the current source row pair
+    mov x8, x0                                // x8 = start of the current destination row
+    mov w9, #0                                // w9 = source bytes consumed in the current row
+    lsr w5, w5, #1                            // w5 = number of output rows (source rows / 2)
+
+    // Save the 16 bytes located at dst + dst stride * output rows; they are written back after the loops.
+    smaddl  x7, w1, w5, x0                    // x7 = x0 + w1 * w5 (signed 32x32 -> 64 bit multiply-add)
+    ld1 {v4.16b}, [x7]
+    // NOTE(review): presumably this protects memory behind the last row from the 16-byte stores when the width is not a multiple of 32; confirm.
+    add x7, x2, w3, sxtw                      // x7 = second source row of the pair
+    // Each pass: 32 source bytes from each of the two rows -> 16 output bytes.
+comp_ds_bilinear_loop0:
+
+    ld1     {v0.16b, v1.16b}, [x2], #32       // upper row, post-incremented
+    ld1     {v2.16b, v3.16b}, [x7], #32       // lower row, post-incremented
+    uaddlp  v0.8h, v0.16b                     // sum horizontally adjacent byte pairs into 16-bit lanes
+    uaddlp  v1.8h, v1.16b
+    uaddlp  v2.8h, v2.16b
+    uaddlp  v3.8h, v3.16b
+    urshr   v0.8h, v0.8h, #1                  // (sum + 1) >> 1: rounded horizontal average
+    urshr   v1.8h, v1.8h, #1
+    urshr   v2.8h, v2.8h, #1
+    urshr   v3.8h, v3.8h, #1
+    urhadd  v0.8h, v0.8h, v2.8h               // (upper + lower + 1) >> 1: rounded vertical average
+    urhadd  v1.8h, v1.8h, v3.8h
+    xtn     v0.8b, v0.8h                      // narrow the 16-bit results back to bytes
+    xtn     v1.8b, v1.8h
+    st1     {v0.8b, v1.8b}, [x0], #16
+    add     w9, w9, #32
+
+    cmp     w9, w4
+    b.cc    comp_ds_bilinear_loop0            // unsigned w9 < w4: more columns left in this row pair
+
+    mov     w9, #0
+    add     x6, x6, w3, sxtw #1               // advance the source base by two rows (stride * 2)
+    mov     x2, x6
+    add     x7, x2, w3, sxtw
+    add     x8, x8, w1, sxtw                  // advance the destination base by one row
+    mov     x0, x8
+    sub     w5, w5, #1
+
+    cbnz    w5, comp_ds_bilinear_loop0        // next row pair
+
+    // Write back the 16 bytes saved before the loops (x0 now equals dst + dst stride * output rows).
+    st1     {v4.16b}, [x0]
+
+WELS_ASM_AARCH64_FUNC_END
+
+// 2:1 downsample for widths that are a multiple of 32: the inner loop runs (w4 >> 5) times per row pair, so a remainder is never processed.
+WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_AArch64_neon
+    sub     w9, w3, w4                        // w9 = w3 - w4, used below to step from the end of one row to the start of the next
+    sub     w1, w1, w4, lsr #1                // w1 = w1 - (w4 / 2): gap between the end of an output row and the next one
+    lsr     w5, w5, #1                        // w5 = number of row pairs to process
+    // Registers as used here: x0 = dst, w1 = dst stride, x2 = src, w3 = src stride, w4 = src width, w5 = src height (presumably the HalveDownsampleFunc argument order; TODO confirm).
+    // Outer loop: one pair of source rows (one output row) per iteration.
+comp_ds_bilinear_w_x32_loop0:
+
+    lsr     w6, w4, #5                        // w6 = number of 32-byte chunks in a row
+    add     x7, x2, w3, sxtw                  // x7 = second source row of the pair
+    // Inner loop: 32 source bytes from each of the two rows -> 16 output bytes.
+comp_ds_bilinear_w_x32_loop1:
+
+    ld1     {v0.16b, v1.16b}, [x2], #32       // upper row, post-incremented
+    ld1     {v2.16b, v3.16b}, [x7], #32       // lower row, post-incremented
+    uaddlp  v0.8h, v0.16b                     // sum horizontally adjacent byte pairs into 16-bit lanes
+    uaddlp  v1.8h, v1.16b
+    uaddlp  v2.8h, v2.16b
+    uaddlp  v3.8h, v3.16b
+    urshr   v0.8h, v0.8h, #1                  // (sum + 1) >> 1: rounded horizontal average
+    urshr   v1.8h, v1.8h, #1
+    urshr   v2.8h, v2.8h, #1
+    urshr   v3.8h, v3.8h, #1
+    urhadd  v0.8h, v0.8h, v2.8h               // (upper + lower + 1) >> 1: rounded vertical average
+    urhadd  v1.8h, v1.8h, v3.8h
+    xtn     v0.8b, v0.8h                      // narrow the 16-bit results back to bytes
+    xtn     v1.8b, v1.8h
+    st1     {v0.8b, v1.8b}, [x0], #16
+
+    sub     w6, w6, #1
+    cbnz    w6, comp_ds_bilinear_w_x32_loop1
+
+    add     x2, x7, w9, sxtw                  // x7 sits at the end of the lower row; adding w9 lands on the start of the next row pair
+    add     x0, x0, w1, sxtw                  // move to the start of the next output row
+    sub     w5, w5, #1
+    cbnz    w5, comp_ds_bilinear_w_x32_loop0
+WELS_ASM_AARCH64_FUNC_END
+// Bilinear downsample by an arbitrary ratio. x0 = pDst, w1 = kiDstStride, w2 = kiDstWidth, w3 = kiDstHeight, x4 = pSrc, w5 = kiSrcStride, w6 = kuiScaleX, w7 = kuiScaleY (Q15 source steps per output pixel).
+WELS_ASM_AARCH64_FUNC_BEGIN GeneralBilinearAccurateDownsampler_AArch64_neon
+    mov     w10, #32767
+    and     w8, w6, w10                     // uinc = low 15 (fractional) bits of the x step
+    mov     w11, #-1
+    mul     w12, w11, w8                    // w12 = -uinc
+    // All 32-bit arguments are accessed through their w views only: the upper halves of x1, x2, x3 and x6 are unspecified on entry.
+    dup     v2.4h, w8
+    dup     v0.4h, w12
+    zip1    v0.4h, v0.4h, v2.4h     // lanes high to low: uinc -uinc uinc -uinc
+
+    and     w9, w7, w10                     // vinc = low 15 (fractional) bits of the y step
+    mul     w12, w11, w9                    // w12 = -vinc
+
+    dup     v2.4h, w9
+    dup     v5.4h, w12
+    ins     v5.s[1], v2.s[0]        // lanes high to low: vinc vinc -vinc -vinc
+
+    mov     w11, #0x40000000
+    mov     w12, #0x3FFF
+    add     w11, w11, w12
+    dup     v1.2s, w11              // initial u weights, lanes high to low: 16384 16383 16384 16383
+
+    mov     w8, #16384              // w8 = y position in Q15, starting at one half
+    dup     v7.4h, w8
+    sub     w11, w8, #1
+    dup     v2.4h, w11
+    ins     v7.s[0], v2.s[0]        // initial v weights, lanes high to low: 16384 16384 16383 16383
+
+    eor     v26.16b, v26.16b, v26.16b
+    eor     v27.16b, v27.16b, v27.16b
+    sub     w1, w1, w2              // w1 = dst stride - dst width: bytes to skip after each output row
+    sub     w3, w3, #1              // rows handled by the interpolating loop; the final row is copied in LAST_ROW
+    // NOTE(review): both counters are decremented before being tested, so kiDstWidth and kiDstHeight of at least 2 appear to be assumed; confirm with callers.
+_HEIGHT:
+    lsr     w11, w8, #15            // integer part of the y position
+    mul     w11, w11, w5
+    add     x15, x4, w11, sxtw      // x15 = upper source row
+    add     x12, x15, w5, sxtw      // x12 = lower source row
+
+    mov     x9, #16384              // x9 = x position in Q15, starting at one half
+    sub     w10, w2, #1             // interpolate all but the last pixel of the row
+    orr     v6.8b, v1.8b, v1.8b     // v6 = working copy of the initial u weights
+
+_WIDTH:
+    lsr     x13, x9, #15            // integer part of the x position
+    add     x14, x15, x13
+    ld2     {v26.b, v27.b}[0], [x14]  // upper row: left pixel -> v26.b[0], right pixel -> v27.b[0]
+    add     x14, x12, x13
+    ld2     {v26.b, v27.b}[4], [x14]  // lower row: left pixel -> v26.b[4], right pixel -> v27.b[4]
+    zip1    v28.2s, v26.2s, v27.2s    // v28 = upper left, upper right
+    zip2    v29.2s, v26.2s, v27.2s    // v29 = lower left, lower right
+
+    umull   v20.4s, v6.4h, v7.4h      // four combined weights: u weight * v weight per corner
+    umull   v21.2d, v28.2s, v20.2s    // upper pixels * their weights
+    ins     v20.d[0], v20.d[1]
+    umlal   v21.2d, v29.2s, v20.2s    // accumulate lower pixels * their weights
+
+    addp    d21, v21.2d               // sum of the four weighted pixels
+    urshr   d21, d21, #30             // drop the 2 * 15 fractional bits with rounding
+
+    st1     {v21.b}[0], [x0], #1
+    add     x9, x9, w6, uxtw          // advance x by the zero-extended 32-bit step
+    add     v6.4h, v6.4h, v0.4h
+    shl     v6.4h, v6.4h, #1          // shl + ushr clear bit 15, keeping each weight in 15 bits
+    ushr    v6.4h, v6.4h, #1
+    sub     w10, w10, #1
+    cbnz    w10, _WIDTH
+    // Last pixel of the row: copied from the source pixel at the integer x position, no interpolation.
+WIDTH_END:
+    lsr     x9, x9, #15
+    add     x14, x15, x9
+    ld1     {v21.b}[0], [x14]
+    st1     {v21.b}[0], [x0], #1
+    add     w8, w8, w7                // advance y by the y step
+    add     x0, x0, w1, sxtw          // skip the destination row padding
+    add     v7.4h, v7.4h, v5.4h
+    shl     v7.4h, v7.4h, #1          // keep each v weight in 15 bits
+    ushr    v7.4h, v7.4h, #1
+    sub     w3, w3, #1
+    cbnz    w3, _HEIGHT
+    // Final output row: every pixel is copied from the source pixel at the integer x position, no interpolation.
+LAST_ROW:
+    lsr     w8, w8, #15
+    mul     w8, w8, w5
+    add     x4, x4, w8, sxtw          // x4 = source row for the final output row
+    mov     x9, #16384
+
+_LAST_ROW_WIDTH:
+    mov     x11, x9
+    lsr     x11, x11, #15
+    add     x3, x4, x11               // x3 is free to reuse as scratch here: the row counter is finished
+    ld1     {v21.b}[0], [x3]
+    st1     {v21.b}[0], [x0], #1
+    add     x9, x9, w6, uxtw          // advance x by the zero-extended 32-bit step
+    sub     w2, w2, #1
+    cbnz    w2, _LAST_ROW_WIDTH
+
+WELS_ASM_AARCH64_FUNC_END
+
+#endif
\ No newline at end of file
--- a/codec/processing/src/downsample/downsample.cpp
+++ b/codec/processing/src/downsample/downsample.cpp
@@ -85,6 +85,17 @@
     sDownsampleFunc.pfGeneralRatioLuma	 = GeneralBilinearAccurateDownsamplerWrap_neon;
   }
 #endif
+
+#if defined(HAVE_NEON_AARCH64)
+  if (iCpuFlag & WELS_CPU_NEON) {
+    sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_AArch64_neon;
+    sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_AArch64_neon;
+    sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_AArch64_neon;
+    sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_AArch64_neon;
+    sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
+    sDownsampleFunc.pfGeneralRatioLuma	 = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
+  }
+#endif
 }
 
 EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDstPixMap) {
--- a/codec/processing/src/downsample/downsample.h
+++ b/codec/processing/src/downsample/downsample.h
@@ -120,6 +120,21 @@
 WELSVP_EXTERN_C_END
 #endif
 
+#ifdef HAVE_NEON_AARCH64
+WELSVP_EXTERN_C_BEGIN
+// 2:1 downsampler without a restriction on iSrcWidth
+HalveDownsampleFunc		DyadicBilinearDownsampler_AArch64_neon;
+// 2:1 downsampler for iSrcWidth that is a multiple of 32 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx32_AArch64_neon;
+// Derives the Q15 scale factors from the source and destination sizes, then calls the assembly kernel declared below
+GeneralDownsampleFunc   GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
+// Assembly kernel: kuiScaleX / kuiScaleY are source steps per output pixel in Q15 fixed point (1.0 == 1 << 15)
+void GeneralBilinearAccurateDownsampler_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
+                                                      uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
+
+WELSVP_EXTERN_C_END
+#endif
+
 
 class CDownsampling : public IStrategy {
  public:
--- a/codec/processing/src/downsample/downsamplefuncs.cpp
+++ b/codec/processing/src/downsample/downsamplefuncs.cpp
@@ -241,4 +241,15 @@
       uiScaley);
 }
 #endif
+
+#ifdef HAVE_NEON_AARCH64
+void GeneralBilinearAccurateDownsamplerWrap_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
+                                                  uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+  // Express the source/destination size ratios as Q15 fixed-point steps (1.0 == 1 << 15) and hand them to the assembly kernel.
+  const uint32_t kuiQ15One = (1 << 15);
+  const uint32_t kuiStepX = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiQ15One);
+  const uint32_t kuiStepY = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiQ15One);
+  GeneralBilinearAccurateDownsampler_AArch64_neon (pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride, kuiStepX, kuiStepY);
+}
+#endif
 WELSVP_NAMESPACE_END
--- a/codec/processing/targets.mk
+++ b/codec/processing/targets.mk
@@ -40,6 +40,13 @@
 PROCESSING_OBJS += $(PROCESSING_ASM_ARM_SRCS:.S=.$(OBJ))
 endif
 
+ifeq ($(ASM_ARCH), arm64)
+PROCESSING_ASM_ARM64_SRCS=\
+	$(PROCESSING_SRCDIR)/src/arm64/down_sample_aarch64_neon.S\
+
+PROCESSING_OBJS += $(PROCESSING_ASM_ARM64_SRCS:.S=.$(OBJ))
+endif
+
 OBJS += $(PROCESSING_OBJS)
 $(PROCESSING_SRCDIR)/%.$(OBJ): $(PROCESSING_SRCDIR)/%.cpp
 	$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c $(CXX_O) $<