ref: e66cf5369e3f65fb0c259320d158a2f25ceb9021
parent: 9a685722403870e7ef5493cce8a942188c38e4e7
parent: 5cb66bd8a763941dc18ccdc616322d0598995814
author: dongzha <dongzha@cisco.com>
date: Tue Jul 8 06:31:40 EDT 2014
Merge pull request #1091 from zhilwang/arm64-downsample Add arm64 code for downsample.
--- a/codec/processing/build/iOS/processing.xcodeproj/project.pbxproj
+++ b/codec/processing/build/iOS/processing.xcodeproj/project.pbxproj
@@ -11,6 +11,7 @@
4C34067918C5A4AD00DFA14A /* down_sample_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067518C5A4AD00DFA14A /* down_sample_neon.S */; };
4C34067A18C5A4AD00DFA14A /* pixel_sad_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067618C5A4AD00DFA14A /* pixel_sad_neon.S */; };
4C34067B18C5A4AD00DFA14A /* vaa_calc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4C34067718C5A4AD00DFA14A /* vaa_calc_neon.S */; };
+ 4CB64B48196A383F00CBF0A3 /* down_sample_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CB64B47196A383F00CBF0A3 /* down_sample_aarch64_neon.S */; };
4CE4443518B724B60017DF25 /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 4CE4443418B724B60017DF25 /* Foundation.framework */; };
4CE4478B18BC62960017DF25 /* AdaptiveQuantization.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4475D18BC62960017DF25 /* AdaptiveQuantization.cpp */; };
4CE4478F18BC62960017DF25 /* BackgroundDetection.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4476418BC62960017DF25 /* BackgroundDetection.cpp */; };
@@ -49,6 +50,7 @@
4C34067518C5A4AD00DFA14A /* down_sample_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = down_sample_neon.S; sourceTree = "<group>"; };
4C34067618C5A4AD00DFA14A /* pixel_sad_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_sad_neon.S; sourceTree = "<group>"; };
4C34067718C5A4AD00DFA14A /* vaa_calc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = vaa_calc_neon.S; sourceTree = "<group>"; };
+ 4CB64B47196A383F00CBF0A3 /* down_sample_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = down_sample_aarch64_neon.S; path = arm64/down_sample_aarch64_neon.S; sourceTree = "<group>"; };
4CE4443118B724B60017DF25 /* libprocessing.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libprocessing.a; sourceTree = BUILT_PRODUCTS_DIR; };
4CE4443418B724B60017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
4CE4444518B724B60017DF25 /* UIKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = UIKit.framework; path = Library/Frameworks/UIKit.framework; sourceTree = DEVELOPER_DIR; };
@@ -112,6 +114,14 @@
path = arm;
sourceTree = "<group>";
};
+ 4CB64B46196A382800CBF0A3 /* arm64 */ = {
+ isa = PBXGroup;
+ children = (
+ 4CB64B47196A383F00CBF0A3 /* down_sample_aarch64_neon.S */,
+ );
+ name = arm64;
+ sourceTree = "<group>";
+ };
4CE4442818B724B60017DF25 = {
isa = PBXGroup;
children = (
@@ -151,6 +161,7 @@
4CE4475B18BC62960017DF25 /* src */ = {
isa = PBXGroup;
children = (
+ 4CB64B46196A382800CBF0A3 /* arm64 */,
FAC77E9E18F7B09C0038A4E4 /* scrolldetection */,
4C34067318C5A4AD00DFA14A /* arm */,
4CE4475C18BC62960017DF25 /* adaptivequantization */,
@@ -338,6 +349,7 @@
4CE4479B18BC62960017DF25 /* SceneChangeDetection.cpp in Sources */,
4CE4479D18BC62960017DF25 /* vaacalcfuncs.cpp in Sources */,
FAC77EA318F7B09C0038A4E4 /* ScrollDetection.cpp in Sources */,
+ 4CB64B48196A383F00CBF0A3 /* down_sample_aarch64_neon.S in Sources */,
4CE4479818BC62960017DF25 /* downsamplefuncs.cpp in Sources */,
4CE4479418BC62960017DF25 /* ComplexityAnalysis.cpp in Sources */,
4CE4479E18BC62960017DF25 /* vaacalculation.cpp in Sources */,
@@ -427,7 +439,10 @@
DSTROOT = /tmp/processing.dst;
GCC_C_LANGUAGE_STANDARD = "compiler-default";
GCC_OPTIMIZATION_LEVEL = 3;
- "GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*][arch=arm64]" = APPLE_IOS;
+ "GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*][arch=arm64]" = (
+ APPLE_IOS,
+ HAVE_NEON_AARCH64,
+ );
"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*][arch=armv7]" = (
APPLE_IOS,
HAVE_NEON,
@@ -443,6 +458,7 @@
"$(SRCROOT)/../../src/common",
"$(SRCROOT)/../../interface",
"$(SRCROOT)/../../../common/arm",
+ "$(SRCROOT)/../../../common/arm64",
);
IPHONEOS_DEPLOYMENT_TARGET = 6.1;
ONLY_ACTIVE_ARCH = NO;
@@ -461,7 +477,10 @@
CODE_SIGN_IDENTITY = "iPhone Developer";
DSTROOT = /tmp/processing.dst;
GCC_C_LANGUAGE_STANDARD = "compiler-default";
- "GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*][arch=arm64]" = APPLE_IOS;
+ "GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*][arch=arm64]" = (
+ APPLE_IOS,
+ HAVE_NEON_AARCH64,
+ );
"GCC_PREPROCESSOR_DEFINITIONS[sdk=iphoneos*][arch=armv7]" = (
APPLE_IOS,
HAVE_NEON,
@@ -477,6 +496,7 @@
"$(SRCROOT)/../../src/common",
"$(SRCROOT)/../../interface",
"$(SRCROOT)/../../../common/arm",
+ "$(SRCROOT)/../../../common/arm64",
);
IPHONEOS_DEPLOYMENT_TARGET = 6.1;
OTHER_LDFLAGS = "-ObjC";
--- /dev/null
+++ b/codec/processing/src/arm64/down_sample_aarch64_neon.S
@@ -1,0 +1,226 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+
+// 2:1 downsampler in both directions. Every output byte is built from a 2x2
+// group of source bytes: each row pair is first averaged horizontally with
+// rounding, then the two rows are averaged vertically with rounding.
+// Register roles as read from the code below:
+//   x0 = destination pointer     w1 = destination stride
+//   x2 = source pointer          w3 = source stride
+//   w4 = source width in bytes   w5 = source height in rows
+// One inner iteration always consumes 32 source bytes per row and stores 16
+// output bytes, whatever the remaining width is.
+WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearDownsampler_AArch64_neon
+
+ //Initialize the registers
+ mov x6, x2 // x6 = start of the current upper source row
+ mov x8, x0 // x8 = start of the current destination row
+ mov w9, #0 // w9 = source bytes consumed in the current row
+ lsr w5, w5, #1 // w5 = number of output rows (one per source row pair)
+
+ //Save the 16 bytes located right after the last destination row
+ //(presumably because a width that is not a multiple of 32 lets the last row spill into them)
+ smaddl x7, w1, w5, x0 // x7 = x0 + w1 * w5
+ ld1 {v4.16b}, [x7]
+
+ add x7, x2, w3, sxtw // x7 = lower source row of the pair
+ //process one row pair, 32 source bytes per iteration
+comp_ds_bilinear_loop0:
+
+ ld1 {v0.16b, v1.16b}, [x2], #32
+ ld1 {v2.16b, v3.16b}, [x7], #32
+ uaddlp v0.8h, v0.16b // sum horizontally adjacent byte pairs into 16-bit lanes
+ uaddlp v1.8h, v1.16b
+ uaddlp v2.8h, v2.16b
+ uaddlp v3.8h, v3.16b
+ urshr v0.8h, v0.8h, #1 // rounded horizontal average
+ urshr v1.8h, v1.8h, #1
+ urshr v2.8h, v2.8h, #1
+ urshr v3.8h, v3.8h, #1
+ urhadd v0.8h, v0.8h, v2.8h // rounded average of upper and lower row
+ urhadd v1.8h, v1.8h, v3.8h
+ xtn v0.8b, v0.8h // narrow back to bytes
+ xtn v1.8b, v1.8h
+ st1 {v0.8b, v1.8b}, [x0], #16
+ add w9, w9, #32
+
+ cmp w9, w4
+ b.cc comp_ds_bilinear_loop0 // stay in this row while w9 < w4 (unsigned)
+
+ mov w9, #0
+ add x6, x6, w3, sxtw #1 // advance two source rows
+ mov x2, x6
+ add x7, x2, w3, sxtw
+ add x8, x8, w1, sxtw // advance one destination row
+ mov x0, x8
+ sub w5, w5, #1
+
+ cbnz w5, comp_ds_bilinear_loop0 // same label serves as the entry of the next row pair
+
+ //restore the 16 bytes saved at function entry (x0 is back at x0_initial + w1 * rows)
+ st1 {v4.16b}, [x0]
+
+WELS_ASM_AARCH64_FUNC_END
+
+
+// 2:1 downsampler in both directions for widths processed in whole 32-byte
+// groups. The arithmetic per group is the same as in
+// DyadicBilinearDownsampler_AArch64_neon: rounded horizontal average of byte
+// pairs followed by a rounded average of the two rows.
+// Register roles as read from the code below:
+//   x0 = destination pointer     w1 = destination stride
+//   x2 = source pointer          w3 = source stride
+//   w4 = source width in bytes   w5 = source height in rows
+// The inner loop is do-while shaped with w4 / 32 iterations, so w4 must be at
+// least 32; nothing is written beyond w4 / 2 bytes per destination row.
+WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_AArch64_neon
+ sub w9, w3, w4 // w9 = source stride - width: gap from row end to next row start
+ sub w1, w1, w4, lsr #1 // w1 = destination stride - width / 2: gap after each output row
+ lsr w5, w5, #1 // w5 = number of output rows (one per source row pair)
+
+ //process one pair of source rows
+comp_ds_bilinear_w_x32_loop0:
+
+ lsr w6, w4, #5 // w6 = number of 32-byte groups in a row
+ add x7, x2, w3, sxtw // x7 = lower source row of the pair
+ //process 32 source bytes of both rows
+comp_ds_bilinear_w_x32_loop1:
+
+ ld1 {v0.16b, v1.16b}, [x2], #32
+ ld1 {v2.16b, v3.16b}, [x7], #32
+ uaddlp v0.8h, v0.16b // sum horizontally adjacent byte pairs into 16-bit lanes
+ uaddlp v1.8h, v1.16b
+ uaddlp v2.8h, v2.16b
+ uaddlp v3.8h, v3.16b
+ urshr v0.8h, v0.8h, #1 // rounded horizontal average
+ urshr v1.8h, v1.8h, #1
+ urshr v2.8h, v2.8h, #1
+ urshr v3.8h, v3.8h, #1
+ urhadd v0.8h, v0.8h, v2.8h // rounded average of upper and lower row
+ urhadd v1.8h, v1.8h, v3.8h
+ xtn v0.8b, v0.8h // narrow back to bytes
+ xtn v1.8b, v1.8h
+ st1 {v0.8b, v1.8b}, [x0], #16
+
+ sub w6, w6, #1
+ cbnz w6, comp_ds_bilinear_w_x32_loop1
+
+ add x2, x7, w9, sxtw // x7 sits at the end of the lower row; skip the gap to reach the next pair
+ add x0, x0, w1, sxtw // skip the destination gap to reach the next output row
+ sub w5, w5, #1
+ cbnz w5, comp_ds_bilinear_w_x32_loop0
+WELS_ASM_AARCH64_FUNC_END
+
+// Bilinear downsampler for arbitrary ratios, driven by Q15 fixed-point steps.
+// Arguments, in the order of the C prototype:
+//   x0 = pDst         w1 = kiDstStride   w2 = kiDstWidth   w3 = kiDstHeight
+//   x4 = pSrc         w5 = kiSrcStride   w6 = kuiScaleX    w7 = kuiScaleY
+// The 32-bit arguments are only read through their w views or through
+// explicit sxtw/uxtw extensions: the procedure call standard leaves the upper
+// 32 bits of those x registers unspecified on entry.
+// All but the last column of all but the last row are interpolated; the last
+// column and the last row are plain copies of the nearest source byte.
+// The loops are do-while shaped, so kiDstWidth and kiDstHeight must be >= 2.
+WELS_ASM_AARCH64_FUNC_BEGIN GeneralBilinearAccurateDownsampler_AArch64_neon
+ mov w10, #32767
+ and w8, w6, w10 // uinc = low 15 bits of kuiScaleX
+ mov w11, #-1
+ mul w12, w11, w8 // w12 = -uinc
+
+ dup v2.4h, w8
+ dup v0.4h, w12
+ zip1 v0.4h, v0.4h, v2.4h // uinc -uinc uinc -uinc
+
+ and w9, w7, w10 // vinc = low 15 bits of kuiScaleY
+ mul w12, w11, w9 // w12 = -vinc
+
+ dup v2.4h, w9
+ dup v5.4h, w12
+ ins v5.s[1], v2.s[0] // vinc vinc -vinc -vinc
+
+ mov w11, #0x40000000
+ mov w12, #0x3FFF
+ add w11, w11, w12
+ dup v1.2s, w11 //init u 16384 16383 16384 16383
+
+ mov w8, #16384 // w8 = vertical position accumulator, starts at one half in Q15
+ dup v7.4h, w8
+ sub w11, w8, #1
+ dup v2.4h, w11
+ ins v7.s[0], v2.s[0] //init v 16384 16384 16383 16383
+
+ eor v26.16b, v26.16b, v26.16b
+ eor v27.16b, v27.16b, v27.16b
+ sub w1, w1, w2 // w1 = kiDstStride - kiDstWidth, 32-bit on purpose (see header)
+ sub w3, w3, #1 // w3 = number of interpolated rows
+
+_HEIGHT:
+ lsr w11, w8, #15 // integer source row
+ mul w11, w11, w5
+ add x15, x4, w11, sxtw // x15 = upper source row
+ add x12, x15, w5, sxtw // x12 = lower source row
+
+ mov x9, #16384 // x9 = horizontal position accumulator, one half in Q15
+ sub w10, w2, #1 // w10 = number of interpolated columns
+ orr v6.8b, v1.8b, v1.8b // reset the horizontal weights for the new row
+
+_WIDTH:
+ lsr x13, x9, #15 // integer source column
+ add x14, x15, x13
+ ld2 {v26.b, v27.b}[0], [x14] //q14: 0000000b0000000a;
+ add x14, x12, x13
+ ld2 {v26.b, v27.b}[4], [x14] //q14: 000d000b000c000a;
+ zip1 v28.2s, v26.2s, v27.2s // v28 = the two bytes of the upper row
+ zip2 v29.2s, v26.2s, v27.2s // v29 = the two bytes of the lower row
+
+ umull v20.4s, v6.4h, v7.4h // four products of horizontal and vertical weights
+ umull v21.2d, v28.2s, v20.2s
+ ins v20.d[0], v20.d[1]
+ umlal v21.2d, v29.2s, v20.2s
+
+ addp d21, v21.2d // sum of the four weighted samples
+ urshr d21, d21, #30 // rounded shift back from Q30 to a byte value
+
+ st1 {v21.b}[0], [x0], #1
+ add x9, x9, w6, uxtw // kuiScaleX is unsigned 32-bit: zero-extend explicitly
+ add v6.4h, v6.4h, v0.4h
+ shl v6.4h, v6.4h, #1 // shl/ushr pair clears bit 15: weights stay modulo 32768
+ ushr v6.4h, v6.4h, #1
+ sub w10, w10, #1
+ cbnz w10, _WIDTH
+
+WIDTH_END:
+ lsr x9, x9, #15
+ add x14, x15, x9
+ ld1 {v21.b}[0], [x14] // last column: copy the source byte of the upper row
+ st1 {v21.b}[0], [x0], #1
+ add w8, w8, w7
+ add x0, x0, w1, sxtw // skip the destination row padding
+ add v7.4h, v7.4h, v5.4h
+ shl v7.4h, v7.4h, #1 // keep the vertical weights modulo 32768
+ ushr v7.4h, v7.4h, #1
+ sub w3, w3, #1
+ cbnz w3, _HEIGHT
+
+LAST_ROW:
+ lsr w8, w8, #15
+ mul w8, w8, w5
+ add x4, x4, w8, sxtw // x4 = source row used for the last destination row
+ mov x9, #16384
+
+_LAST_ROW_WIDTH:
+ mov x11, x9
+ lsr x11, x11, #15
+ add x3, x4, x11
+ ld1 {v21.b}[0], [x3] // last row: copy without interpolation
+ st1 {v21.b}[0], [x0], #1
+ add x9, x9, w6, uxtw
+ sub w2, w2, #1
+ cbnz w2, _LAST_ROW_WIDTH
+
+WELS_ASM_AARCH64_FUNC_END
+
+#endif
\ No newline at end of file
--- a/codec/processing/src/downsample/downsample.cpp
+++ b/codec/processing/src/downsample/downsample.cpp
@@ -85,6 +85,17 @@
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearAccurateDownsamplerWrap_neon;
}
#endif
+
+#if defined(HAVE_NEON_AARCH64)
+ if (iCpuFlag & WELS_CPU_NEON) {
+ sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_AArch64_neon;
+ sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_AArch64_neon;
+ sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_AArch64_neon;
+ sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_AArch64_neon;
+ sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
+ sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
+ }
+#endif
}
EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDstPixMap) {
--- a/codec/processing/src/downsample/downsample.h
+++ b/codec/processing/src/downsample/downsample.h
@@ -120,6 +120,21 @@
WELSVP_EXTERN_C_END
#endif
+#ifdef HAVE_NEON_AARCH64
+WELSVP_EXTERN_C_BEGIN
+// AArch64 NEON downsamplers, selected in downsample.cpp when HAVE_NEON_AARCH64 is defined.
+// Half-size downsampler: no restriction on iSrcWidth
+HalveDownsampleFunc DyadicBilinearDownsampler_AArch64_neon;
+// Half-size downsampler: iSrcWidth must be a multiple of 32 pixels
+HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_AArch64_neon;
+
+// Wrapper that turns the source/destination sizes into scale factors and calls the kernel below
+GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
+
+// Arbitrary-ratio bilinear kernel; kuiScaleX / kuiScaleY are the source/destination
+// size ratios in fixed point with 15 fractional bits, as computed by the wrapper
+void GeneralBilinearAccurateDownsampler_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
+ uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
+
+WELSVP_EXTERN_C_END
+#endif
+
class CDownsampling : public IStrategy {
public:
--- a/codec/processing/src/downsample/downsamplefuncs.cpp
+++ b/codec/processing/src/downsample/downsamplefuncs.cpp
@@ -241,4 +241,15 @@
uiScaley);
}
#endif
+
+#ifdef HAVE_NEON_AARCH64
+// Derives the horizontal and vertical source/destination ratios as fixed-point
+// values with 15 fractional bits and hands them to the AArch64 NEON kernel.
+void GeneralBilinearAccurateDownsamplerWrap_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
+ uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+ const int32_t kiFracBits = 15;
+ const uint32_t kuiOne = (1 << kiFracBits);
+ // ratio * 2^15, rounded by WELS_ROUND
+ const uint32_t kuiStepX = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiOne);
+ const uint32_t kuiStepY = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiOne);
+ GeneralBilinearAccurateDownsampler_AArch64_neon (pDst, kiDstStride, kiDstWidth, kiDstHeight,
+ pSrc, kiSrcStride, kuiStepX, kuiStepY);
+}
+#endif
WELSVP_NAMESPACE_END
--- a/codec/processing/targets.mk
+++ b/codec/processing/targets.mk
@@ -40,6 +40,13 @@
PROCESSING_OBJS += $(PROCESSING_ASM_ARM_SRCS:.S=.$(OBJ))
endif
+ifeq ($(ASM_ARCH), arm64)
+# AArch64 NEON assembly sources; their objects are added only when ASM_ARCH is arm64
+PROCESSING_ASM_ARM64_SRCS=\
+ $(PROCESSING_SRCDIR)/src/arm64/down_sample_aarch64_neon.S\
+
+PROCESSING_OBJS += $(PROCESSING_ASM_ARM64_SRCS:.S=.$(OBJ))
+endif
+
OBJS += $(PROCESSING_OBJS)
$(PROCESSING_SRCDIR)/%.$(OBJ): $(PROCESSING_SRCDIR)/%.cpp
$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c $(CXX_O) $<