shithub: openh264

--- a/Makefile

+++ b/Makefile

@@ -89,6 +89,7 @@

 PROCESSING_INCLUDES += \

     -I$(SRC_PATH)codec/processing/interface \

     -I$(SRC_PATH)codec/processing/src/common \

+    -I$(SRC_PATH)codec/processing/src/adaptivequantization \

     -I$(SRC_PATH)codec/processing/src/scrolldetection

 GTEST_INCLUDES += \

--- a/codec/build/iOS/processing/processing.xcodeproj/project.pbxproj

+++ b/codec/build/iOS/processing/processing.xcodeproj/project.pbxproj

@@ -31,6 +31,7 @@

 		549947F2196A3FB400BA3D87 /* ScrollDetectionFuncs.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947D5196A3FB400BA3D87 /* ScrollDetectionFuncs.cpp */; };

 		549947F3196A3FB400BA3D87 /* vaacalcfuncs.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947D8196A3FB400BA3D87 /* vaacalcfuncs.cpp */; };

 		549947F4196A3FB400BA3D87 /* vaacalculation.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947D9196A3FB400BA3D87 /* vaacalculation.cpp */; };

+		6C749B78197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 6C749B77197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S */; };

 /* End PBXBuildFile section */

 /* Begin PBXCopyFilesBuildPhase section */

@@ -91,6 +92,7 @@

 		549947D8196A3FB400BA3D87 /* vaacalcfuncs.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = vaacalcfuncs.cpp; sourceTree = "<group>"; };

 		549947D9196A3FB400BA3D87 /* vaacalculation.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = vaacalculation.cpp; sourceTree = "<group>"; };

 		549947DA196A3FB400BA3D87 /* vaacalculation.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = vaacalculation.h; sourceTree = "<group>"; };

+		6C749B77197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = adaptive_quantization_aarch64_neon.S; path = arm64/adaptive_quantization_aarch64_neon.S; sourceTree = "<group>"; };

 /* End PBXFileReference section */

 /* Begin PBXFrameworksBuildPhase section */

@@ -108,6 +110,7 @@

 		4CC6094D197E008B00BE8B8B /* arm64 */ = {

 			isa = PBXGroup;

 			children = (

+				6C749B77197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S */,

 				4CC6094E197E009D00BE8B8B /* down_sample_aarch64_neon.S */,

);

 			name = arm64;

@@ -337,6 +340,7 @@

 			isa = PBXSourcesBuildPhase;

 			buildActionMask = 2147483647;

 			files = (

+				6C749B78197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S in Sources */,

 				549947F4196A3FB400BA3D87 /* vaacalculation.cpp in Sources */,

 				549947E9196A3FB400BA3D87 /* ComplexityAnalysis.cpp in Sources */,

 				549947E3196A3FB400BA3D87 /* vaa_calc_neon.S in Sources */,

--- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp

+++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp

@@ -235,6 +235,11 @@

     pfVar = SampleVariance16x16_neon;

 #endif

+#ifdef HAVE_NEON_AARCH64

+  if (iCpuFlag & WELS_CPU_NEON) {

+    pfVar = SampleVariance16x16_AArch64_neon;

+  }

+#endif

 void SampleVariance16x16_c (uint8_t* pRefY, int32_t iRefStride, uint8_t* pSrcY, int32_t iSrcStride,

--- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.h

+++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.h

@@ -68,6 +68,12 @@

 WELSVP_EXTERN_C_END

 #endif

+#ifdef HAVE_NEON_AARCH64

+WELSVP_EXTERN_C_BEGIN

+VarFunc      SampleVariance16x16_AArch64_neon;

+WELSVP_EXTERN_C_END

+#endif

 class CAdaptiveQuantization : public IStrategy {

  public:

   CAdaptiveQuantization (int32_t iCpuFlag);

--- /dev/null

+++ b/codec/processing/src/arm64/adaptive_quantization_aarch64_neon.S

@@ -1,0 +1,88 @@

+/*!

+ * \copy

+ *     Copyright (c)  2013, Cisco Systems

+ *     All rights reserved.

+ *

+ *     Redistribution and use in source and binary forms, with or without

+ *     modification, are permitted provided that the following conditions

+ *     are met:

+ *

+ *        * Redistributions of source code must retain the above copyright

+ *          notice, this list of conditions and the following disclaimer.

+ *

+ *        * Redistributions in binary form must reproduce the above copyright

+ *          notice, this list of conditions and the following disclaimer in

+ *          the documentation and/or other materials provided with the

+ *          distribution.

+ *

+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS

+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER

+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN

+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

+ *     POSSIBILITY OF SUCH DAMAGE.

+ *

+ */

+#ifdef HAVE_NEON_AARCH64

+.text

+#include "arm_arch64_common_macro.S"

+WELS_ASM_AARCH64_FUNC_BEGIN SampleVariance16x16_AArch64_neon

+    ld1   {v1.16b}, [x0], x1 //save the ref data (16bytes)

+    ld1   {v0.16b}, [x2], x3 //save the src data (16bytes)

+    uabd  v2.16b, v0.16b, v1.16b

+    umull  v3.8h, v2.8b, v2.8b

+    umull2 v4.8h, v2.16b, v2.16b

+    uaddlp v4.4s, v4.8h

+    uadalp v4.4s, v3.8h //sqr

+    uaddlp v2.8h, v2.16b //sum

+    uaddlp v1.8h, v0.16b //sum_cur

+    umull  v3.8h, v0.8b, v0.8b

+    umull2 v5.8h, v0.16b, v0.16b

+    uaddlp v3.4s, v3.8h

+    uadalp v3.4s, v5.8h //sqr_cur

+.rept 15

+    ld1   {v5.16b}, [x0], x1 //save the ref data (16bytes)

+    ld1   {v0.16b}, [x2], x3 //save the src data (16bytes)

+    uabd  v6.16b, v0.16b, v5.16b

+    //v1 save sum_cur

+    uadalp v1.8h, v0.16b

+    //v4 save sqr

+    umull  v5.8h, v6.8b, v6.8b

+    umull2 v7.8h, v6.16b, v6.16b

+    uadalp v4.4s, v5.8h //sqr

+    uadalp v4.4s, v7.8h //sqr

+    //v2 save sum

+    uadalp v2.8h, v6.16b

+    //v3 save sqr_cur

+    umull  v5.8h, v0.8b, v0.8b

+    umull2 v7.8h, v0.16b, v0.16b

+    uadalp v3.4s, v5.8h //sqr_cur

+    uadalp v3.4s, v7.8h //sqr_cur

+.endr

+    uaddlv s2, v2.8h //sum

+    uaddlv s1, v1.8h //sum_cur

+    ins    v2.s[1], v1.s[0] // sum, sum_cur

+    shrn   v2.4h, v2.4s, #8 // sum, sum_cur>>8

+    mul    v2.4h, v2.4h, v2.4h//// sum*sum, sum_cur*sum_cur

+    uaddlv d4, v4.4s //sqr

+    uaddlv d3, v3.4s //sqr_cur

+    ins    v4.s[1], v3.s[0] // sqr, sqr_cur

+    shrn   v4.4h, v4.4s, #8 // sqr, sqr_cur>>8

+    sub    v4.4h, v4.4h, v2.4h

+    st1  {v4.s}[0], [x4]

+WELS_ASM_AARCH64_FUNC_END

+#endif

--- a/codec/processing/targets.mk

+++ b/codec/processing/targets.mk

@@ -42,6 +42,7 @@

 ifeq ($(ASM_ARCH), arm64)

 PROCESSING_ASM_ARM64_SRCS=\

+	$(PROCESSING_SRCDIR)/src/arm64/adaptive_quantization_aarch64_neon.S\

 	$(PROCESSING_SRCDIR)/src/arm64/down_sample_aarch64_neon.S\

 PROCESSING_OBJS += $(PROCESSING_ASM_ARM64_SRCS:.S=.$(OBJ))

--- /dev/null

+++ b/test/processing/ProcessUT_AdaptiveQuantization.cpp

@@ -1,0 +1,76 @@

+#include <gtest/gtest.h>

+#include <math.h>

+#include <string.h>

+#include "cpu.h"

+#include "cpu_core.h"

+#include "util.h"

+#include "macros.h"

+#include "IWelsVP.h"

+#include "AdaptiveQuantization.h"

+using namespace nsWelsVP;

+static void FillWithRandomData (uint8_t* p, int32_t Len) {

+  for (int32_t i = 0; i < Len; i++) {

+    p[i] = rand() % 256;

+  }

+}

+void SampleVariance16x16_ref (uint8_t* pRefY, int32_t iRefStride, uint8_t* pSrcY, int32_t iSrcStride,

+                              SMotionTextureUnit* pMotionTexture) {

+  uint32_t uiCurSquare = 0,  uiSquare = 0;

+  uint16_t uiCurSum = 0,  uiSum = 0;

+  for (int32_t y = 0; y < MB_WIDTH_LUMA; y++) {

+    for (int32_t x = 0; x < MB_WIDTH_LUMA; x++) {

+      uint32_t uiDiff = WELS_ABS (pRefY[x] - pSrcY[x]);

+      uiSum += uiDiff;

+      uiSquare += uiDiff * uiDiff;

+      uiCurSum += pSrcY[x];

+      uiCurSquare += pSrcY[x] * pSrcY[x];

+    }

+    pRefY += iRefStride;

+    pSrcY += iSrcStride;

+  }

+  uiSum = uiSum >> 8;

+  pMotionTexture->uiMotionIndex = (uiSquare >> 8) - (uiSum * uiSum);

+  uiCurSum = uiCurSum >> 8;

+  pMotionTexture->uiTextureIndex = (uiCurSquare >> 8) - (uiCurSum * uiCurSum);

+}

+#define GENERATE_AQTEST(method) \

+TEST (AdaptiveQuantization, method) {\

+  ENFORCE_STACK_ALIGN_1D (uint8_t, pRefY,32*16,16)\

+  ENFORCE_STACK_ALIGN_1D (uint8_t, pSrcY,48*16,16)\

+  SMotionTextureUnit pMotionTexture[2];\

+  FillWithRandomData (pRefY,32*16);\

+  FillWithRandomData (pSrcY,48*16);\

+  SampleVariance16x16_ref (pRefY,32,pSrcY,48,&pMotionTexture[0]);\

+  method(pRefY,32,pSrcY,48,&pMotionTexture[1]);\

+  ASSERT_EQ(pMotionTexture[0].uiMotionIndex,pMotionTexture[1].uiMotionIndex);\

+  ASSERT_EQ(pMotionTexture[0].uiMotionIndex,pMotionTexture[1].uiMotionIndex);\

+  memset (pRefY,0,32*16);\

+  memset (pSrcY,255,48*16);\

+  SampleVariance16x16_ref (pRefY,32,pSrcY,48,&pMotionTexture[0]);\

+  method(pRefY,32,pSrcY,48,&pMotionTexture[1]);\

+  ASSERT_EQ(pMotionTexture[0].uiMotionIndex,pMotionTexture[1].uiMotionIndex);\

+  ASSERT_EQ(pMotionTexture[0].uiMotionIndex,pMotionTexture[1].uiMotionIndex);\

+}

+GENERATE_AQTEST (SampleVariance16x16_c)

+#if defined(X86_ASM)

+GENERATE_AQTEST (SampleVariance16x16_sse2)

+#endif

+#if defined(HAVE_NEON)

+GENERATE_AQTEST (SampleVariance16x16_neon)

+#endif

+#if defined(HAVE_NEON_AARCH64)

+GENERATE_AQTEST (SampleVariance16x16_AArch64_neon)

+#endif

--- a/test/processing/targets.mk

+++ b/test/processing/targets.mk

@@ -1,5 +1,6 @@

 PROCESSING_UNITTEST_SRCDIR=test/processing

 PROCESSING_UNITTEST_CPP_SRCS=\

+	$(PROCESSING_UNITTEST_SRCDIR)/ProcessUT_AdaptiveQuantization.cpp\

 	$(PROCESSING_UNITTEST_SRCDIR)/ProcessUT_ScrollDetection.cpp\

 PROCESSING_UNITTEST_OBJS += $(PROCESSING_UNITTEST_CPP_SRCS:.cpp=.$(OBJ))

--

⑨