ref: 8945348c87fdf967ec59232f248c0fc05aeeca40
parent: 73d27e9776cbc977a74134782b80b103fe251b08
parent: ed341048de1d4d74af8ba9eefd7f7f2a951035e6
author: huili2 <huili2@cisco.com>
date: Thu Sep 25 11:04:46 EDT 2014
Merge pull request #1385 from ruil2/function enable ARM assembly for SampleSad16x16
--- a/build/gtest-targets.mk
+++ b/build/gtest-targets.mk
@@ -5,6 +5,7 @@
GTEST_OBJS += $(GTEST_CPP_SRCS:.cc=.$(OBJ))
OBJS += $(GTEST_OBJS)
+
$(GTEST_SRCDIR)/%.$(OBJ): $(GTEST_SRCDIR)/%.cc
$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(GTEST_CFLAGS) $(GTEST_INCLUDES) -c $(CXX_O) $<
--- a/codec/build/iOS/common/common.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/common/common.xcodeproj/project.pbxproj
@@ -25,6 +25,9 @@
F5AC94FF193EB7D800F58154 /* deblocking_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5AC94FE193EB7D800F58154 /* deblocking_aarch64_neon.S */; };
F5B8D82D190757290037849A /* mc_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5B8D82C190757290037849A /* mc_aarch64_neon.S */; };
F5BB0BB8196BB5960072D50D /* copy_mb_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5BB0BB7196BB5960072D50D /* copy_mb_aarch64_neon.S */; };
+ F791965419D3B89D00F60C6B /* intra_pred_common_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F791965319D3B89D00F60C6B /* intra_pred_common_aarch64_neon.S */; };
+ F791965619D3B8A600F60C6B /* intra_pred_common_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F791965519D3B8A600F60C6B /* intra_pred_common_neon.S */; };
+ F791965919D3BE2200F60C6B /* intra_pred_common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F791965819D3BE2200F60C6B /* intra_pred_common.cpp */; };
FAABAA1818E9354A00D4186F /* sad_common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FAABAA1718E9354A00D4186F /* sad_common.cpp */; };
/* End PBXBuildFile section */
@@ -74,6 +77,10 @@
F5AC94FE193EB7D800F58154 /* deblocking_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = deblocking_aarch64_neon.S; path = arm64/deblocking_aarch64_neon.S; sourceTree = "<group>"; };
F5B8D82C190757290037849A /* mc_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = mc_aarch64_neon.S; path = arm64/mc_aarch64_neon.S; sourceTree = "<group>"; };
F5BB0BB7196BB5960072D50D /* copy_mb_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = copy_mb_aarch64_neon.S; path = arm64/copy_mb_aarch64_neon.S; sourceTree = "<group>"; };
+ F791965319D3B89D00F60C6B /* intra_pred_common_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = intra_pred_common_aarch64_neon.S; path = arm64/intra_pred_common_aarch64_neon.S; sourceTree = "<group>"; };
+ F791965519D3B8A600F60C6B /* intra_pred_common_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_common_neon.S; sourceTree = "<group>"; };
+ F791965719D3BA9300F60C6B /* intra_pred_common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = intra_pred_common.h; sourceTree = "<group>"; };
+ F791965819D3BE2200F60C6B /* intra_pred_common.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = intra_pred_common.cpp; sourceTree = "<group>"; };
FAABAA1618E9353F00D4186F /* sad_common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sad_common.h; sourceTree = "<group>"; };
FAABAA1718E9354A00D4186F /* sad_common.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sad_common.cpp; sourceTree = "<group>"; };
/* End PBXFileReference section */
@@ -93,6 +100,7 @@
4C3406B118D96EA600DFA14A /* arm */ = {
isa = PBXGroup;
children = (
+ F791965519D3B8A600F60C6B /* intra_pred_common_neon.S */,
4CC61F0818FF6B4B00E56EAB /* copy_mb_neon.S */,
4C3406B218D96EA600DFA14A /* arm_arch_common_macro.S */,
4C3406B318D96EA600DFA14A /* deblocking_neon.S */,
@@ -105,6 +113,7 @@
4C3406B618D96EA600DFA14A /* inc */ = {
isa = PBXGroup;
children = (
+ F791965719D3BA9300F60C6B /* intra_pred_common.h */,
F0B204F718FD23B6005DA23F /* copy_mb.h */,
FAABAA1618E9353F00D4186F /* sad_common.h */,
4C3406B718D96EA600DFA14A /* cpu.h */,
@@ -126,6 +135,7 @@
4C3406C318D96EA600DFA14A /* src */ = {
isa = PBXGroup;
children = (
+ F791965819D3BE2200F60C6B /* intra_pred_common.cpp */,
5BA8F2BF19603F5F00011CE4 /* common_tables.cpp */,
F0B204F818FD23BF005DA23F /* copy_mb.cpp */,
FAABAA1718E9354A00D4186F /* sad_common.cpp */,
@@ -179,6 +189,7 @@
F556A81D1906669F00E156A8 /* arm64 */ = {
isa = PBXGroup;
children = (
+ F791965319D3B89D00F60C6B /* intra_pred_common_aarch64_neon.S */,
F5BB0BB7196BB5960072D50D /* copy_mb_aarch64_neon.S */,
F5AC94FE193EB7D800F58154 /* deblocking_aarch64_neon.S */,
F5B8D82C190757290037849A /* mc_aarch64_neon.S */,
@@ -244,12 +255,15 @@
F556A8241906673900E156A8 /* arm_arch64_common_macro.S in Sources */,
F5AC94FF193EB7D800F58154 /* deblocking_aarch64_neon.S in Sources */,
4C3406CE18D96EA600DFA14A /* crt_util_safe_x.cpp in Sources */,
+ F791965919D3BE2200F60C6B /* intra_pred_common.cpp in Sources */,
4C3406CF18D96EA600DFA14A /* deblocking_common.cpp in Sources */,
5BA8F2C019603F5F00011CE4 /* common_tables.cpp in Sources */,
+ F791965419D3B89D00F60C6B /* intra_pred_common_aarch64_neon.S in Sources */,
4C3406D118D96EA600DFA14A /* WelsThreadLib.cpp in Sources */,
4C3406CC18D96EA600DFA14A /* mc_neon.S in Sources */,
F5BB0BB8196BB5960072D50D /* copy_mb_aarch64_neon.S in Sources */,
4C3406CB18D96EA600DFA14A /* expand_picture_neon.S in Sources */,
+ F791965619D3B8A600F60C6B /* intra_pred_common_neon.S in Sources */,
4CC61F0918FF6B4B00E56EAB /* copy_mb_neon.S in Sources */,
53C1C9BC193F0FB000404D8F /* expand_pic.cpp in Sources */,
4C3406CD18D96EA600DFA14A /* cpu.cpp in Sources */,
--- a/codec/build/iOS/processing/processing.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/processing/processing.xcodeproj/project.pbxproj
@@ -17,7 +17,6 @@
549947E2196A3FB400BA3D87 /* pixel_sad_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 549947AE196A3FB400BA3D87 /* pixel_sad_neon.S */; };
549947E3196A3FB400BA3D87 /* vaa_calc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 549947AF196A3FB400BA3D87 /* vaa_calc_neon.S */; };
549947E4196A3FB400BA3D87 /* BackgroundDetection.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947B1196A3FB400BA3D87 /* BackgroundDetection.cpp */; };
- 549947E5196A3FB400BA3D87 /* common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947B4196A3FB400BA3D87 /* common.cpp */; };
549947E6196A3FB400BA3D87 /* memory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947B6196A3FB400BA3D87 /* memory.cpp */; };
549947E7196A3FB400BA3D87 /* WelsFrameWork.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947BB196A3FB400BA3D87 /* WelsFrameWork.cpp */; };
549947E8196A3FB400BA3D87 /* WelsFrameWorkEx.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947BD196A3FB400BA3D87 /* WelsFrameWorkEx.cpp */; };
@@ -34,6 +33,7 @@
549947F3196A3FB400BA3D87 /* vaacalcfuncs.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947D8196A3FB400BA3D87 /* vaacalcfuncs.cpp */; };
549947F4196A3FB400BA3D87 /* vaacalculation.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947D9196A3FB400BA3D87 /* vaacalculation.cpp */; };
6C749B78197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 6C749B77197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S */; };
+ F791965B19D3BF6B00F60C6B /* intra_pred_common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F791965A19D3BF6B00F60C6B /* intra_pred_common.cpp */; };
/* End PBXBuildFile section */
/* Begin PBXCopyFilesBuildPhase section */
@@ -64,7 +64,6 @@
549947AF196A3FB400BA3D87 /* vaa_calc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = vaa_calc_neon.S; sourceTree = "<group>"; };
549947B1196A3FB400BA3D87 /* BackgroundDetection.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = BackgroundDetection.cpp; sourceTree = "<group>"; };
549947B2196A3FB400BA3D87 /* BackgroundDetection.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = BackgroundDetection.h; sourceTree = "<group>"; };
- 549947B4196A3FB400BA3D87 /* common.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = common.cpp; sourceTree = "<group>"; };
549947B5196A3FB400BA3D87 /* common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = common.h; sourceTree = "<group>"; };
549947B6196A3FB400BA3D87 /* memory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = memory.cpp; sourceTree = "<group>"; };
549947B7196A3FB400BA3D87 /* memory.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = memory.h; sourceTree = "<group>"; };
@@ -97,6 +96,7 @@
549947D9196A3FB400BA3D87 /* vaacalculation.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = vaacalculation.cpp; sourceTree = "<group>"; };
549947DA196A3FB400BA3D87 /* vaacalculation.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = vaacalculation.h; sourceTree = "<group>"; };
6C749B77197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = adaptive_quantization_aarch64_neon.S; path = arm64/adaptive_quantization_aarch64_neon.S; sourceTree = "<group>"; };
+ F791965A19D3BF6B00F60C6B /* intra_pred_common.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = intra_pred_common.cpp; path = ../../../common/src/intra_pred_common.cpp; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@@ -210,7 +210,7 @@
549947B3196A3FB400BA3D87 /* common */ = {
isa = PBXGroup;
children = (
- 549947B4196A3FB400BA3D87 /* common.cpp */,
+ F791965A19D3BF6B00F60C6B /* intra_pred_common.cpp */,
549947B5196A3FB400BA3D87 /* common.h */,
549947B6196A3FB400BA3D87 /* memory.cpp */,
549947B7196A3FB400BA3D87 /* memory.h */,
@@ -351,7 +351,6 @@
549947E9196A3FB400BA3D87 /* ComplexityAnalysis.cpp in Sources */,
549947E3196A3FB400BA3D87 /* vaa_calc_neon.S in Sources */,
549947EE196A3FB400BA3D87 /* imagerotate.cpp in Sources */,
- 549947E5196A3FB400BA3D87 /* common.cpp in Sources */,
549947EA196A3FB400BA3D87 /* denoise.cpp in Sources */,
549947E7196A3FB400BA3D87 /* WelsFrameWork.cpp in Sources */,
549947F1196A3FB400BA3D87 /* ScrollDetection.cpp in Sources */,
@@ -367,6 +366,7 @@
4CC6094F197E009D00BE8B8B /* down_sample_aarch64_neon.S in Sources */,
4CC6095A1980F34F00BE8B8B /* vaa_calc_aarch64_neon.S in Sources */,
549947F2196A3FB400BA3D87 /* ScrollDetectionFuncs.cpp in Sources */,
+ F791965B19D3BF6B00F60C6B /* intra_pred_common.cpp in Sources */,
549947EF196A3FB400BA3D87 /* imagerotatefuncs.cpp in Sources */,
549947DF196A3FB400BA3D87 /* AdaptiveQuantization.cpp in Sources */,
549947EC196A3FB400BA3D87 /* downsample.cpp in Sources */,
--- a/codec/build/win32/enc/WelsEncCore.vcproj
+++ b/codec/build/win32/enc/WelsEncCore.vcproj
@@ -382,6 +382,10 @@
>
</File>
<File
+ RelativePath="..\..\..\common\src\intra_pred_common.cpp"
+ >
+ </File>
+ <File
RelativePath="..\..\..\encoder\core\src\mc.cpp"
>
</File>
@@ -955,6 +959,46 @@
<Tool
Name="VCCustomBuildTool"
CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\..\..\common\x86\intra_pred_com.asm"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
--- /dev/null
+++ b/codec/common/arm/intra_pred_common_neon.S
@@ -1,0 +1,83 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+
+#ifdef HAVE_NEON
+.text
+#include "arm_arch_common_macro.S"
+
+WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon
+ //Get the top line data to 'q0'
+ sub r3, r1, r2
+ vldm r3, {d0, d1}
+
+ //mov r2, #16
+ mov r3, #4
+ //Set the top line to the each line of MB(16*16)
+loop_0_get_i16x16_luma_pred_v:
+ vst1.8 {d0,d1}, [r0]!
+ vst1.8 {d0,d1}, [r0]!
+ vst1.8 {d0,d1}, [r0]!
+ vst1.8 {d0,d1}, [r0]!
+ subs r3, #1
+ bne loop_0_get_i16x16_luma_pred_v
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon
+ //stmdb sp!, {r4, lr}
+ sub r1, r1, #1
+ mov r3, #4
+loop_0_get_i16x16_luma_pred_h:
+ //Get one byte data from left side
+ vld1.8 {d0[],d1[]}, [r1], r2
+ vld1.8 {d2[],d3[]}, [r1], r2
+ vld1.8 {d4[],d5[]}, [r1], r2
+ vld1.8 {d6[],d7[]}, [r1], r2
+
+ //Set the line of MB using the left side byte data
+ vst1.8 {d0,d1}, [r0]!
+ //add r0, #16
+ vst1.8 {d2,d3}, [r0]!
+ //add r0, #16
+ vst1.8 {d4,d5}, [r0]!
+ //add r0, #16
+ vst1.8 {d6,d7}, [r0]!
+ //add r0, #16
+
+ subs r3, #1
+ bne loop_0_get_i16x16_luma_pred_h
+
+WELS_ASM_FUNC_END
+
+
+#endif
--- /dev/null
+++ b/codec/common/arm64/intra_pred_common_aarch64_neon.S
@@ -1,0 +1,55 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+
+//for Luma 16x16
+WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredV_AArch64_neon
+ sub x3, x1, x2
+ ld1 {v0.16b}, [x3]
+.rept 16
+ st1 {v0.16b}, [x0], 16
+.endr
+WELS_ASM_AARCH64_FUNC_END
+
+WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredH_AArch64_neon
+ sub x3, x1, #1
+.rept 16
+ ld1r {v0.16b}, [x3], x2
+ st1 {v0.16b}, [x0], 16
+.endr
+WELS_ASM_AARCH64_FUNC_END
+
+#endif
+
--- /dev/null
+++ b/codec/common/inc/intra_pred_common.h
@@ -1,0 +1,76 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file intra_pred_common.h
+ *
+ * \brief interfaces for intra predictor about 16x16.
+ *
+ * \date 4/2/2014 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef INTRA_PRED_COMMON_H
+#define INTRA_PRED_COMMON_H
+
+#include "typedefs.h"
+
+
+void WelsI16x16LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#if defined(X86_ASM)
+//for intra-prediction ASM functions
+void WelsI16x16LumaPredV_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredH_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+#endif//X86_ASM
+
+#if defined(HAVE_NEON)
+void WelsI16x16LumaPredV_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredH_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+#endif//HAVE_NEON
+
+#if defined(HAVE_NEON_AARCH64)
+void WelsI16x16LumaPredV_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredH_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+#endif//HAVE_NEON_AARCH64
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+#endif//
+
+
+
--- /dev/null
+++ b/codec/common/src/intra_pred_common.cpp
@@ -1,0 +1,77 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file get_intra_predictor.c
+ *
+ * \brief implementation for get intra predictor about 16x16, 4x4, chroma.
+ *
+ * \date 4/2/2009 Created
+ * 9/14/2009 C level based optimization with high performance gained.
+ * [const, using ST32/ST64 to replace memset, memcpy and memmove etc.]
+ *
+ *************************************************************************************
+ */
+#include "ls_defines.h"
+#include "cpu_core.h"
+#include "intra_pred_common.h"
+
+
+void WelsI16x16LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
+ uint8_t i = 15;
+ const int8_t* kpSrc = (int8_t*)&pRef[-kiStride];
+ const uint64_t kuiT1 = LD64 (kpSrc);
+ const uint64_t kuiT2 = LD64 (kpSrc + 8);
+ uint8_t* pDst = pPred;
+
+ do {
+ ST64 (pDst , kuiT1);
+ ST64 (pDst + 8, kuiT2);
+ pDst += 16;
+ } while (i-- > 0);
+}
+
+void WelsI16x16LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
+ int32_t iStridex15 = (kiStride << 4) - kiStride;
+ int32_t iPredStride = 16;
+ int32_t iPredStridex15 = 240; //(iPredStride<<4)-iPredStride;
+ uint8_t i = 15;
+
+ do {
+ const uint8_t kuiSrc8 = pRef[iStridex15 - 1];
+ const uint64_t kuiV64 = (uint64_t) (0x0101010101010101ULL * kuiSrc8);
+ ST64 (&pPred[iPredStridex15], kuiV64);
+ ST64 (&pPred[iPredStridex15 + 8], kuiV64);
+
+ iStridex15 -= kiStride;
+ iPredStridex15 -= iPredStride;
+ } while (i-- > 0);
+}
+
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -6,6 +6,7 @@
$(COMMON_SRCDIR)/src/crt_util_safe_x.cpp\
$(COMMON_SRCDIR)/src/deblocking_common.cpp\
$(COMMON_SRCDIR)/src/expand_pic.cpp\
+ $(COMMON_SRCDIR)/src/intra_pred_common.cpp\
$(COMMON_SRCDIR)/src/sad_common.cpp\
$(COMMON_SRCDIR)/src/utils.cpp\
$(COMMON_SRCDIR)/src/welsCodecTrace.cpp\
@@ -17,6 +18,7 @@
$(COMMON_SRCDIR)/x86/cpuid.asm\
$(COMMON_SRCDIR)/x86/deblock.asm\
$(COMMON_SRCDIR)/x86/expand_picture.asm\
+ $(COMMON_SRCDIR)/x86/intra_pred_com.asm\
$(COMMON_SRCDIR)/x86/mb_copy.asm\
$(COMMON_SRCDIR)/x86/mc_chroma.asm\
$(COMMON_SRCDIR)/x86/mc_luma.asm\
@@ -33,6 +35,7 @@
$(COMMON_SRCDIR)/arm/copy_mb_neon.S\
$(COMMON_SRCDIR)/arm/deblocking_neon.S\
$(COMMON_SRCDIR)/arm/expand_picture_neon.S\
+ $(COMMON_SRCDIR)/arm/intra_pred_common_neon.S\
$(COMMON_SRCDIR)/arm/mc_neon.S\
COMMON_OBJSARM += $(COMMON_ASM_ARM_SRCS:.S=.$(OBJ))
@@ -45,6 +48,7 @@
$(COMMON_SRCDIR)/arm64/copy_mb_aarch64_neon.S\
$(COMMON_SRCDIR)/arm64/deblocking_aarch64_neon.S\
$(COMMON_SRCDIR)/arm64/expand_picture_aarch64_neon.S\
+ $(COMMON_SRCDIR)/arm64/intra_pred_common_aarch64_neon.S\
$(COMMON_SRCDIR)/arm64/mc_aarch64_neon.S\
COMMON_OBJSARM64 += $(COMMON_ASM_ARM64_SRCS:.S=.$(OBJ))
--- /dev/null
+++ b/codec/common/x86/intra_pred_com.asm
@@ -1,0 +1,117 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* intra_pred_common.asm
+;*
+;* Abstract
+;* sse2 function for intra predict operations
+;*
+;* History
+;* 18/09/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+
+%macro SSE2_PRED_H_16X16_ONE_LINE 0
+ add r0, 16
+ add r1, r2
+ movzx r3, byte [r1]
+ SSE2_Copy16Times xmm0, r3d
+ movdqa [r0], xmm0
+%endmacro
+
+WELS_EXTERN WelsI16x16LumaPredH_sse2
+ push r3
+ %assign push_num 1
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ dec r1
+ movzx r3, byte [r1]
+ SSE2_Copy16Times xmm0, r3d
+ movdqa [r0], xmm0
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ pop r3
+ ret
+
+;***********************************************************************
+; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredV_sse2
+ %assign push_num 0
+ LOAD_3_PARA
+ SIGN_EXTENSION r2, r2d
+ sub r1, r2
+ movdqa xmm0, [r1]
+
+ movdqa [r0], xmm0
+ movdqa [r0+10h], xmm0
+ movdqa [r0+20h], xmm0
+ movdqa [r0+30h], xmm0
+ movdqa [r0+40h], xmm0
+ movdqa [r0+50h], xmm0
+ movdqa [r0+60h], xmm0
+ movdqa [r0+70h], xmm0
+ movdqa [r0+80h], xmm0
+ movdqa [r0+90h], xmm0
+ movdqa [r0+160], xmm0
+ movdqa [r0+176], xmm0
+ movdqa [r0+192], xmm0
+ movdqa [r0+208], xmm0
+ movdqa [r0+224], xmm0
+ movdqa [r0+240], xmm0
+
+ ret
+
--- a/codec/encoder/core/arm/intra_pred_neon.S
+++ b/codec/encoder/core/arm/intra_pred_neon.S
@@ -62,51 +62,6 @@
#endif
-WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon
- //Get the top line data to 'q0'
- sub r3, r1, r2
- vldm r3, {d0, d1}
-
- //mov r2, #16
- mov r3, #4
- //Set the top line to the each line of MB(16*16)
-loop_0_get_i16x16_luma_pred_v:
- vst1.8 {d0,d1}, [r0]!
- vst1.8 {d0,d1}, [r0]!
- vst1.8 {d0,d1}, [r0]!
- vst1.8 {d0,d1}, [r0]!
- subs r3, #1
- bne loop_0_get_i16x16_luma_pred_v
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon
- //stmdb sp!, {r4, lr}
- sub r1, r1, #1
- mov r3, #4
-loop_0_get_i16x16_luma_pred_h:
- //Get one byte data from left side
- vld1.8 {d0[],d1[]}, [r1], r2
- vld1.8 {d2[],d3[]}, [r1], r2
- vld1.8 {d4[],d5[]}, [r1], r2
- vld1.8 {d6[],d7[]}, [r1], r2
-
- //Set the line of MB using the left side byte data
- vst1.8 {d0,d1}, [r0]!
- //add r0, #16
- vst1.8 {d2,d3}, [r0]!
- //add r0, #16
- vst1.8 {d4,d5}, [r0]!
- //add r0, #16
- vst1.8 {d6,d7}, [r0]!
- //add r0, #16
-
- subs r3, #1
- bne loop_0_get_i16x16_luma_pred_h
-
-WELS_ASM_FUNC_END
-
-
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon
//stmdb sp!, { r2-r5, lr}
//Get the left vertical line data
--- a/codec/encoder/core/arm64/intra_pred_aarch64_neon.S
+++ b/codec/encoder/core/arm64/intra_pred_aarch64_neon.S
@@ -349,23 +349,6 @@
.endr
WELS_ASM_AARCH64_FUNC_END
-//for Luma 16x16
-WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredV_AArch64_neon
- sub x3, x1, x2
- ld1 {v0.16b}, [x3]
-.rept 16
- st1 {v0.16b}, [x0], 16
-.endr
-WELS_ASM_AARCH64_FUNC_END
-
-WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredH_AArch64_neon
- sub x3, x1, #1
-.rept 16
- ld1r {v0.16b}, [x3], x2
- st1 {v0.16b}, [x0], 16
-.endr
-WELS_ASM_AARCH64_FUNC_END
-
WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredDc_AArch64_neon
sub x3, x1, x2
sub x4, x1, #1
--- a/codec/encoder/core/inc/get_intra_predictor.h
+++ b/codec/encoder/core/inc/get_intra_predictor.h
@@ -74,8 +74,6 @@
void WelsI16x16ChromaPredVer (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
void WelsI16x16ChromaPredHor (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
-void WelsI16x16LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
-void WelsI16x16LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
void WelsI16x16LumaPredPlane_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
void WelsI16x16LumaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
void WelsI16x16LumaPredDcLeft_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
@@ -94,8 +92,6 @@
void WelsFillingPred1to16_sse2 (uint8_t* pPred, const uint8_t kuiValue);
//for intra-prediction ASM functions
-void WelsI16x16LumaPredV_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
-void WelsI16x16LumaPredH_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
void WelsI16x16LumaPredDc_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
void WelsI16x16LumaPredPlane_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
@@ -116,8 +112,6 @@
#endif//X86_ASM
#if defined(HAVE_NEON)
-void WelsI16x16LumaPredV_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
-void WelsI16x16LumaPredH_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
void WelsI16x16LumaPredDc_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
void WelsI16x16LumaPredPlane_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
@@ -137,8 +131,6 @@
#endif//HAVE_NEON
#if defined(HAVE_NEON_AARCH64)
-void WelsI16x16LumaPredV_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
-void WelsI16x16LumaPredH_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
void WelsI16x16LumaPredDc_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
void WelsI16x16LumaPredPlane_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
void WelsI16x16LumaPredDcTop_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
--- a/codec/encoder/core/src/get_intra_predictor.cpp
+++ b/codec/encoder/core/src/get_intra_predictor.cpp
@@ -41,6 +41,7 @@
*/
#include "ls_defines.h"
#include "cpu_core.h"
+#include "intra_pred_common.h"
#include "get_intra_predictor.h"
namespace WelsEnc {
@@ -537,37 +538,6 @@
ST64 (pPred + 56, kuiDcValue64);
}
-
-void WelsI16x16LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
- uint8_t i = 15;
- const int8_t* kpSrc = (int8_t*)&pRef[-kiStride];
- const uint64_t kuiT1 = LD64 (kpSrc);
- const uint64_t kuiT2 = LD64 (kpSrc + 8);
- uint8_t* pDst = pPred;
-
- do {
- ST64 (pDst , kuiT1);
- ST64 (pDst + 8, kuiT2);
- pDst += 16;
- } while (i-- > 0);
-}
-
-void WelsI16x16LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
- int32_t iStridex15 = (kiStride << 4) - kiStride;
- int32_t iPredStride = 16;
- int32_t iPredStridex15 = 240; //(iPredStride<<4)-iPredStride;
- uint8_t i = 15;
-
- do {
- const uint8_t kuiSrc8 = pRef[iStridex15 - 1];
- const uint64_t kuiV64 = (uint64_t) (0x0101010101010101ULL * kuiSrc8);
- ST64 (&pPred[iPredStridex15], kuiV64);
- ST64 (&pPred[iPredStridex15 + 8], kuiV64);
-
- iStridex15 -= kiStride;
- iPredStridex15 -= iPredStride;
- } while (i-- > 0);
-}
void WelsI16x16LumaPredPlane_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
int32_t iLTshift = 0, iTopshift = 0, iLeftshift = 0, iTopSum = 0, iLeftSum = 0;
--- a/codec/encoder/core/src/sample.cpp
+++ b/codec/encoder/core/src/sample.cpp
@@ -40,7 +40,7 @@
#include "sample.h"
#include "sad_common.h"
-
+#include "intra_pred_common.h"
#include "mc.h"
#include "cpu_core.h"
@@ -250,8 +250,8 @@
}
extern void WelsI16x16LumaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
-extern void WelsI16x16LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
-extern void WelsI16x16LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
+//extern void WelsI16x16LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
+//extern void WelsI16x16LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
int32_t WelsSampleSatdIntra16x16Combined3_c (uint8_t* pDec, int32_t iDecStride, uint8_t* pEnc, int32_t iEncStride,
int32_t* pBestMode, int32_t iLambda, uint8_t* pDst) {
--- a/codec/encoder/core/x86/intra_pred.asm
+++ b/codec/encoder/core/x86/intra_pred.asm
@@ -307,74 +307,6 @@
ret
;***********************************************************************
-; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-
-%macro SSE2_PRED_H_16X16_ONE_LINE 0
- add r0, 16
- add r1, r2
- movzx r3, byte [r1]
- SSE2_Copy16Times xmm0, r3d
- movdqa [r0], xmm0
-%endmacro
-
-WELS_EXTERN WelsI16x16LumaPredH_sse2
- push r3
- %assign push_num 1
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- dec r1
- movzx r3, byte [r1]
- SSE2_Copy16Times xmm0, r3d
- movdqa [r0], xmm0
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- pop r3
- ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredV_sse2
- %assign push_num 0
- LOAD_3_PARA
- SIGN_EXTENSION r2, r2d
- sub r1, r2
- movdqa xmm0, [r1]
-
- movdqa [r0], xmm0
- movdqa [r0+10h], xmm0
- movdqa [r0+20h], xmm0
- movdqa [r0+30h], xmm0
- movdqa [r0+40h], xmm0
- movdqa [r0+50h], xmm0
- movdqa [r0+60h], xmm0
- movdqa [r0+70h], xmm0
- movdqa [r0+80h], xmm0
- movdqa [r0+90h], xmm0
- movdqa [r0+160], xmm0
- movdqa [r0+176], xmm0
- movdqa [r0+192], xmm0
- movdqa [r0+208], xmm0
- movdqa [r0+224], xmm0
- movdqa [r0+240], xmm0
-
- ret
-
-;***********************************************************************
; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
WELS_EXTERN WelsIChromaPredPlane_sse2
--- a/codec/processing/build/win32/WelsVP.vcproj
+++ b/codec/processing/build/win32/WelsVP.vcproj
@@ -358,11 +358,11 @@
UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
>
<File
- RelativePath="..\..\src\common\common.cpp"
+ RelativePath="..\..\..\common\src\cpu.cpp"
>
</File>
<File
- RelativePath="..\..\..\common\src\cpu.cpp"
+ RelativePath="..\..\..\common\src\intra_pred_common.cpp"
>
</File>
<File
@@ -426,6 +426,10 @@
>
</File>
<File
+ RelativePath="..\..\..\common\inc\intra_pred_common.h"
+ >
+ </File>
+ <File
RelativePath="..\..\src\common\memory.h"
>
</File>
@@ -569,6 +573,46 @@
<Tool
Name="VCCustomBuildTool"
CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\..\..\common\x86\intra_pred_com.asm"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
--- a/codec/processing/src/common/common.cpp
+++ /dev/null
@@ -1,69 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "common.h"
-#include "ls_defines.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-void WelsI16x16LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
- uint8_t i = 15;
- const int8_t* kpSrc = (int8_t*)&pRef[-kiStride];
- const uint64_t kuiT1 = LD64 (kpSrc);
- const uint64_t kuiT2 = LD64 (kpSrc + 8);
- uint8_t* pDst = pPred;
-
- do {
- ST64 (pDst , kuiT1);
- ST64 (pDst + 8, kuiT2);
- pDst += 16;
- } while (i-- > 0);
-}
-
-void WelsI16x16LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
- int32_t iStridex15 = (kiStride << 4) - kiStride;
- int32_t iPredStride = 16;
- int32_t iPredStridex15 = 240; //(iPredStride<<4)-iPredStride;
- uint8_t i = 15;
-
- do {
- const uint8_t kuiSrc8 = pRef[iStridex15 - 1];
- const uint64_t kuiV64 = (uint64_t) (0x0101010101010101ULL * kuiSrc8);
- ST64 (&pPred[iPredStridex15], kuiV64);
- ST64 (&pPred[iPredStridex15 + 8], kuiV64);
-
- iStridex15 -= kiStride;
- iPredStridex15 -= iPredStride;
- } while (i-- > 0);
-}
-
-WELSVP_NAMESPACE_END
--- a/codec/processing/src/common/common.h
+++ b/codec/processing/src/common/common.h
@@ -38,8 +38,8 @@
*
*/
-#ifndef WELSVP_SCENECHANGEDETECTIONCOMMON_H
-#define WELSVP_SCENECHANGEDETECTIONCOMMON_H
+#ifndef WELSVP_COMMON_H
+#define WELSVP_COMMON_H
#include "util.h"
#include "memory.h"
@@ -46,7 +46,17 @@
#include "WelsFrameWork.h"
#include "IWelsVP.h"
#include "sad_common.h"
+#include "intra_pred_common.h"
+
+
+typedef void (GetIntraPred) (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+
+typedef GetIntraPred* GetIntraPredPtr;
+
+GetIntraPred WelsI16x16LumaPredV_c;
+GetIntraPred WelsI16x16LumaPredH_c;
+
WELSVP_NAMESPACE_BEGIN
typedef int32_t (SadFunc) (uint8_t* pSrcY, int32_t iSrcStrideY, uint8_t* pRefY, int32_t iRefStrideY);
@@ -56,12 +66,6 @@
typedef int32_t (Sad16x16Func) (uint8_t* pSrcY, int32_t iSrcStrideY, uint8_t* pRefY, int32_t iRefStrideY);
typedef Sad16x16Func* PSad16x16Func;
-typedef void (GetIntraPred) (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
-
-typedef GetIntraPred* GetIntraPredPtr;
-
-GetIntraPred WelsI16x16LumaPredV_c;
-GetIntraPred WelsI16x16LumaPredH_c;
#ifdef HAVE_NEON
WELSVP_EXTERN_C_BEGIN
--- a/codec/processing/src/complexityanalysis/ComplexityAnalysis.cpp
+++ b/codec/processing/src/complexityanalysis/ComplexityAnalysis.cpp
@@ -33,10 +33,10 @@
#include "ComplexityAnalysis.h"
#include "cpu.h"
#include "macros.h"
+#include "intra_pred_common.h"
WELSVP_NAMESPACE_BEGIN
-
///////////////////////////////////////////////////////////////////////////////////////////////////////////////
CComplexityAnalysis::CComplexityAnalysis (int32_t iCpuFlag) {
@@ -280,8 +280,29 @@
#ifdef X86_ASM
if (iCpuFlag & WELS_CPU_SSE2) {
m_pSadFunc = WelsSampleSad16x16_sse2;
+ m_pIntraFunc[0] = WelsI16x16LumaPredV_sse2;
+ m_pIntraFunc[1] = WelsI16x16LumaPredH_sse2;
+
}
#endif
+
+#if defined (HAVE_NEON)
+ if (iCpuFlag & WELS_CPU_NEON) {
+ m_pSadFunc = WelsSampleSad16x16_neon;
+ m_pIntraFunc[0] = WelsI16x16LumaPredV_neon;
+ m_pIntraFunc[1] = WelsI16x16LumaPredH_neon;
+
+ }
+#endif
+
+#if defined (HAVE_NEON_AARCH64)
+ if (iCpuFlag & WELS_CPU_NEON) {
+ m_pSadFunc = WelsSampleSad16x16_AArch64_neon;
+ m_pIntraFunc[0] = WelsI16x16LumaPredV_AArch64_neon;
+ m_pIntraFunc[1] = WelsI16x16LumaPredH_AArch64_neon;
+ }
+#endif
+
}
CComplexityAnalysisScreen::~CComplexityAnalysisScreen() {
--- a/codec/processing/targets.mk
+++ b/codec/processing/targets.mk
@@ -2,7 +2,6 @@
PROCESSING_CPP_SRCS=\
$(PROCESSING_SRCDIR)/src/adaptivequantization/AdaptiveQuantization.cpp\
$(PROCESSING_SRCDIR)/src/backgrounddetection/BackgroundDetection.cpp\
- $(PROCESSING_SRCDIR)/src/common/common.cpp\
$(PROCESSING_SRCDIR)/src/common/memory.cpp\
$(PROCESSING_SRCDIR)/src/common/WelsFrameWork.cpp\
$(PROCESSING_SRCDIR)/src/common/WelsFrameWorkEx.cpp\