shithub: openh264

Download patch

ref: 8945348c87fdf967ec59232f248c0fc05aeeca40
parent: 73d27e9776cbc977a74134782b80b103fe251b08
parent: ed341048de1d4d74af8ba9eefd7f7f2a951035e6
author: huili2 <huili2@cisco.com>
date: Thu Sep 25 11:04:46 EDT 2014

Merge pull request #1385 from ruil2/function

enable ARM assembly for SampleSad16x16

--- a/build/gtest-targets.mk
+++ b/build/gtest-targets.mk
@@ -5,6 +5,7 @@
 GTEST_OBJS += $(GTEST_CPP_SRCS:.cc=.$(OBJ))
 
 OBJS += $(GTEST_OBJS)
+
 $(GTEST_SRCDIR)/%.$(OBJ): $(GTEST_SRCDIR)/%.cc
 	$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(GTEST_CFLAGS) $(GTEST_INCLUDES) -c $(CXX_O) $<
 
--- a/codec/build/iOS/common/common.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/common/common.xcodeproj/project.pbxproj
@@ -25,6 +25,9 @@
 		F5AC94FF193EB7D800F58154 /* deblocking_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5AC94FE193EB7D800F58154 /* deblocking_aarch64_neon.S */; };
 		F5B8D82D190757290037849A /* mc_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5B8D82C190757290037849A /* mc_aarch64_neon.S */; };
 		F5BB0BB8196BB5960072D50D /* copy_mb_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5BB0BB7196BB5960072D50D /* copy_mb_aarch64_neon.S */; };
+		F791965419D3B89D00F60C6B /* intra_pred_common_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F791965319D3B89D00F60C6B /* intra_pred_common_aarch64_neon.S */; };
+		F791965619D3B8A600F60C6B /* intra_pred_common_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F791965519D3B8A600F60C6B /* intra_pred_common_neon.S */; };
+		F791965919D3BE2200F60C6B /* intra_pred_common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F791965819D3BE2200F60C6B /* intra_pred_common.cpp */; };
 		FAABAA1818E9354A00D4186F /* sad_common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FAABAA1718E9354A00D4186F /* sad_common.cpp */; };
 /* End PBXBuildFile section */
 
@@ -74,6 +77,10 @@
 		F5AC94FE193EB7D800F58154 /* deblocking_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = deblocking_aarch64_neon.S; path = arm64/deblocking_aarch64_neon.S; sourceTree = "<group>"; };
 		F5B8D82C190757290037849A /* mc_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = mc_aarch64_neon.S; path = arm64/mc_aarch64_neon.S; sourceTree = "<group>"; };
 		F5BB0BB7196BB5960072D50D /* copy_mb_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = copy_mb_aarch64_neon.S; path = arm64/copy_mb_aarch64_neon.S; sourceTree = "<group>"; };
+		F791965319D3B89D00F60C6B /* intra_pred_common_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = intra_pred_common_aarch64_neon.S; path = arm64/intra_pred_common_aarch64_neon.S; sourceTree = "<group>"; };
+		F791965519D3B8A600F60C6B /* intra_pred_common_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_common_neon.S; sourceTree = "<group>"; };
+		F791965719D3BA9300F60C6B /* intra_pred_common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = intra_pred_common.h; sourceTree = "<group>"; };
+		F791965819D3BE2200F60C6B /* intra_pred_common.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = intra_pred_common.cpp; sourceTree = "<group>"; };
 		FAABAA1618E9353F00D4186F /* sad_common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sad_common.h; sourceTree = "<group>"; };
 		FAABAA1718E9354A00D4186F /* sad_common.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sad_common.cpp; sourceTree = "<group>"; };
 /* End PBXFileReference section */
@@ -93,6 +100,7 @@
 		4C3406B118D96EA600DFA14A /* arm */ = {
 			isa = PBXGroup;
 			children = (
+				F791965519D3B8A600F60C6B /* intra_pred_common_neon.S */,
 				4CC61F0818FF6B4B00E56EAB /* copy_mb_neon.S */,
 				4C3406B218D96EA600DFA14A /* arm_arch_common_macro.S */,
 				4C3406B318D96EA600DFA14A /* deblocking_neon.S */,
@@ -105,6 +113,7 @@
 		4C3406B618D96EA600DFA14A /* inc */ = {
 			isa = PBXGroup;
 			children = (
+				F791965719D3BA9300F60C6B /* intra_pred_common.h */,
 				F0B204F718FD23B6005DA23F /* copy_mb.h */,
 				FAABAA1618E9353F00D4186F /* sad_common.h */,
 				4C3406B718D96EA600DFA14A /* cpu.h */,
@@ -126,6 +135,7 @@
 		4C3406C318D96EA600DFA14A /* src */ = {
 			isa = PBXGroup;
 			children = (
+				F791965819D3BE2200F60C6B /* intra_pred_common.cpp */,
 				5BA8F2BF19603F5F00011CE4 /* common_tables.cpp */,
 				F0B204F818FD23BF005DA23F /* copy_mb.cpp */,
 				FAABAA1718E9354A00D4186F /* sad_common.cpp */,
@@ -179,6 +189,7 @@
 		F556A81D1906669F00E156A8 /* arm64 */ = {
 			isa = PBXGroup;
 			children = (
+				F791965319D3B89D00F60C6B /* intra_pred_common_aarch64_neon.S */,
 				F5BB0BB7196BB5960072D50D /* copy_mb_aarch64_neon.S */,
 				F5AC94FE193EB7D800F58154 /* deblocking_aarch64_neon.S */,
 				F5B8D82C190757290037849A /* mc_aarch64_neon.S */,
@@ -244,12 +255,15 @@
 				F556A8241906673900E156A8 /* arm_arch64_common_macro.S in Sources */,
 				F5AC94FF193EB7D800F58154 /* deblocking_aarch64_neon.S in Sources */,
 				4C3406CE18D96EA600DFA14A /* crt_util_safe_x.cpp in Sources */,
+				F791965919D3BE2200F60C6B /* intra_pred_common.cpp in Sources */,
 				4C3406CF18D96EA600DFA14A /* deblocking_common.cpp in Sources */,
 				5BA8F2C019603F5F00011CE4 /* common_tables.cpp in Sources */,
+				F791965419D3B89D00F60C6B /* intra_pred_common_aarch64_neon.S in Sources */,
 				4C3406D118D96EA600DFA14A /* WelsThreadLib.cpp in Sources */,
 				4C3406CC18D96EA600DFA14A /* mc_neon.S in Sources */,
 				F5BB0BB8196BB5960072D50D /* copy_mb_aarch64_neon.S in Sources */,
 				4C3406CB18D96EA600DFA14A /* expand_picture_neon.S in Sources */,
+				F791965619D3B8A600F60C6B /* intra_pred_common_neon.S in Sources */,
 				4CC61F0918FF6B4B00E56EAB /* copy_mb_neon.S in Sources */,
 				53C1C9BC193F0FB000404D8F /* expand_pic.cpp in Sources */,
 				4C3406CD18D96EA600DFA14A /* cpu.cpp in Sources */,
--- a/codec/build/iOS/processing/processing.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/processing/processing.xcodeproj/project.pbxproj
@@ -17,7 +17,6 @@
 		549947E2196A3FB400BA3D87 /* pixel_sad_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 549947AE196A3FB400BA3D87 /* pixel_sad_neon.S */; };
 		549947E3196A3FB400BA3D87 /* vaa_calc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 549947AF196A3FB400BA3D87 /* vaa_calc_neon.S */; };
 		549947E4196A3FB400BA3D87 /* BackgroundDetection.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947B1196A3FB400BA3D87 /* BackgroundDetection.cpp */; };
-		549947E5196A3FB400BA3D87 /* common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947B4196A3FB400BA3D87 /* common.cpp */; };
 		549947E6196A3FB400BA3D87 /* memory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947B6196A3FB400BA3D87 /* memory.cpp */; };
 		549947E7196A3FB400BA3D87 /* WelsFrameWork.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947BB196A3FB400BA3D87 /* WelsFrameWork.cpp */; };
 		549947E8196A3FB400BA3D87 /* WelsFrameWorkEx.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947BD196A3FB400BA3D87 /* WelsFrameWorkEx.cpp */; };
@@ -34,6 +33,7 @@
 		549947F3196A3FB400BA3D87 /* vaacalcfuncs.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947D8196A3FB400BA3D87 /* vaacalcfuncs.cpp */; };
 		549947F4196A3FB400BA3D87 /* vaacalculation.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 549947D9196A3FB400BA3D87 /* vaacalculation.cpp */; };
 		6C749B78197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 6C749B77197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S */; };
+		F791965B19D3BF6B00F60C6B /* intra_pred_common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F791965A19D3BF6B00F60C6B /* intra_pred_common.cpp */; };
 /* End PBXBuildFile section */
 
 /* Begin PBXCopyFilesBuildPhase section */
@@ -64,7 +64,6 @@
 		549947AF196A3FB400BA3D87 /* vaa_calc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = vaa_calc_neon.S; sourceTree = "<group>"; };
 		549947B1196A3FB400BA3D87 /* BackgroundDetection.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = BackgroundDetection.cpp; sourceTree = "<group>"; };
 		549947B2196A3FB400BA3D87 /* BackgroundDetection.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = BackgroundDetection.h; sourceTree = "<group>"; };
-		549947B4196A3FB400BA3D87 /* common.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = common.cpp; sourceTree = "<group>"; };
 		549947B5196A3FB400BA3D87 /* common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = common.h; sourceTree = "<group>"; };
 		549947B6196A3FB400BA3D87 /* memory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = memory.cpp; sourceTree = "<group>"; };
 		549947B7196A3FB400BA3D87 /* memory.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = memory.h; sourceTree = "<group>"; };
@@ -97,6 +96,7 @@
 		549947D9196A3FB400BA3D87 /* vaacalculation.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = vaacalculation.cpp; sourceTree = "<group>"; };
 		549947DA196A3FB400BA3D87 /* vaacalculation.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = vaacalculation.h; sourceTree = "<group>"; };
 		6C749B77197E2A2000A111F9 /* adaptive_quantization_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = adaptive_quantization_aarch64_neon.S; path = arm64/adaptive_quantization_aarch64_neon.S; sourceTree = "<group>"; };
+		F791965A19D3BF6B00F60C6B /* intra_pred_common.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = intra_pred_common.cpp; path = ../../../common/src/intra_pred_common.cpp; sourceTree = "<group>"; };
 /* End PBXFileReference section */
 
 /* Begin PBXFrameworksBuildPhase section */
@@ -210,7 +210,7 @@
 		549947B3196A3FB400BA3D87 /* common */ = {
 			isa = PBXGroup;
 			children = (
-				549947B4196A3FB400BA3D87 /* common.cpp */,
+				F791965A19D3BF6B00F60C6B /* intra_pred_common.cpp */,
 				549947B5196A3FB400BA3D87 /* common.h */,
 				549947B6196A3FB400BA3D87 /* memory.cpp */,
 				549947B7196A3FB400BA3D87 /* memory.h */,
@@ -351,7 +351,6 @@
 				549947E9196A3FB400BA3D87 /* ComplexityAnalysis.cpp in Sources */,
 				549947E3196A3FB400BA3D87 /* vaa_calc_neon.S in Sources */,
 				549947EE196A3FB400BA3D87 /* imagerotate.cpp in Sources */,
-				549947E5196A3FB400BA3D87 /* common.cpp in Sources */,
 				549947EA196A3FB400BA3D87 /* denoise.cpp in Sources */,
 				549947E7196A3FB400BA3D87 /* WelsFrameWork.cpp in Sources */,
 				549947F1196A3FB400BA3D87 /* ScrollDetection.cpp in Sources */,
@@ -367,6 +366,7 @@
 				4CC6094F197E009D00BE8B8B /* down_sample_aarch64_neon.S in Sources */,
 				4CC6095A1980F34F00BE8B8B /* vaa_calc_aarch64_neon.S in Sources */,
 				549947F2196A3FB400BA3D87 /* ScrollDetectionFuncs.cpp in Sources */,
+				F791965B19D3BF6B00F60C6B /* intra_pred_common.cpp in Sources */,
 				549947EF196A3FB400BA3D87 /* imagerotatefuncs.cpp in Sources */,
 				549947DF196A3FB400BA3D87 /* AdaptiveQuantization.cpp in Sources */,
 				549947EC196A3FB400BA3D87 /* downsample.cpp in Sources */,
--- a/codec/build/win32/enc/WelsEncCore.vcproj
+++ b/codec/build/win32/enc/WelsEncCore.vcproj
@@ -382,6 +382,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\..\common\src\intra_pred_common.cpp"
+				>
+			</File>
+			<File
 				RelativePath="..\..\..\encoder\core\src\mc.cpp"
 				>
 			</File>
@@ -955,6 +959,46 @@
 					<Tool
 						Name="VCCustomBuildTool"
 						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\common\x86\intra_pred_com.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Debug|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
--- /dev/null
+++ b/codec/common/arm/intra_pred_common_neon.S
@@ -1,0 +1,83 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+
+#ifdef  HAVE_NEON
+.text
+#include "arm_arch_common_macro.S"
+
+WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon
+    //Load the 16 bytes at (r1 - r2) into q0 (d0,d1); with the C prototype r0 = pPred, r1 = pRef, r2 = kiStride
+    sub  r3, r1, r2
+    vldm r3, {d0, d1}
+
+    //r3 now becomes the loop counter: 4 iterations of 4 stores cover the 16 rows
+    mov  r3, #4
+    //Write q0 to every row of the 16x16 block; each store post-increments r0 by 16 bytes
+loop_0_get_i16x16_luma_pred_v:
+    vst1.8 {d0,d1}, [r0]!
+    vst1.8 {d0,d1}, [r0]!
+    vst1.8 {d0,d1}, [r0]!
+    vst1.8 {d0,d1}, [r0]!
+    subs  r3, #1
+    bne  loop_0_get_i16x16_luma_pred_v
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon
+    //r1 = pRef - 1 (the byte left of row 0); r3 = loop counter, 4 iterations of 4 rows each
+    sub  r1, r1, #1
+    mov  r3, #4
+loop_0_get_i16x16_luma_pred_h:
+    //Load one left-side byte per row, replicated to all 16 lanes of q0..q3; r1 advances by r2 (kiStride) after each load
+    vld1.8 {d0[],d1[]}, [r1], r2
+    vld1.8 {d2[],d3[]}, [r1], r2
+    vld1.8 {d4[],d5[]}, [r1], r2
+    vld1.8 {d6[],d7[]}, [r1], r2
+
+    //Store the four filled rows to pPred, which is written as consecutive 16-byte rows
+    vst1.8 {d0,d1}, [r0]!
+    //r0 += 16 through the writeback, no explicit add needed
+    vst1.8 {d2,d3}, [r0]!
+    //r0 += 16 through the writeback
+    vst1.8 {d4,d5}, [r0]!
+    //r0 += 16 through the writeback
+    vst1.8 {d6,d7}, [r0]!
+    //r0 += 16 through the writeback
+
+    subs  r3, #1
+    bne  loop_0_get_i16x16_luma_pred_h
+
+WELS_ASM_FUNC_END
+
+
+#endif
--- /dev/null
+++ b/codec/common/arm64/intra_pred_common_aarch64_neon.S
@@ -1,0 +1,55 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef  HAVE_NEON_AARCH64
+.text
+#include "arm_arch64_common_macro.S"
+
+//for Luma 16x16: x0 = pPred, x1 = pRef, w2 = kiStride (int32_t, upper half of x2 is unspecified)
+WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredV_AArch64_neon
+    sub     x3, x1, w2, sxtw        // x3 = pRef - kiStride, sign-extending the 32-bit stride
+    ld1     {v0.16b}, [x3]          // v0 = the 16 bytes of the row directly above the block
+.rept 16
+    st1     {v0.16b}, [x0], 16      // write that row to pPred and advance by one 16-byte row
+.endr
+WELS_ASM_AARCH64_FUNC_END
+
+WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredH_AArch64_neon
+    sxtw    x2, w2                  // kiStride is an int32_t: sign-extend before using x2 as a 64-bit post-index
+    sub     x3, x1, #1              // x3 = pRef - 1, the byte left of row 0
+.rept 16
+    ld1r    {v0.16b}, [x3], x2      // replicate the left-side byte into all 16 lanes, then step down one row
+    st1     {v0.16b}, [x0], 16      // write the filled row to pPred and advance by one 16-byte row
+.endr
+WELS_ASM_AARCH64_FUNC_END
+#endif
+
--- /dev/null
+++ b/codec/common/inc/intra_pred_common.h
@@ -1,0 +1,76 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	intra_pred_common.h
+ *
+ * \brief	interfaces for intra predictor about 16x16.
+ *
+ * \date	4/2/2014 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef INTRA_PRED_COMMON_H
+#define INTRA_PRED_COMMON_H
+
+#include "typedefs.h"
+
+// C++ versions (intra_pred_common.cpp): V repeats the row at pRef - kiStride, H fills row r with pRef[r * kiStride - 1]; pPred is written as 16 rows of 16 bytes.
+void WelsI16x16LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+
+// The assembly versions below are declared with C linkage and take the same (pPred, pRef, kiStride) arguments.
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#if defined(X86_ASM)
+//SSE2 assembly versions (codec/common/x86/intra_pred_com.asm)
+void WelsI16x16LumaPredV_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredH_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+#endif//X86_ASM
+
+#if defined(HAVE_NEON)
+void WelsI16x16LumaPredV_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredH_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+#endif//HAVE_NEON
+
+#if defined(HAVE_NEON_AARCH64)
+void WelsI16x16LumaPredV_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredH_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+#endif//HAVE_NEON_AARCH64
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+#endif//INTRA_PRED_COMMON_H
+
+
+
--- /dev/null
+++ b/codec/common/src/intra_pred_common.cpp
@@ -1,0 +1,77 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	intra_pred_common.cpp
+ *
+ * \brief	implementation of the 16x16 luma vertical and horizontal intra predictors.
+ *
+ * \date	4/2/2009 Created
+ *			9/14/2009 C level based optimization with high performance gained.
+ *				[const, using ST32/ST64 to replace memset, memcpy and memmove etc.]
+ *
+ *************************************************************************************
+ */
+#include "ls_defines.h"
+#include "cpu_core.h"
+#include "intra_pred_common.h"
+
+
+void WelsI16x16LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
+  // The predictor is the 16-byte row directly above the block, read as two 64-bit words.
+  const int8_t* kpTop = (int8_t*)&pRef[-kiStride];
+  const uint64_t kuiTopLo = LD64 (kpTop);
+  const uint64_t kuiTopHi = LD64 (kpTop + 8);
+
+  // pPred is a packed 16x16 block (row pitch 16): copy the top row into each of its 16 rows.
+  for (int32_t iRow = 0; iRow < 16; ++iRow) {
+    uint8_t* pRow = pPred + (iRow << 4);
+    ST64 (pRow, kuiTopLo);
+    ST64 (pRow + 8, kuiTopHi);
+  }
+}
+
+void WelsI16x16LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
+  // Each output row is filled with the pixel immediately to its left in the reference picture.
+  // Rows are written bottom-up (row 15 first).
+
+  for (int32_t iRow = 15; iRow >= 0; --iRow) {
+    // Left neighbour of this row.
+    const uint8_t kuiLeft = pRef[iRow * kiStride - 1];
+    // Multiplying by 0x0101..01 replicates the byte into all eight bytes of a 64-bit word.
+    const uint64_t kuiFill = (uint64_t) (0x0101010101010101ULL * kuiLeft);
+    // pPred is a packed 16x16 block, so row iRow starts at byte offset 16 * iRow.
+    uint8_t* pRow = &pPred[iRow << 4];
+
+    ST64 (pRow, kuiFill);
+    ST64 (pRow + 8, kuiFill);
+  }
+}
+
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -6,6 +6,7 @@
 	$(COMMON_SRCDIR)/src/crt_util_safe_x.cpp\
 	$(COMMON_SRCDIR)/src/deblocking_common.cpp\
 	$(COMMON_SRCDIR)/src/expand_pic.cpp\
+	$(COMMON_SRCDIR)/src/intra_pred_common.cpp\
 	$(COMMON_SRCDIR)/src/sad_common.cpp\
 	$(COMMON_SRCDIR)/src/utils.cpp\
 	$(COMMON_SRCDIR)/src/welsCodecTrace.cpp\
@@ -17,6 +18,7 @@
 	$(COMMON_SRCDIR)/x86/cpuid.asm\
 	$(COMMON_SRCDIR)/x86/deblock.asm\
 	$(COMMON_SRCDIR)/x86/expand_picture.asm\
+	$(COMMON_SRCDIR)/x86/intra_pred_com.asm\
 	$(COMMON_SRCDIR)/x86/mb_copy.asm\
 	$(COMMON_SRCDIR)/x86/mc_chroma.asm\
 	$(COMMON_SRCDIR)/x86/mc_luma.asm\
@@ -33,6 +35,7 @@
 	$(COMMON_SRCDIR)/arm/copy_mb_neon.S\
 	$(COMMON_SRCDIR)/arm/deblocking_neon.S\
 	$(COMMON_SRCDIR)/arm/expand_picture_neon.S\
+	$(COMMON_SRCDIR)/arm/intra_pred_common_neon.S\
 	$(COMMON_SRCDIR)/arm/mc_neon.S\
 
 COMMON_OBJSARM += $(COMMON_ASM_ARM_SRCS:.S=.$(OBJ))
@@ -45,6 +48,7 @@
 	$(COMMON_SRCDIR)/arm64/copy_mb_aarch64_neon.S\
 	$(COMMON_SRCDIR)/arm64/deblocking_aarch64_neon.S\
 	$(COMMON_SRCDIR)/arm64/expand_picture_aarch64_neon.S\
+	$(COMMON_SRCDIR)/arm64/intra_pred_common_aarch64_neon.S\
 	$(COMMON_SRCDIR)/arm64/mc_aarch64_neon.S\
 
 COMMON_OBJSARM64 += $(COMMON_ASM_ARM64_SRCS:.S=.$(OBJ))
--- /dev/null
+++ b/codec/common/x86/intra_pred_com.asm
@@ -1,0 +1,117 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  intra_pred_com.asm
+;*
+;*  Abstract
+;*      sse2 function for intra predict operations
+;*
+;*  History
+;*      18/09/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Code (this file defines no read-only data, only functions)
+;***********************************************************************
+
+SECTION .text
+
+;***********************************************************************
+; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+
+%macro SSE2_PRED_H_16X16_ONE_LINE 0
+    add r0, 16                      ; advance the destination by one 16-byte row
+    add r1, r2                      ; advance the source pointer by r2 (the stride at the call site)
+    movzx r3, byte [r1]             ; r3 = the single byte at the source pointer, zero-extended
+    SSE2_Copy16Times xmm0, r3d
+    movdqa [r0], xmm0               ; aligned 16-byte store of xmm0 (presumably the byte replicated 16 times by SSE2_Copy16Times - confirm in asm_inc.asm)
+%endmacro
+
+WELS_EXTERN WelsI16x16LumaPredH_sse2
+    push r3                         ; r3 is used as scratch below, so it is saved here and restored before ret. NOTE(review): push_num presumably tells LOAD_3_PARA about this push - confirm in asm_inc.asm
+    %assign push_num 1
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    dec r1                          ; r1 = pRef - 1: the byte to the left of row 0
+    movzx r3, byte [r1]             ; r3 = that left-side byte, zero-extended
+    SSE2_Copy16Times xmm0, r3d
+    movdqa [r0], xmm0               ; row 0 of pred (aligned store); the 15 macro expansions below produce rows 1..15
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    pop r3                          ; restore the register saved on entry
+    ret
+
+;***********************************************************************
+; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredV_sse2
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub     r1, r2                  ; r1 = pRef - stride: the row directly above the block
+    movdqa  xmm0, [r1]              ; aligned load of that 16-byte row
+
+    movdqa  [r0], xmm0              ; row 0; row N of pred lives at byte offset 16*N
+    movdqa  [r0+16*1], xmm0
+    movdqa  [r0+16*2], xmm0
+    movdqa  [r0+16*3], xmm0
+    movdqa  [r0+16*4], xmm0
+    movdqa  [r0+16*5], xmm0
+    movdqa  [r0+16*6], xmm0
+    movdqa  [r0+16*7], xmm0
+    movdqa  [r0+16*8], xmm0
+    movdqa  [r0+16*9], xmm0
+    movdqa  [r0+16*10], xmm0
+    movdqa  [r0+16*11], xmm0
+    movdqa  [r0+16*12], xmm0
+    movdqa  [r0+16*13], xmm0
+    movdqa  [r0+16*14], xmm0
+    movdqa  [r0+16*15], xmm0
+
+    ret
+
--- a/codec/encoder/core/arm/intra_pred_neon.S
+++ b/codec/encoder/core/arm/intra_pred_neon.S
@@ -62,51 +62,6 @@
 #endif
 
 
-WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon
-    //Get the top line data to 'q0'
-    sub  r3, r1, r2
-    vldm r3, {d0, d1}
-
-    //mov  r2, #16
-    mov  r3, #4
-    //Set the top line to the each line of MB(16*16)
-loop_0_get_i16x16_luma_pred_v:
-    vst1.8 {d0,d1}, [r0]!
-    vst1.8 {d0,d1}, [r0]!
-    vst1.8 {d0,d1}, [r0]!
-    vst1.8 {d0,d1}, [r0]!
-    subs  r3, #1
-    bne  loop_0_get_i16x16_luma_pred_v
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon
-    //stmdb sp!, {r4, lr}
-    sub  r1, r1, #1
-    mov  r3, #4
-loop_0_get_i16x16_luma_pred_h:
-    //Get one byte data from left side
-    vld1.8 {d0[],d1[]}, [r1], r2
-    vld1.8 {d2[],d3[]}, [r1], r2
-    vld1.8 {d4[],d5[]}, [r1], r2
-    vld1.8 {d6[],d7[]}, [r1], r2
-
-    //Set the line of MB using the left side byte data
-    vst1.8 {d0,d1}, [r0]!
-    //add r0, #16
-    vst1.8 {d2,d3}, [r0]!
-    //add r0, #16
-    vst1.8 {d4,d5}, [r0]!
-    //add r0, #16
-    vst1.8 {d6,d7}, [r0]!
-    //add r0, #16
-
-    subs  r3, #1
-    bne  loop_0_get_i16x16_luma_pred_h
-
-WELS_ASM_FUNC_END
-
-
 WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon
     //stmdb sp!, { r2-r5, lr}
     //Get the left vertical line data
--- a/codec/encoder/core/arm64/intra_pred_aarch64_neon.S
+++ b/codec/encoder/core/arm64/intra_pred_aarch64_neon.S
@@ -349,23 +349,6 @@
 .endr
 WELS_ASM_AARCH64_FUNC_END
 
-//for Luma 16x16
-WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredV_AArch64_neon
-    sub     x3, x1, x2
-    ld1     {v0.16b}, [x3]
-.rept 16
-    st1     {v0.16b}, [x0], 16
-.endr
-WELS_ASM_AARCH64_FUNC_END
-
-WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredH_AArch64_neon
-    sub     x3, x1, #1
-.rept 16
-    ld1r    {v0.16b}, [x3], x2
-    st1     {v0.16b}, [x0], 16
-.endr
-WELS_ASM_AARCH64_FUNC_END
-
 WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredDc_AArch64_neon
     sub     x3, x1, x2
     sub     x4, x1, #1
--- a/codec/encoder/core/inc/get_intra_predictor.h
+++ b/codec/encoder/core/inc/get_intra_predictor.h
@@ -74,8 +74,6 @@
 void WelsI16x16ChromaPredVer (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 void WelsI16x16ChromaPredHor (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 
-void WelsI16x16LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
-void WelsI16x16LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 void WelsI16x16LumaPredPlane_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 void WelsI16x16LumaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 void WelsI16x16LumaPredDcLeft_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
@@ -94,8 +92,6 @@
 void WelsFillingPred1to16_sse2 (uint8_t* pPred, const uint8_t kuiValue);
 
 //for intra-prediction ASM functions
-void WelsI16x16LumaPredV_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
-void WelsI16x16LumaPredH_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 void WelsI16x16LumaPredDc_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 void WelsI16x16LumaPredPlane_sse2 (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 
@@ -116,8 +112,6 @@
 #endif//X86_ASM
 
 #if defined(HAVE_NEON)
-void WelsI16x16LumaPredV_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
-void WelsI16x16LumaPredH_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 void WelsI16x16LumaPredDc_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 void WelsI16x16LumaPredPlane_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 
@@ -137,8 +131,6 @@
 #endif//HAVE_NEON
 
 #if defined(HAVE_NEON_AARCH64)
-void WelsI16x16LumaPredV_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
-void WelsI16x16LumaPredH_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 void WelsI16x16LumaPredDc_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 void WelsI16x16LumaPredPlane_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 void WelsI16x16LumaPredDcTop_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
--- a/codec/encoder/core/src/get_intra_predictor.cpp
+++ b/codec/encoder/core/src/get_intra_predictor.cpp
@@ -41,6 +41,7 @@
  */
 #include "ls_defines.h"
 #include "cpu_core.h"
+#include "intra_pred_common.h"
 #include "get_intra_predictor.h"
 
 namespace WelsEnc {
@@ -537,37 +538,6 @@
   ST64 (pPred + 56, kuiDcValue64);
 }
 
-
-void WelsI16x16LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
-  uint8_t i = 15;
-  const int8_t* kpSrc = (int8_t*)&pRef[-kiStride];
-  const uint64_t kuiT1 = LD64 (kpSrc);
-  const uint64_t kuiT2 = LD64 (kpSrc + 8);
-  uint8_t* pDst = pPred;
-
-  do {
-    ST64 (pDst  , kuiT1);
-    ST64 (pDst + 8, kuiT2);
-    pDst += 16;
-  } while (i-- > 0);
-}
-
-void WelsI16x16LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
-  int32_t iStridex15 = (kiStride << 4) - kiStride;
-  int32_t iPredStride = 16;
-  int32_t iPredStridex15 = 240;	//(iPredStride<<4)-iPredStride;
-  uint8_t i = 15;
-
-  do {
-    const uint8_t kuiSrc8	= pRef[iStridex15 - 1];
-    const uint64_t kuiV64	= (uint64_t) (0x0101010101010101ULL * kuiSrc8);
-    ST64 (&pPred[iPredStridex15], kuiV64);
-    ST64 (&pPred[iPredStridex15 + 8], kuiV64);
-
-    iStridex15 -= kiStride;
-    iPredStridex15 -= iPredStride;
-  } while (i-- > 0);
-}
 
 void WelsI16x16LumaPredPlane_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
   int32_t iLTshift = 0, iTopshift = 0, iLeftshift = 0, iTopSum = 0, iLeftSum = 0;
--- a/codec/encoder/core/src/sample.cpp
+++ b/codec/encoder/core/src/sample.cpp
@@ -40,7 +40,7 @@
 
 #include "sample.h"
 #include "sad_common.h"
-
+#include "intra_pred_common.h"
 #include "mc.h"
 #include "cpu_core.h"
 
@@ -250,8 +250,8 @@
 }
 
 extern void WelsI16x16LumaPredDc_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
-extern void WelsI16x16LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
-extern void WelsI16x16LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t iStride);
+// WelsI16x16LumaPredH_c / WelsI16x16LumaPredV_c no longer need a local extern here;
+// presumably they are now declared via intra_pred_common.h (included above) - confirm.
 
 int32_t WelsSampleSatdIntra16x16Combined3_c (uint8_t* pDec, int32_t iDecStride, uint8_t* pEnc, int32_t iEncStride,
     int32_t* pBestMode, int32_t iLambda, uint8_t* pDst) {
--- a/codec/encoder/core/x86/intra_pred.asm
+++ b/codec/encoder/core/x86/intra_pred.asm
@@ -307,74 +307,6 @@
     ret
 
 ;***********************************************************************
-; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-
-%macro SSE2_PRED_H_16X16_ONE_LINE 0
-    add r0, 16
-    add r1, r2
-    movzx r3, byte [r1]
-    SSE2_Copy16Times xmm0, r3d
-    movdqa [r0], xmm0
-%endmacro
-
-WELS_EXTERN WelsI16x16LumaPredH_sse2
-    push r3
-    %assign push_num 1
-    LOAD_3_PARA
-    SIGN_EXTENSION r2, r2d
-    dec r1
-    movzx r3, byte [r1]
-    SSE2_Copy16Times xmm0, r3d
-    movdqa [r0], xmm0
-    SSE2_PRED_H_16X16_ONE_LINE
-    SSE2_PRED_H_16X16_ONE_LINE
-    SSE2_PRED_H_16X16_ONE_LINE
-    SSE2_PRED_H_16X16_ONE_LINE
-    SSE2_PRED_H_16X16_ONE_LINE
-    SSE2_PRED_H_16X16_ONE_LINE
-    SSE2_PRED_H_16X16_ONE_LINE
-    SSE2_PRED_H_16X16_ONE_LINE
-    SSE2_PRED_H_16X16_ONE_LINE
-    SSE2_PRED_H_16X16_ONE_LINE
-    SSE2_PRED_H_16X16_ONE_LINE
-    SSE2_PRED_H_16X16_ONE_LINE
-    SSE2_PRED_H_16X16_ONE_LINE
-    SSE2_PRED_H_16X16_ONE_LINE
-    SSE2_PRED_H_16X16_ONE_LINE
-    pop r3
-    ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredV_sse2
-    %assign push_num 0
-    LOAD_3_PARA
-    SIGN_EXTENSION r2, r2d
-    sub     r1, r2
-    movdqa  xmm0, [r1]
-
-    movdqa  [r0], xmm0
-    movdqa  [r0+10h], xmm0
-    movdqa  [r0+20h], xmm0
-    movdqa  [r0+30h], xmm0
-    movdqa  [r0+40h], xmm0
-    movdqa  [r0+50h], xmm0
-    movdqa  [r0+60h], xmm0
-    movdqa  [r0+70h], xmm0
-    movdqa  [r0+80h], xmm0
-    movdqa  [r0+90h], xmm0
-    movdqa  [r0+160], xmm0
-    movdqa  [r0+176], xmm0
-    movdqa  [r0+192], xmm0
-    movdqa  [r0+208], xmm0
-    movdqa  [r0+224], xmm0
-    movdqa  [r0+240], xmm0
-
-    ret
-
-;***********************************************************************
 ; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
 WELS_EXTERN WelsIChromaPredPlane_sse2
--- a/codec/processing/build/win32/WelsVP.vcproj
+++ b/codec/processing/build/win32/WelsVP.vcproj
@@ -358,11 +358,11 @@
 			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
 			>
 			<File
-				RelativePath="..\..\src\common\common.cpp"
+				RelativePath="..\..\..\common\src\cpu.cpp"
 				>
 			</File>
 			<File
-				RelativePath="..\..\..\common\src\cpu.cpp"
+				RelativePath="..\..\..\common\src\intra_pred_common.cpp"
 				>
 			</File>
 			<File
@@ -426,6 +426,10 @@
 				>
 			</File>
 			<File
+				RelativePath="..\..\..\common\inc\intra_pred_common.h"
+				>
+			</File>
+			<File
 				RelativePath="..\..\src\common\memory.h"
 				>
 			</File>
@@ -569,6 +573,46 @@
 					<Tool
 						Name="VCCustomBuildTool"
 						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\common\x86\intra_pred_com.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Debug|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|x64"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
--- a/codec/processing/src/common/common.cpp
+++ /dev/null
@@ -1,69 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "common.h"
-#include "ls_defines.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-void WelsI16x16LumaPredV_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
-  uint8_t i = 15;
-  const int8_t* kpSrc = (int8_t*)&pRef[-kiStride];
-  const uint64_t kuiT1 = LD64 (kpSrc);
-  const uint64_t kuiT2 = LD64 (kpSrc + 8);
-  uint8_t* pDst = pPred;
-
-  do {
-    ST64 (pDst  , kuiT1);
-    ST64 (pDst + 8, kuiT2);
-    pDst += 16;
-  } while (i-- > 0);
-}
-
-void WelsI16x16LumaPredH_c (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
-  int32_t iStridex15 = (kiStride << 4) - kiStride;
-  int32_t iPredStride = 16;
-  int32_t iPredStridex15 = 240;	//(iPredStride<<4)-iPredStride;
-  uint8_t i = 15;
-
-  do {
-    const uint8_t kuiSrc8	= pRef[iStridex15 - 1];
-    const uint64_t kuiV64	= (uint64_t) (0x0101010101010101ULL * kuiSrc8);
-    ST64 (&pPred[iPredStridex15], kuiV64);
-    ST64 (&pPred[iPredStridex15 + 8], kuiV64);
-
-    iStridex15 -= kiStride;
-    iPredStridex15 -= iPredStride;
-  } while (i-- > 0);
-}
-
-WELSVP_NAMESPACE_END
--- a/codec/processing/src/common/common.h
+++ b/codec/processing/src/common/common.h
@@ -38,8 +38,8 @@
  *
  */
 
-#ifndef WELSVP_SCENECHANGEDETECTIONCOMMON_H
-#define WELSVP_SCENECHANGEDETECTIONCOMMON_H
+#ifndef WELSVP_COMMON_H
+#define WELSVP_COMMON_H
 
 #include "util.h"
 #include "memory.h"
@@ -46,7 +46,17 @@
 #include "WelsFrameWork.h"
 #include "IWelsVP.h"
 #include "sad_common.h"
+#include "intra_pred_common.h"
 
+// Function type shared by the 16x16 luma intra predictors declared below:
+// (pPred = output block, pRef = reference samples, kiStride = reference stride).
+typedef void (GetIntraPred) (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+// Pointer-to-predictor alias.
+typedef GetIntraPred*  GetIntraPredPtr;
+// Plain C predictors; declared ahead of WELSVP_NAMESPACE_BEGIN.
+GetIntraPred     WelsI16x16LumaPredV_c;
+GetIntraPred     WelsI16x16LumaPredH_c;
+
 WELSVP_NAMESPACE_BEGIN
 
 typedef  int32_t (SadFunc) (uint8_t* pSrcY, int32_t iSrcStrideY, uint8_t* pRefY, int32_t iRefStrideY);
@@ -56,12 +66,6 @@
 typedef int32_t (Sad16x16Func) (uint8_t* pSrcY, int32_t iSrcStrideY, uint8_t* pRefY, int32_t iRefStrideY);
 typedef Sad16x16Func*      PSad16x16Func;
 
-typedef void (GetIntraPred) (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
-
-typedef GetIntraPred*  GetIntraPredPtr;
-
-GetIntraPred     WelsI16x16LumaPredV_c;
-GetIntraPred     WelsI16x16LumaPredH_c;
 
 #ifdef HAVE_NEON
 WELSVP_EXTERN_C_BEGIN
--- a/codec/processing/src/complexityanalysis/ComplexityAnalysis.cpp
+++ b/codec/processing/src/complexityanalysis/ComplexityAnalysis.cpp
@@ -33,10 +33,10 @@
 #include "ComplexityAnalysis.h"
 #include "cpu.h"
 #include "macros.h"
+#include "intra_pred_common.h"
 
 WELSVP_NAMESPACE_BEGIN
 
-
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 CComplexityAnalysis::CComplexityAnalysis (int32_t iCpuFlag) {
@@ -280,8 +280,29 @@
 #ifdef X86_ASM
   if (iCpuFlag & WELS_CPU_SSE2) {
     m_pSadFunc = WelsSampleSad16x16_sse2;
+    m_pIntraFunc[0] = WelsI16x16LumaPredV_sse2;
+    m_pIntraFunc[1] = WelsI16x16LumaPredH_sse2;
+
   }
 #endif
+
+#if defined (HAVE_NEON)
+  if (iCpuFlag & WELS_CPU_NEON) { // NEON reported in the CPU flags: use the ARM NEON routines
+    m_pSadFunc = WelsSampleSad16x16_neon;
+    m_pIntraFunc[0] = WelsI16x16LumaPredV_neon; // [0]: V predictor
+    m_pIntraFunc[1] = WelsI16x16LumaPredH_neon; // [1]: H predictor
+
+  }
+#endif//HAVE_NEON
+
+#if defined (HAVE_NEON_AARCH64)
+  if (iCpuFlag & WELS_CPU_NEON) { // NEON reported in the CPU flags: use the AArch64 NEON routines
+    m_pSadFunc = WelsSampleSad16x16_AArch64_neon;
+    m_pIntraFunc[0] = WelsI16x16LumaPredV_AArch64_neon; // [0]: V predictor
+    m_pIntraFunc[1] = WelsI16x16LumaPredH_AArch64_neon; // [1]: H predictor
+  }
+#endif//HAVE_NEON_AARCH64
+
 }
 
 CComplexityAnalysisScreen::~CComplexityAnalysisScreen() {
--- a/codec/processing/targets.mk
+++ b/codec/processing/targets.mk
@@ -2,7 +2,6 @@
 PROCESSING_CPP_SRCS=\
 	$(PROCESSING_SRCDIR)/src/adaptivequantization/AdaptiveQuantization.cpp\
 	$(PROCESSING_SRCDIR)/src/backgrounddetection/BackgroundDetection.cpp\
-	$(PROCESSING_SRCDIR)/src/common/common.cpp\
 	$(PROCESSING_SRCDIR)/src/common/memory.cpp\
 	$(PROCESSING_SRCDIR)/src/common/WelsFrameWork.cpp\
 	$(PROCESSING_SRCDIR)/src/common/WelsFrameWorkEx.cpp\